author     Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /kernel
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks  103
-rw-r--r--  kernel/Kconfig.preempt  1
-rw-r--r--  kernel/Makefile  112
-rw-r--r--  kernel/acct.c  56
-rw-r--r--  kernel/async.c  84
-rw-r--r--  kernel/audit.c  248
-rw-r--r--  kernel/audit.h  17
-rw-r--r--  kernel/audit_tree.c  65
-rw-r--r--  kernel/audit_watch.c  36
-rw-r--r--  kernel/auditfilter.c  216
-rw-r--r--  kernel/auditsc.c  1186
-rw-r--r--  kernel/capability.c  103
-rw-r--r--  kernel/cgroup.c  2290
-rw-r--r--  kernel/cgroup_freezer.c  559
-rw-r--r--  kernel/compat.c  159
-rw-r--r--  kernel/context_tracking.c  83
-rw-r--r--  kernel/cpu.c  127
-rw-r--r--  kernel/cpu_pm.c  233
-rw-r--r--  kernel/cpuset.c  482
-rw-r--r--  kernel/crash_dump.c  13
-rw-r--r--  kernel/cred.c  204
-rw-r--r--  kernel/debug/debug_core.c  119
-rw-r--r--  kernel/debug/gdbstub.c  22
-rw-r--r--  kernel/debug/kdb/kdb_bp.c  7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c  3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c  5
-rw-r--r--  kernel/debug/kdb/kdb_io.c  48
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c  95
-rw-r--r--  kernel/debug/kdb/kdb_main.c  142
-rw-r--r--  kernel/debug/kdb/kdb_private.h  8
-rw-r--r--  kernel/debug/kdb/kdb_support.c  6
-rw-r--r--  kernel/dma.c  3
-rw-r--r--  kernel/events/Makefile  5
-rw-r--r--  kernel/events/callchain.c  206
-rw-r--r--  kernel/events/core.c  1201
-rw-r--r--  kernel/events/hw_breakpoint.c  40
-rw-r--r--  kernel/events/internal.h  125
-rw-r--r--  kernel/events/ring_buffer.c  15
-rw-r--r--  kernel/events/uprobes.c  1627
-rw-r--r--  kernel/exit.c  329
-rw-r--r--  kernel/extable.c  8
-rw-r--r--  kernel/fork.c  521
-rw-r--r--  kernel/freezer.c  216
-rw-r--r--  kernel/futex.c  177
-rw-r--r--  kernel/futex_compat.c  38
-rw-r--r--  kernel/gcov/Kconfig  8
-rw-r--r--  kernel/gcov/gcc_3_4.c  88
-rw-r--r--  kernel/gcov/gcov.h  42
-rw-r--r--  kernel/groups.c  52
-rw-r--r--  kernel/hrtimer.c  55
-rw-r--r--  kernel/hung_task.c  17
-rw-r--r--  kernel/irq/Kconfig  15
-rw-r--r--  kernel/irq/autoprobe.c  4
-rw-r--r--  kernel/irq/chip.c  126
-rw-r--r--  kernel/irq/debug.h  38
-rw-r--r--  kernel/irq/dummychip.c  2
-rw-r--r--  kernel/irq/generic-chip.c  5
-rw-r--r--  kernel/irq/handle.c  35
-rw-r--r--  kernel/irq/internals.h  28
-rw-r--r--  kernel/irq/irqdesc.c  35
-rw-r--r--  kernel/irq/irqdomain.c  953
-rw-r--r--  kernel/irq/manage.c  559
-rw-r--r--  kernel/irq/migration.c  9
-rw-r--r--  kernel/irq/pm.c  12
-rw-r--r--  kernel/irq/resend.c  30
-rw-r--r--  kernel/irq/settings.h  7
-rw-r--r--  kernel/irq/spurious.c  2
-rw-r--r--  kernel/irq_work.c  97
-rw-r--r--  kernel/itimer.c  23
-rw-r--r--  kernel/jump_label.c  166
-rw-r--r--  kernel/kallsyms.c  32
-rw-r--r--  kernel/kcmp.c  197
-rw-r--r--  kernel/kexec.c  86
-rw-r--r--  kernel/kfifo.c  3
-rw-r--r--  kernel/kmod.c  286
-rw-r--r--  kernel/kprobes.c  297
-rw-r--r--  kernel/ksysfs.c  26
-rw-r--r--  kernel/kthread.c  305
-rw-r--r--  kernel/latencytop.c  16
-rw-r--r--  kernel/lglock.c  89
-rw-r--r--  kernel/lockdep.c  370
-rw-r--r--  kernel/lockdep_proc.c  4
-rw-r--r--  kernel/modsign_certificate.S  19
-rw-r--r--  kernel/modsign_pubkey.c  104
-rw-r--r--  kernel/module-internal.h  14
-rw-r--r--  kernel/module.c  855
-rw-r--r--  kernel/module_signing.c  249
-rw-r--r--  kernel/mutex-debug.c  2
-rw-r--r--  kernel/mutex.c  6
-rw-r--r--  kernel/notifier.c  2
-rw-r--r--  kernel/nsproxy.c  38
-rw-r--r--  kernel/padata.c  64
-rw-r--r--  kernel/panic.c  72
-rw-r--r--  kernel/params.c  152
-rw-r--r--  kernel/pid.c  93
-rw-r--r--  kernel/pid_namespace.c  231
-rw-r--r--  kernel/posix-cpu-timers.c  169
-rw-r--r--  kernel/posix-timers.c  2
-rw-r--r--  kernel/power/Kconfig  115
-rw-r--r--  kernel/power/Makefile  11
-rw-r--r--  kernel/power/autosleep.c  127
-rw-r--r--  kernel/power/console.c  4
-rw-r--r--  kernel/power/hibernate.c  269
-rw-r--r--  kernel/power/main.c  333
-rw-r--r--  kernel/power/power.h  73
-rw-r--r--  kernel/power/poweroff.c  2
-rw-r--r--  kernel/power/process.c  181
-rw-r--r--  kernel/power/qos.c  602
-rw-r--r--  kernel/power/snapshot.c  62
-rw-r--r--  kernel/power/suspend.c  98
-rw-r--r--  kernel/power/swap.c  938
-rw-r--r--  kernel/power/user.c  194
-rw-r--r--  kernel/power/wakelock.c  764
-rw-r--r--  kernel/printk.c  2107
-rw-r--r--  kernel/profile.c  9
-rw-r--r--  kernel/ptrace.c  107
-rw-r--r--  kernel/range.c  2
-rw-r--r--  kernel/rcu.h  114
-rw-r--r--  kernel/rcupdate.c  124
-rw-r--r--  kernel/rcutiny.c  282
-rw-r--r--  kernel/rcutiny_plugin.h  288
-rw-r--r--  kernel/rcutorture.c  789
-rw-r--r--  kernel/rcutree.c  2213
-rw-r--r--  kernel/rcutree.h  251
-rw-r--r--  kernel/rcutree_plugin.h  1969
-rw-r--r--  kernel/rcutree_trace.c  399
-rw-r--r--  kernel/relay.c  19
-rw-r--r--  kernel/res_counter.c  99
-rw-r--r--  kernel/resource.c  96
-rw-r--r--  kernel/rtmutex-debug.c  80
-rw-r--r--  kernel/rtmutex-tester.c  39
-rw-r--r--  kernel/rtmutex.c  2
-rw-r--r--  kernel/rwsem.c  13
-rw-r--r--  kernel/sched/Makefile  18
-rw-r--r--  kernel/sched/auto_group.c  258
-rw-r--r--  kernel/sched/auto_group.h  64
-rw-r--r--  kernel/sched/clock.c  350
-rw-r--r--  kernel/sched/core.c  8162
-rw-r--r--  kernel/sched/cpupri.c  240
-rw-r--r--  kernel/sched/cpupri.h  34
-rw-r--r--  kernel/sched/cputime.c  589
-rw-r--r--  kernel/sched/debug.c  531
-rw-r--r--  kernel/sched/fair.c  6174
-rw-r--r--  kernel/sched/features.h  79
-rw-r--r--  kernel/sched/idle_task.c  98
-rw-r--r--  kernel/sched/rt.c  2094
-rw-r--r--  kernel/sched/sched.h  1241
-rw-r--r--  kernel/sched/stats.c  111
-rw-r--r--  kernel/sched/stats.h  231
-rw-r--r--  kernel/sched/stop_task.c  128
-rw-r--r--  kernel/seccomp.c  465
-rw-r--r--  kernel/semaphore.c  32
-rw-r--r--  kernel/signal.c  356
-rw-r--r--  kernel/smp.c  139
-rw-r--r--  kernel/smpboot.c  300
-rw-r--r--  kernel/smpboot.h  20
-rw-r--r--  kernel/softirq.c  162
-rw-r--r--  kernel/spinlock.c  4
-rw-r--r--  kernel/srcu.c  561
-rw-r--r--  kernel/stacktrace.c  2
-rw-r--r--  kernel/stop_machine.c  24
-rw-r--r--  kernel/sys.c  603
-rw-r--r--  kernel/sys_ni.c  8
-rw-r--r--  kernel/sysctl.c  701
-rw-r--r--  kernel/sysctl_binary.c  6
-rw-r--r--  kernel/task_work.c  92
-rw-r--r--  kernel/taskstats.c  44
-rw-r--r--  kernel/time.c  10
-rw-r--r--  kernel/time/Kconfig  64
-rw-r--r--  kernel/time/Makefile  4
-rw-r--r--  kernel/time/alarmtimer.c  342
-rw-r--r--  kernel/time/clockevents.c  157
-rw-r--r--  kernel/time/clocksource.c  89
-rw-r--r--  kernel/time/jiffies.c  40
-rw-r--r--  kernel/time/ntp.c  201
-rw-r--r--  kernel/time/posix-clock.c  1
-rw-r--r--  kernel/time/tick-broadcast.c  13
-rw-r--r--  kernel/time/tick-common.c  12
-rw-r--r--  kernel/time/tick-internal.h  3
-rw-r--r--  kernel/time/tick-oneshot.c  77
-rw-r--r--  kernel/time/tick-sched.c  519
-rw-r--r--  kernel/time/timekeeping.c  963
-rw-r--r--  kernel/time/timer_list.c  4
-rw-r--r--  kernel/time/timer_stats.c  6
-rw-r--r--  kernel/timer.c  321
-rw-r--r--  kernel/trace/Kconfig  67
-rw-r--r--  kernel/trace/Makefile  18
-rw-r--r--  kernel/trace/blktrace.c  21
-rw-r--r--  kernel/trace/ftrace.c  1292
-rw-r--r--  kernel/trace/ring_buffer.c  934
-rw-r--r--  kernel/trace/rpm-traces.c  20
-rw-r--r--  kernel/trace/trace.c  1342
-rw-r--r--  kernel/trace/trace.h  93
-rw-r--r--  kernel/trace/trace_branch.c  4
-rw-r--r--  kernel/trace/trace_clock.c  12
-rw-r--r--  kernel/trace/trace_entries.h  70
-rw-r--r--  kernel/trace/trace_event_perf.c  209
-rw-r--r--  kernel/trace/trace_events.c  184
-rw-r--r--  kernel/trace/trace_events_filter.c  1172
-rw-r--r--  kernel/trace/trace_events_filter_test.h  50
-rw-r--r--  kernel/trace/trace_export.c  67
-rw-r--r--  kernel/trace/trace_functions.c  44
-rw-r--r--  kernel/trace/trace_functions_graph.c  13
-rw-r--r--  kernel/trace/trace_irqsoff.c  44
-rw-r--r--  kernel/trace/trace_kprobe.c  923
-rw-r--r--  kernel/trace/trace_output.c  111
-rw-r--r--  kernel/trace/trace_printk.c  23
-rw-r--r--  kernel/trace/trace_probe.c  839
-rw-r--r--  kernel/trace/trace_probe.h  161
-rw-r--r--  kernel/trace/trace_sched_switch.c  4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c  30
-rw-r--r--  kernel/trace/trace_selftest.c  305
-rw-r--r--  kernel/trace/trace_stack.c  38
-rw-r--r--  kernel/trace/trace_syscalls.c  94
-rw-r--r--  kernel/trace/trace_uprobe.c  788
-rw-r--r--  kernel/tracepoint.c  190
-rw-r--r--  kernel/tsacct.c  14
-rw-r--r--  kernel/uid16.c  48
-rw-r--r--  kernel/up.c  2
-rw-r--r--  kernel/user-return-notifier.c  2
-rw-r--r--  kernel/user.c  63
-rw-r--r--  kernel/user_namespace.c  834
-rw-r--r--  kernel/utsname.c  36
-rw-r--r--  kernel/utsname_sysctl.c  25
-rw-r--r--  kernel/wait.c  8
-rw-r--r--  kernel/watchdog.c  329
-rw-r--r--  kernel/workqueue.c  2188
227 files changed, 15804 insertions, 53963 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d100ea..5068e2a4e75 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,9 +87,6 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 87config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
88 bool 88 bool
89 89
90config UNINLINE_SPIN_UNLOCK
91 bool
92
93# 90#
94# lock_* functions are inlined when: 91# lock_* functions are inlined when:
95# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y 92# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -106,120 +103,100 @@ config UNINLINE_SPIN_UNLOCK
106# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y 103# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
107# 104#
108 105
109if !DEBUG_SPINLOCK
110
111config INLINE_SPIN_TRYLOCK 106config INLINE_SPIN_TRYLOCK
112 def_bool y 107 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
113 depends on ARCH_INLINE_SPIN_TRYLOCK
114 108
115config INLINE_SPIN_TRYLOCK_BH 109config INLINE_SPIN_TRYLOCK_BH
116 def_bool y 110 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
117 depends on ARCH_INLINE_SPIN_TRYLOCK_BH
118 111
119config INLINE_SPIN_LOCK 112config INLINE_SPIN_LOCK
120 def_bool y 113 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
121 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
122 114
123config INLINE_SPIN_LOCK_BH 115config INLINE_SPIN_LOCK_BH
124 def_bool y 116 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH 117 ARCH_INLINE_SPIN_LOCK_BH
126 118
127config INLINE_SPIN_LOCK_IRQ 119config INLINE_SPIN_LOCK_IRQ
128 def_bool y 120 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
129 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ 121 ARCH_INLINE_SPIN_LOCK_IRQ
130 122
131config INLINE_SPIN_LOCK_IRQSAVE 123config INLINE_SPIN_LOCK_IRQSAVE
132 def_bool y 124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
133 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE 125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126
127config INLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
134 129
135config INLINE_SPIN_UNLOCK_BH 130config INLINE_SPIN_UNLOCK_BH
136 def_bool y 131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
137 depends on ARCH_INLINE_SPIN_UNLOCK_BH
138 132
139config INLINE_SPIN_UNLOCK_IRQ 133config INLINE_SPIN_UNLOCK_IRQ
140 def_bool y 134 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
141 depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH
142 135
143config INLINE_SPIN_UNLOCK_IRQRESTORE 136config INLINE_SPIN_UNLOCK_IRQRESTORE
144 def_bool y 137 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
145 depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
146 138
147 139
148config INLINE_READ_TRYLOCK 140config INLINE_READ_TRYLOCK
149 def_bool y 141 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
150 depends on ARCH_INLINE_READ_TRYLOCK
151 142
152config INLINE_READ_LOCK 143config INLINE_READ_LOCK
153 def_bool y 144 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
154 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
155 145
156config INLINE_READ_LOCK_BH 146config INLINE_READ_LOCK_BH
157 def_bool y 147 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
158 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH 148 ARCH_INLINE_READ_LOCK_BH
159 149
160config INLINE_READ_LOCK_IRQ 150config INLINE_READ_LOCK_IRQ
161 def_bool y 151 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
162 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ 152 ARCH_INLINE_READ_LOCK_IRQ
163 153
164config INLINE_READ_LOCK_IRQSAVE 154config INLINE_READ_LOCK_IRQSAVE
165 def_bool y 155 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
166 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE 156 ARCH_INLINE_READ_LOCK_IRQSAVE
167 157
168config INLINE_READ_UNLOCK 158config INLINE_READ_UNLOCK
169 def_bool y 159 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
170 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
171 160
172config INLINE_READ_UNLOCK_BH 161config INLINE_READ_UNLOCK_BH
173 def_bool y 162 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
174 depends on ARCH_INLINE_READ_UNLOCK_BH
175 163
176config INLINE_READ_UNLOCK_IRQ 164config INLINE_READ_UNLOCK_IRQ
177 def_bool y 165 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
178 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH
179 166
180config INLINE_READ_UNLOCK_IRQRESTORE 167config INLINE_READ_UNLOCK_IRQRESTORE
181 def_bool y 168 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
182 depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
183 169
184 170
185config INLINE_WRITE_TRYLOCK 171config INLINE_WRITE_TRYLOCK
186 def_bool y 172 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
187 depends on ARCH_INLINE_WRITE_TRYLOCK
188 173
189config INLINE_WRITE_LOCK 174config INLINE_WRITE_LOCK
190 def_bool y 175 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
191 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
192 176
193config INLINE_WRITE_LOCK_BH 177config INLINE_WRITE_LOCK_BH
194 def_bool y 178 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
195 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH 179 ARCH_INLINE_WRITE_LOCK_BH
196 180
197config INLINE_WRITE_LOCK_IRQ 181config INLINE_WRITE_LOCK_IRQ
198 def_bool y 182 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
199 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ 183 ARCH_INLINE_WRITE_LOCK_IRQ
200 184
201config INLINE_WRITE_LOCK_IRQSAVE 185config INLINE_WRITE_LOCK_IRQSAVE
202 def_bool y 186 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
203 depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE 187 ARCH_INLINE_WRITE_LOCK_IRQSAVE
204 188
205config INLINE_WRITE_UNLOCK 189config INLINE_WRITE_UNLOCK
206 def_bool y 190 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
207 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
208 191
209config INLINE_WRITE_UNLOCK_BH 192config INLINE_WRITE_UNLOCK_BH
210 def_bool y 193 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
211 depends on ARCH_INLINE_WRITE_UNLOCK_BH
212 194
213config INLINE_WRITE_UNLOCK_IRQ 195config INLINE_WRITE_UNLOCK_IRQ
214 def_bool y 196 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
215 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH
216 197
217config INLINE_WRITE_UNLOCK_IRQRESTORE 198config INLINE_WRITE_UNLOCK_IRQRESTORE
218 def_bool y 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
219 depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
220
221endif
222 200
223config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
224 def_bool y 202 def_bool SMP && !DEBUG_MUTEXES
225 depends on SMP && !DEBUG_MUTEXES
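
Note on the Kconfig.locks hunk above: every INLINE_* helper that was spelled as "def_bool y" plus a "depends on" line inside an "if !DEBUG_SPINLOCK" block is rewritten as a single "def_bool" whose expression carries the !DEBUG_SPINLOCK guard directly. For these promptless helper symbols the two spellings evaluate to the same value. A minimal illustration of the pattern, using one symbol from the hunk (not a verbatim excerpt of either file version):

    # Form on the left-hand side of the diff: the value is gated by the
    # surrounding "if" block and a separate "depends on" line.
    if !DEBUG_SPINLOCK

    config INLINE_SPIN_TRYLOCK
    	def_bool y
    	depends on ARCH_INLINE_SPIN_TRYLOCK

    endif

    # Form on the right-hand side of the diff: the guard becomes part of
    # the default expression itself, so no "if" block is needed.
    config INLINE_SPIN_TRYLOCK
    	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
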
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f9c97419f0..24e7cb0ba26 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,7 +36,6 @@ config PREEMPT_VOLUNTARY
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT 38 select PREEMPT_COUNT
39 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
40 help 39 help
41 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
42 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da23..eca595e2fd5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,15 +2,16 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = fork.o exec_domain.o panic.o printk.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
@@ -19,17 +20,13 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
19CFLAGS_REMOVE_mutex-debug.o = -pg 20CFLAGS_REMOVE_mutex-debug.o = -pg
20CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
21CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
23endif 25endif
24 26
25obj-y += sched/
26obj-y += power/
27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
29obj-$(CONFIG_X86) += kcmp.o
30endif
31obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 30obj-$(CONFIG_STACKTRACE) += stacktrace.o
34obj-y += time/ 31obj-y += time/
35obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 32obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -54,8 +51,9 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 51obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 52obj-$(CONFIG_UID16) += uid16.o
56obj-$(CONFIG_MODULES) += module.o 53obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 54obj-$(CONFIG_KALLSYMS) += kallsyms.o
55obj-$(CONFIG_PM) += power/
56obj-$(CONFIG_FREEZER) += power/
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 58obj-$(CONFIG_KEXEC) += kexec.o
61obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -98,11 +96,11 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
98obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o 96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
99obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
100obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
101obj-$(CONFIG_TRACE_CLOCK) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
102obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
104obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
105obj-$(CONFIG_CPU_PM) += cpu_pm.o
106 104
107obj-$(CONFIG_PERF_EVENTS) += events/ 105obj-$(CONFIG_PERF_EVENTS) += events/
108 106
@@ -110,7 +108,15 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
110obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
112obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
113obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 111
112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
114# needed for x86 only. Why this used to be enabled for all architectures is beyond
115# me. I suspect most platforms don't need this, but until we know that for sure
116# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
117# to get a correct value for the wait-channel (WCHAN in ps). --davidm
118CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
119endif
114 120
115$(obj)/configs.o: $(obj)/config_data.h 121$(obj)/configs.o: $(obj)/config_data.h
116 122
@@ -132,81 +138,3 @@ quiet_cmd_timeconst = TIMEC $@
132targets += timeconst.h 138targets += timeconst.h
133$(obj)/timeconst.h: $(src)/timeconst.pl FORCE 139$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
134 $(call if_changed,timeconst) 140 $(call if_changed,timeconst)
135
136ifeq ($(CONFIG_MODULE_SIG),y)
137#
138# Pull the signing certificate and any extra certificates into the kernel
139#
140
141quiet_cmd_touch = TOUCH $@
142 cmd_touch = touch $@
143
144extra_certificates:
145 $(call cmd,touch)
146
147kernel/modsign_certificate.o: signing_key.x509 extra_certificates
148
149###############################################################################
150#
151# If module signing is requested, say by allyesconfig, but a key has not been
152# supplied, then one will need to be generated to make sure the build does not
153# fail and that the kernel may be used afterwards.
154#
155###############################################################################
156sign_key_with_hash :=
157ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
158sign_key_with_hash := -sha1
159endif
160ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
161sign_key_with_hash := -sha224
162endif
163ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
164sign_key_with_hash := -sha256
165endif
166ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
167sign_key_with_hash := -sha384
168endif
169ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
170sign_key_with_hash := -sha512
171endif
172ifeq ($(sign_key_with_hash),)
173$(error Could not determine digest type to use from kernel config)
174endif
175
176signing_key.priv signing_key.x509: x509.genkey
177 @echo "###"
178 @echo "### Now generating an X.509 key pair to be used for signing modules."
179 @echo "###"
180 @echo "### If this takes a long time, you might wish to run rngd in the"
181 @echo "### background to keep the supply of entropy topped up. It"
182 @echo "### needs to be run as root, and uses a hardware random"
183 @echo "### number generator if one is available."
184 @echo "###"
185 openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
186 -x509 -config x509.genkey \
187 -outform DER -out signing_key.x509 \
188 -keyout signing_key.priv
189 @echo "###"
190 @echo "### Key pair generated."
191 @echo "###"
192
193x509.genkey:
194 @echo Generating X.509 key generation config
195 @echo >x509.genkey "[ req ]"
196 @echo >>x509.genkey "default_bits = 4096"
197 @echo >>x509.genkey "distinguished_name = req_distinguished_name"
198 @echo >>x509.genkey "prompt = no"
199 @echo >>x509.genkey "string_mask = utf8only"
200 @echo >>x509.genkey "x509_extensions = myexts"
201 @echo >>x509.genkey
202 @echo >>x509.genkey "[ req_distinguished_name ]"
203 @echo >>x509.genkey "O = Magrathea"
204 @echo >>x509.genkey "CN = Glacier signing key"
205 @echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
206 @echo >>x509.genkey
207 @echo >>x509.genkey "[ myexts ]"
208 @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
209 @echo >>x509.genkey "keyUsage=digitalSignature"
210 @echo >>x509.genkey "subjectKeyIdentifier=hash"
211 @echo >>x509.genkey "authorityKeyIdentifier=keyid"
212endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e..fa7eb3de2dd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,10 +84,11 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 int active; 87 volatile int active;
88 unsigned long needcheck; 88 volatile int needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
91 struct list_head list; 92 struct list_head list;
92}; 93};
93 94
@@ -95,6 +96,15 @@ static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list); 96static LIST_HEAD(acct_list);
96 97
97/* 98/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
98 * Check the amount of free space and suspend/resume accordingly. 108 * Check the amount of free space and suspend/resume accordingly.
99 */ 109 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -102,12 +112,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
102 struct kstatfs sbuf; 112 struct kstatfs sbuf;
103 int res; 113 int res;
104 int act; 114 int act;
105 u64 resume; 115 sector_t resume;
106 u64 suspend; 116 sector_t suspend;
107 117
108 spin_lock(&acct_lock); 118 spin_lock(&acct_lock);
109 res = acct->active; 119 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck)) 120 if (!file || !acct->needcheck)
111 goto out; 121 goto out;
112 spin_unlock(&acct_lock); 122 spin_unlock(&acct_lock);
113 123
@@ -117,8 +127,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
117 suspend = sbuf.f_blocks * SUSPEND; 127 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME; 128 resume = sbuf.f_blocks * RESUME;
119 129
120 do_div(suspend, 100); 130 sector_div(suspend, 100);
121 do_div(resume, 100); 131 sector_div(resume, 100);
122 132
123 if (sbuf.f_bavail <= suspend) 133 if (sbuf.f_bavail <= suspend)
124 act = -1; 134 act = -1;
@@ -150,7 +160,10 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
150 } 160 }
151 } 161 }
152 162
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 163 del_timer(&acct->timer);
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
154 res = acct->active; 167 res = acct->active;
155out: 168out:
156 spin_unlock(&acct_lock); 169 spin_unlock(&acct_lock);
@@ -172,7 +185,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
172 if (acct->file) { 185 if (acct->file) {
173 old_acct = acct->file; 186 old_acct = acct->file;
174 old_ns = acct->ns; 187 old_ns = acct->ns;
188 del_timer(&acct->timer);
175 acct->active = 0; 189 acct->active = 0;
190 acct->needcheck = 0;
176 acct->file = NULL; 191 acct->file = NULL;
177 acct->ns = NULL; 192 acct->ns = NULL;
178 list_del(&acct->list); 193 list_del(&acct->list);
@@ -180,9 +195,13 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
180 if (file) { 195 if (file) {
181 acct->file = file; 196 acct->file = file;
182 acct->ns = ns; 197 acct->ns = ns;
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 198 acct->needcheck = 0;
184 acct->active = 1; 199 acct->active = 1;
185 list_add(&acct->list, &acct_list); 200 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
186 } 205 }
187 if (old_acct) { 206 if (old_acct) {
188 mnt_unpin(old_acct->f_path.mnt); 207 mnt_unpin(old_acct->f_path.mnt);
@@ -193,7 +212,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
193 } 212 }
194} 213}
195 214
196static int acct_on(struct filename *pathname) 215static int acct_on(char *name)
197{ 216{
198 struct file *file; 217 struct file *file;
199 struct vfsmount *mnt; 218 struct vfsmount *mnt;
@@ -201,7 +220,7 @@ static int acct_on(struct filename *pathname)
201 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
202 221
203 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 223 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 224 if (IS_ERR(file))
206 return PTR_ERR(file); 225 return PTR_ERR(file);
207 226
@@ -260,7 +279,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
260 return -EPERM; 279 return -EPERM;
261 280
262 if (name) { 281 if (name) {
263 struct filename *tmp = getname(name); 282 char *tmp = getname(name);
264 if (IS_ERR(tmp)) 283 if (IS_ERR(tmp))
265 return (PTR_ERR(tmp)); 284 return (PTR_ERR(tmp));
266 error = acct_on(tmp); 285 error = acct_on(tmp);
@@ -315,7 +334,7 @@ void acct_auto_close(struct super_block *sb)
315 spin_lock(&acct_lock); 334 spin_lock(&acct_lock);
316restart: 335restart:
317 list_for_each_entry(acct, &acct_list, list) 336 list_for_each_entry(acct, &acct_list, list)
318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) { 337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
319 acct_file_reopen(acct, NULL, NULL); 338 acct_file_reopen(acct, NULL, NULL);
320 goto restart; 339 goto restart;
321 } 340 }
@@ -329,6 +348,7 @@ void acct_exit_ns(struct pid_namespace *ns)
329 if (acct == NULL) 348 if (acct == NULL)
330 return; 349 return;
331 350
351 del_timer_sync(&acct->timer);
332 spin_lock(&acct_lock); 352 spin_lock(&acct_lock);
333 if (acct->file != NULL) 353 if (acct->file != NULL)
334 acct_file_reopen(acct, NULL, NULL); 354 acct_file_reopen(acct, NULL, NULL);
@@ -478,7 +498,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
478 * Fill the accounting struct with the needed info as recorded 498 * Fill the accounting struct with the needed info as recorded
479 * by the different kernel functions. 499 * by the different kernel functions.
480 */ 500 */
481 memset(&ac, 0, sizeof(acct_t)); 501 memset((caddr_t)&ac, 0, sizeof(acct_t));
482 502
483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -507,8 +527,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
507 do_div(elapsed, AHZ); 527 do_div(elapsed, AHZ);
508 ac.ac_btime = get_seconds() - elapsed; 528 ac.ac_btime = get_seconds() - elapsed;
509 /* we really need to bite the bullet and change layout */ 529 /* we really need to bite the bullet and change layout */
510 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); 530 ac.ac_uid = orig_cred->uid;
511 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); 531 ac.ac_gid = orig_cred->gid;
512#if ACCT_VERSION==2 532#if ACCT_VERSION==2
513 ac.ac_ahz = AHZ; 533 ac.ac_ahz = AHZ;
514#endif 534#endif
@@ -593,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
593 pacct->ac_flag |= ACORE; 613 pacct->ac_flag |= ACORE;
594 if (current->flags & PF_SIGNALED) 614 if (current->flags & PF_SIGNALED)
595 pacct->ac_flag |= AXSIG; 615 pacct->ac_flag |= AXSIG;
596 pacct->ac_utime += current->utime; 616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
597 pacct->ac_stime += current->stime; 617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
598 pacct->ac_minflt += current->min_flt; 618 pacct->ac_minflt += current->min_flt;
599 pacct->ac_majflt += current->maj_flt; 619 pacct->ac_majflt += current->maj_flt;
600 spin_unlock_irq(&current->sighand->siglock); 620 spin_unlock_irq(&current->sighand->siglock);
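
Note on the kernel/acct.c hunks above: the jiffies comparison (time_is_before_jiffies(acct->needcheck)) is replaced by the older scheme of a struct timer_list whose callback, acct_timeout(), merely flags that the free-space check is due; check_free_space() then performs the statfs and re-arms the timer for another ACCT_TIMEOUT*HZ. A minimal, self-contained sketch of that re-arming pattern follows; the names checked_thing, CHECK_INTERVAL and the helper functions are illustrative only (not from the tree), and it uses the same pre-timer_setup() API the patch itself uses:

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    #define CHECK_INTERVAL (30 * HZ)	/* stand-in for ACCT_TIMEOUT*HZ */

    struct checked_thing {
    	volatile int needcheck;		/* set from timer context */
    	struct timer_list timer;
    };

    /* Timer callback: only record that the expensive check is due. */
    static void check_timeout(unsigned long data)
    {
    	struct checked_thing *t = (struct checked_thing *)data;

    	t->needcheck = 1;
    }

    /* Arm the timer for the first interval. */
    static void checked_thing_start(struct checked_thing *t)
    {
    	t->needcheck = 0;
    	setup_timer(&t->timer, check_timeout, (unsigned long)t);
    	t->timer.expires = jiffies + CHECK_INTERVAL;
    	add_timer(&t->timer);
    }

    /* Called from process context; does the real work and re-arms. */
    static void checked_thing_maybe_check(struct checked_thing *t)
    {
    	if (!t->needcheck)
    		return;

    	/* ... expensive check goes here (vfs_statfs() in acct.c) ... */

    	del_timer(&t->timer);
    	t->needcheck = 0;
    	t->timer.expires = jiffies + CHECK_INTERVAL;
    	add_timer(&t->timer);
    }
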
diff --git a/kernel/async.c b/kernel/async.c
index 9d311838485..d5fe7af0de2 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h> 52#include <linux/atomic.h>
53#include <linux/ktime.h> 53#include <linux/ktime.h>
54#include <linux/export.h> 54#include <linux/module.h>
55#include <linux/wait.h> 55#include <linux/wait.h>
56#include <linux/sched.h> 56#include <linux/sched.h>
57#include <linux/slab.h> 57#include <linux/slab.h>
@@ -62,10 +62,8 @@ static async_cookie_t next_cookie = 1;
62#define MAX_WORK 32768 62#define MAX_WORK 32768
63 63
64static LIST_HEAD(async_pending); 64static LIST_HEAD(async_pending);
65static ASYNC_DOMAIN(async_running); 65static LIST_HEAD(async_running);
66static LIST_HEAD(async_domains);
67static DEFINE_SPINLOCK(async_lock); 66static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
69 67
70struct async_entry { 68struct async_entry {
71 struct list_head list; 69 struct list_head list;
@@ -73,23 +71,26 @@ struct async_entry {
73 async_cookie_t cookie; 71 async_cookie_t cookie;
74 async_func_ptr *func; 72 async_func_ptr *func;
75 void *data; 73 void *data;
76 struct async_domain *running; 74 struct list_head *running;
77}; 75};
78 76
79static DECLARE_WAIT_QUEUE_HEAD(async_done); 77static DECLARE_WAIT_QUEUE_HEAD(async_done);
80 78
81static atomic_t entry_count; 79static atomic_t entry_count;
82 80
81extern int initcall_debug;
82
83 83
84/* 84/*
85 * MUST be called with the lock held! 85 * MUST be called with the lock held!
86 */ 86 */
87static async_cookie_t __lowest_in_progress(struct async_domain *running) 87static async_cookie_t __lowest_in_progress(struct list_head *running)
88{ 88{
89 struct async_entry *entry; 89 struct async_entry *entry;
90 90
91 if (!list_empty(&running->domain)) { 91 if (!list_empty(running)) {
92 entry = list_first_entry(&running->domain, typeof(*entry), list); 92 entry = list_first_entry(running,
93 struct async_entry, list);
93 return entry->cookie; 94 return entry->cookie;
94 } 95 }
95 96
@@ -100,7 +101,7 @@ static async_cookie_t __lowest_in_progress(struct async_domain *running)
100 return next_cookie; /* "infinity" value */ 101 return next_cookie; /* "infinity" value */
101} 102}
102 103
103static async_cookie_t lowest_in_progress(struct async_domain *running) 104static async_cookie_t lowest_in_progress(struct list_head *running)
104{ 105{
105 unsigned long flags; 106 unsigned long flags;
106 async_cookie_t ret; 107 async_cookie_t ret;
@@ -119,12 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
119 struct async_entry *entry = 120 struct async_entry *entry =
120 container_of(work, struct async_entry, work); 121 container_of(work, struct async_entry, work);
121 unsigned long flags; 122 unsigned long flags;
122 ktime_t uninitialized_var(calltime), delta, rettime; 123 ktime_t calltime, delta, rettime;
123 struct async_domain *running = entry->running;
124 124
125 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
127 list_move_tail(&entry->list, &running->domain); 127 list_move_tail(&entry->list, entry->running);
128 spin_unlock_irqrestore(&async_lock, flags); 128 spin_unlock_irqrestore(&async_lock, flags);
129 129
130 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
@@ -147,8 +147,6 @@ static void async_run_entry_fn(struct work_struct *work)
147 /* 3) remove self from the running queue */ 147 /* 3) remove self from the running queue */
148 spin_lock_irqsave(&async_lock, flags); 148 spin_lock_irqsave(&async_lock, flags);
149 list_del(&entry->list); 149 list_del(&entry->list);
150 if (running->registered && --running->count == 0)
151 list_del_init(&running->node);
152 150
153 /* 4) free the entry */ 151 /* 4) free the entry */
154 kfree(entry); 152 kfree(entry);
@@ -160,7 +158,7 @@ static void async_run_entry_fn(struct work_struct *work)
160 wake_up(&async_done); 158 wake_up(&async_done);
161} 159}
162 160
163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) 161static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
164{ 162{
165 struct async_entry *entry; 163 struct async_entry *entry;
166 unsigned long flags; 164 unsigned long flags;
@@ -191,8 +189,6 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
191 spin_lock_irqsave(&async_lock, flags); 189 spin_lock_irqsave(&async_lock, flags);
192 newcookie = entry->cookie = next_cookie++; 190 newcookie = entry->cookie = next_cookie++;
193 list_add_tail(&entry->list, &async_pending); 191 list_add_tail(&entry->list, &async_pending);
194 if (running->registered && running->count++ == 0)
195 list_add_tail(&running->node, &async_domains);
196 atomic_inc(&entry_count); 192 atomic_inc(&entry_count);
197 spin_unlock_irqrestore(&async_lock, flags); 193 spin_unlock_irqrestore(&async_lock, flags);
198 194
@@ -229,7 +225,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
229 * Note: This function may be called from atomic or non-atomic contexts. 225 * Note: This function may be called from atomic or non-atomic contexts.
230 */ 226 */
231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 227async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
232 struct async_domain *running) 228 struct list_head *running)
233{ 229{
234 return __async_schedule(ptr, data, running); 230 return __async_schedule(ptr, data, running);
235} 231}
@@ -242,52 +238,22 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
242 */ 238 */
243void async_synchronize_full(void) 239void async_synchronize_full(void)
244{ 240{
245 mutex_lock(&async_register_mutex);
246 do { 241 do {
247 struct async_domain *domain = NULL; 242 async_synchronize_cookie(next_cookie);
248 243 } while (!list_empty(&async_running) || !list_empty(&async_pending));
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
257} 244}
258EXPORT_SYMBOL_GPL(async_synchronize_full); 245EXPORT_SYMBOL_GPL(async_synchronize_full);
259 246
260/** 247/**
261 * async_unregister_domain - ensure no more anonymous waiters on this domain
262 * @domain: idle domain to flush out of any async_synchronize_full instances
263 *
264 * async_synchronize_{cookie|full}_domain() are not flushed since callers
265 * of these routines should know the lifetime of @domain
266 *
267 * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
268 */
269void async_unregister_domain(struct async_domain *domain)
270{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) ||
274 !list_empty(&domain->domain));
275 domain->registered = 0;
276 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278}
279EXPORT_SYMBOL_GPL(async_unregister_domain);
280
281/**
282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 248 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
283 * @domain: running list to synchronize on 249 * @list: running list to synchronize on
284 * 250 *
285 * This function waits until all asynchronous function calls for the 251 * This function waits until all asynchronous function calls for the
286 * synchronization domain specified by the running list @domain have been done. 252 * synchronization domain specified by the running list @list have been done.
287 */ 253 */
288void async_synchronize_full_domain(struct async_domain *domain) 254void async_synchronize_full_domain(struct list_head *list)
289{ 255{
290 async_synchronize_cookie_domain(next_cookie, domain); 256 async_synchronize_cookie_domain(next_cookie, list);
291} 257}
292EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 258EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
293 259
@@ -297,15 +263,13 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
297 * @running: running list to synchronize on 263 * @running: running list to synchronize on
298 * 264 *
299 * This function waits until all asynchronous function calls for the 265 * This function waits until all asynchronous function calls for the
300 * synchronization domain specified by running list @running submitted 266 * synchronization domain specified by the running list @list submitted
301 * prior to @cookie have been done. 267 * prior to @cookie have been done.
302 */ 268 */
303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) 269void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running)
304{ 271{
305 ktime_t uninitialized_var(starttime), delta, endtime; 272 ktime_t starttime, delta, endtime;
306
307 if (!running)
308 return;
309 273
310 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f1..0a1355ca3d7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
45#include <asm/types.h> 45#include <asm/types.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/export.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
@@ -61,7 +61,6 @@
61#include <linux/netlink.h> 61#include <linux/netlink.h>
62#include <linux/freezer.h> 62#include <linux/freezer.h>
63#include <linux/tty.h> 63#include <linux/tty.h>
64#include <linux/pid_namespace.h>
65 64
66#include "audit.h" 65#include "audit.h"
67 66
@@ -88,11 +87,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
88 87
89/* 88/*
90 * If audit records are to be written to the netlink socket, audit_pid 89 * If audit records are to be written to the netlink socket, audit_pid
91 * contains the pid of the auditd process and audit_nlk_portid contains 90 * contains the pid of the auditd process and audit_nlk_pid contains
92 * the portid to use to send netlink messages to that process. 91 * the pid to use to send netlink messages to that process.
93 */ 92 */
94int audit_pid; 93int audit_pid;
95static int audit_nlk_portid; 94static int audit_nlk_pid;
96 95
97/* If audit_rate_limit is non-zero, limit the rate of sending audit records 96/* If audit_rate_limit is non-zero, limit the rate of sending audit records
98 * to that number per second. This prevents DoS attacks, but results in 97 * to that number per second. This prevents DoS attacks, but results in
@@ -105,7 +104,7 @@ static int audit_backlog_wait_time = 60 * HZ;
105static int audit_backlog_wait_overflow = 0; 104static int audit_backlog_wait_overflow = 0;
106 105
107/* The identity of the user shutting down the audit system. */ 106/* The identity of the user shutting down the audit system. */
108kuid_t audit_sig_uid = INVALID_UID; 107uid_t audit_sig_uid = -1;
109pid_t audit_sig_pid = -1; 108pid_t audit_sig_pid = -1;
110u32 audit_sig_sid = 0; 109u32 audit_sig_sid = 0;
111 110
@@ -265,17 +264,15 @@ void audit_log_lost(const char *message)
265} 264}
266 265
267static int audit_log_config_change(char *function_name, int new, int old, 266static int audit_log_config_change(char *function_name, int new, int old,
268 kuid_t loginuid, u32 sessionid, u32 sid, 267 uid_t loginuid, u32 sessionid, u32 sid,
269 int allow_changes) 268 int allow_changes)
270{ 269{
271 struct audit_buffer *ab; 270 struct audit_buffer *ab;
272 int rc = 0; 271 int rc = 0;
273 272
274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 273 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
275 if (unlikely(!ab))
276 return rc;
277 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 274 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
278 old, from_kuid(&init_user_ns, loginuid), sessionid); 275 old, loginuid, sessionid);
279 if (sid) { 276 if (sid) {
280 char *ctx = NULL; 277 char *ctx = NULL;
281 u32 len; 278 u32 len;
@@ -295,7 +292,7 @@ static int audit_log_config_change(char *function_name, int new, int old,
295} 292}
296 293
297static int audit_do_config_change(char *function_name, int *to_change, 294static int audit_do_config_change(char *function_name, int *to_change,
298 int new, kuid_t loginuid, u32 sessionid, 295 int new, uid_t loginuid, u32 sessionid,
299 u32 sid) 296 u32 sid)
300{ 297{
301 int allow_changes, rc = 0, old = *to_change; 298 int allow_changes, rc = 0, old = *to_change;
@@ -322,21 +319,21 @@ static int audit_do_config_change(char *function_name, int *to_change,
322 return rc; 319 return rc;
323} 320}
324 321
325static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, 322static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
326 u32 sid) 323 u32 sid)
327{ 324{
328 return audit_do_config_change("audit_rate_limit", &audit_rate_limit, 325 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
329 limit, loginuid, sessionid, sid); 326 limit, loginuid, sessionid, sid);
330} 327}
331 328
332static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, 329static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
333 u32 sid) 330 u32 sid)
334{ 331{
335 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, 332 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
336 limit, loginuid, sessionid, sid); 333 limit, loginuid, sessionid, sid);
337} 334}
338 335
339static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) 336static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
340{ 337{
341 int rc; 338 int rc;
342 if (state < AUDIT_OFF || state > AUDIT_LOCKED) 339 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
@@ -351,7 +348,7 @@ static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
351 return rc; 348 return rc;
352} 349}
353 350
354static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) 351static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
355{ 352{
356 if (state != AUDIT_FAIL_SILENT 353 if (state != AUDIT_FAIL_SILENT
357 && state != AUDIT_FAIL_PRINTK 354 && state != AUDIT_FAIL_PRINTK
@@ -387,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
387static void audit_printk_skb(struct sk_buff *skb) 384static void audit_printk_skb(struct sk_buff *skb)
388{ 385{
389 struct nlmsghdr *nlh = nlmsg_hdr(skb); 386 struct nlmsghdr *nlh = nlmsg_hdr(skb);
390 char *data = nlmsg_data(nlh); 387 char *data = NLMSG_DATA(nlh);
391 388
392 if (nlh->nlmsg_type != AUDIT_EOE) { 389 if (nlh->nlmsg_type != AUDIT_EOE) {
393 if (printk_ratelimit()) 390 if (printk_ratelimit())
@@ -404,7 +401,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
404 int err; 401 int err;
405 /* take a reference in case we can't send it and we want to hold it */ 402 /* take a reference in case we can't send it and we want to hold it */
406 skb_get(skb); 403 skb_get(skb);
407 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 404 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
408 if (err < 0) { 405 if (err < 0) {
409 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 406 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
410 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 407 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
@@ -470,6 +467,24 @@ static int kauditd_thread(void *dummy)
470 return 0; 467 return 0;
471} 468}
472 469
470static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
471{
472 struct task_struct *tsk;
473 int err;
474
475 rcu_read_lock();
476 tsk = find_task_by_vpid(pid);
477 if (!tsk) {
478 rcu_read_unlock();
479 return -ESRCH;
480 }
481 get_task_struct(tsk);
482 rcu_read_unlock();
483 err = tty_audit_push_task(tsk, loginuid, sessionid);
484 put_task_struct(tsk);
485 return err;
486}
487
473int audit_send_list(void *_dest) 488int audit_send_list(void *_dest)
474{ 489{
475 struct audit_netlink_list *dest = _dest; 490 struct audit_netlink_list *dest = _dest;
@@ -501,15 +516,14 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
501 if (!skb) 516 if (!skb)
502 return NULL; 517 return NULL;
503 518
504 nlh = nlmsg_put(skb, pid, seq, t, size, flags); 519 nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
505 if (!nlh) 520 data = NLMSG_DATA(nlh);
506 goto out_kfree_skb;
507 data = nlmsg_data(nlh);
508 memcpy(data, payload, size); 521 memcpy(data, payload, size);
509 return skb; 522 return skb;
510 523
511out_kfree_skb: 524nlmsg_failure: /* Used by NLMSG_NEW */
512 kfree_skb(skb); 525 if (skb)
526 kfree_skb(skb);
513 return NULL; 527 return NULL;
514} 528}
515 529
@@ -573,11 +587,6 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
573{ 587{
574 int err = 0; 588 int err = 0;
575 589
576 /* Only support the initial namespaces for now. */
577 if ((current_user_ns() != &init_user_ns) ||
578 (task_active_pid_ns(current) != &init_pid_ns))
579 return -EPERM;
580
581 switch (msg_type) { 590 switch (msg_type) {
582 case AUDIT_GET: 591 case AUDIT_GET:
583 case AUDIT_LIST: 592 case AUDIT_LIST:
@@ -592,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
592 case AUDIT_TTY_SET: 601 case AUDIT_TTY_SET:
593 case AUDIT_TRIM: 602 case AUDIT_TRIM:
594 case AUDIT_MAKE_EQUIV: 603 case AUDIT_MAKE_EQUIV:
595 if (!capable(CAP_AUDIT_CONTROL)) 604 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
596 err = -EPERM; 605 err = -EPERM;
597 break; 606 break;
598 case AUDIT_USER: 607 case AUDIT_USER:
599 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
600 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
601 if (!capable(CAP_AUDIT_WRITE)) 610 if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
602 err = -EPERM; 611 err = -EPERM;
603 break; 612 break;
604 default: /* bad msg */ 613 default: /* bad msg */
@@ -609,7 +618,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
609} 618}
610 619
611static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, 620static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
612 kuid_t auid, u32 ses, u32 sid) 621 u32 pid, u32 uid, uid_t auid, u32 ses,
622 u32 sid)
613{ 623{
614 int rc = 0; 624 int rc = 0;
615 char *ctx = NULL; 625 char *ctx = NULL;
@@ -621,12 +631,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
621 } 631 }
622 632
623 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
624 if (unlikely(!*ab)) 634 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
625 return rc; 635 pid, uid, auid, ses);
626 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
627 task_tgid_vnr(current),
628 from_kuid(&init_user_ns, current_uid()),
629 from_kuid(&init_user_ns, auid), ses);
630 if (sid) { 636 if (sid) {
631 rc = security_secid_to_secctx(sid, &ctx, &len); 637 rc = security_secid_to_secctx(sid, &ctx, &len);
632 if (rc) 638 if (rc)
@@ -642,13 +648,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
642 648
643static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 649static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
644{ 650{
645 u32 seq, sid; 651 u32 uid, pid, seq, sid;
646 void *data; 652 void *data;
647 struct audit_status *status_get, status_set; 653 struct audit_status *status_get, status_set;
648 int err; 654 int err;
649 struct audit_buffer *ab; 655 struct audit_buffer *ab;
650 u16 msg_type = nlh->nlmsg_type; 656 u16 msg_type = nlh->nlmsg_type;
651 kuid_t loginuid; /* loginuid of sender */ 657 uid_t loginuid; /* loginuid of sender */
652 u32 sessionid; 658 u32 sessionid;
653 struct audit_sig_info *sig_data; 659 struct audit_sig_info *sig_data;
654 char *ctx = NULL; 660 char *ctx = NULL;
@@ -668,11 +674,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
668 return err; 674 return err;
669 } 675 }
670 676
677 pid = NETLINK_CREDS(skb)->pid;
678 uid = NETLINK_CREDS(skb)->uid;
671 loginuid = audit_get_loginuid(current); 679 loginuid = audit_get_loginuid(current);
672 sessionid = audit_get_sessionid(current); 680 sessionid = audit_get_sessionid(current);
673 security_task_getsecid(current, &sid); 681 security_task_getsecid(current, &sid);
674 seq = nlh->nlmsg_seq; 682 seq = nlh->nlmsg_seq;
675 data = nlmsg_data(nlh); 683 data = NLMSG_DATA(nlh);
676 684
677 switch (msg_type) { 685 switch (msg_type) {
678 case AUDIT_GET: 686 case AUDIT_GET:
@@ -683,7 +691,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
683 status_set.backlog_limit = audit_backlog_limit; 691 status_set.backlog_limit = audit_backlog_limit;
684 status_set.lost = atomic_read(&audit_lost); 692 status_set.lost = atomic_read(&audit_lost);
685 status_set.backlog = skb_queue_len(&audit_skb_queue); 693 status_set.backlog = skb_queue_len(&audit_skb_queue);
686 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, 694 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
687 &status_set, sizeof(status_set)); 695 &status_set, sizeof(status_set));
688 break; 696 break;
689 case AUDIT_SET: 697 case AUDIT_SET:
@@ -711,7 +719,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
711 sessionid, sid, 1); 719 sessionid, sid, 1);
712 720
713 audit_pid = new_pid; 721 audit_pid = new_pid;
714 audit_nlk_portid = NETLINK_CB(skb).portid; 722 audit_nlk_pid = NETLINK_CB(skb).pid;
715 } 723 }
716 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { 724 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
717 err = audit_set_rate_limit(status_get->rate_limit, 725 err = audit_set_rate_limit(status_get->rate_limit,
@@ -729,16 +737,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
729 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 737 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
730 return 0; 738 return 0;
731 739
732 err = audit_filter_user(); 740 err = audit_filter_user(&NETLINK_CB(skb));
733 if (err == 1) { 741 if (err == 1) {
734 err = 0; 742 err = 0;
735 if (msg_type == AUDIT_USER_TTY) { 743 if (msg_type == AUDIT_USER_TTY) {
736 err = tty_audit_push_task(current, loginuid, 744 err = audit_prepare_user_tty(pid, loginuid,
737 sessionid); 745 sessionid);
738 if (err) 746 if (err)
739 break; 747 break;
740 } 748 }
741 audit_log_common_recv_msg(&ab, msg_type, 749 audit_log_common_recv_msg(&ab, msg_type, pid, uid,
742 loginuid, sessionid, sid); 750 loginuid, sessionid, sid);
743 751
744 if (msg_type != AUDIT_USER_TTY) 752 if (msg_type != AUDIT_USER_TTY)
@@ -754,7 +762,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
754 size--; 762 size--;
755 audit_log_n_untrustedstring(ab, data, size); 763 audit_log_n_untrustedstring(ab, data, size);
756 } 764 }
757 audit_set_pid(ab, NETLINK_CB(skb).portid); 765 audit_set_pid(ab, pid);
758 audit_log_end(ab); 766 audit_log_end(ab);
759 } 767 }
760 break; 768 break;
@@ -763,8 +771,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
763 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 771 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
764 return -EINVAL; 772 return -EINVAL;
765 if (audit_enabled == AUDIT_LOCKED) { 773 if (audit_enabled == AUDIT_LOCKED) {
766 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 774 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
767 loginuid, sessionid, sid); 775 uid, loginuid, sessionid, sid);
768 776
769 audit_log_format(ab, " audit_enabled=%d res=0", 777 audit_log_format(ab, " audit_enabled=%d res=0",
770 audit_enabled); 778 audit_enabled);
@@ -773,8 +781,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
773 } 781 }
774 /* fallthrough */ 782 /* fallthrough */
775 case AUDIT_LIST: 783 case AUDIT_LIST:
776 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, 784 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
777 seq, data, nlmsg_len(nlh), 785 uid, seq, data, nlmsg_len(nlh),
778 loginuid, sessionid, sid); 786 loginuid, sessionid, sid);
779 break; 787 break;
780 case AUDIT_ADD_RULE: 788 case AUDIT_ADD_RULE:
@@ -782,8 +790,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
782 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 790 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
783 return -EINVAL; 791 return -EINVAL;
784 if (audit_enabled == AUDIT_LOCKED) { 792 if (audit_enabled == AUDIT_LOCKED) {
785 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 793 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
786 loginuid, sessionid, sid); 794 uid, loginuid, sessionid, sid);
787 795
788 audit_log_format(ab, " audit_enabled=%d res=0", 796 audit_log_format(ab, " audit_enabled=%d res=0",
789 audit_enabled); 797 audit_enabled);
@@ -792,15 +800,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
792 } 800 }
793 /* fallthrough */ 801 /* fallthrough */
794 case AUDIT_LIST_RULES: 802 case AUDIT_LIST_RULES:
795 err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, 803 err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
796 seq, data, nlmsg_len(nlh), 804 uid, seq, data, nlmsg_len(nlh),
797 loginuid, sessionid, sid); 805 loginuid, sessionid, sid);
798 break; 806 break;
799 case AUDIT_TRIM: 807 case AUDIT_TRIM:
800 audit_trim_trees(); 808 audit_trim_trees();
801 809
802 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 810 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
803 loginuid, sessionid, sid); 811 uid, loginuid, sessionid, sid);
804 812
805 audit_log_format(ab, " op=trim res=1"); 813 audit_log_format(ab, " op=trim res=1");
806 audit_log_end(ab); 814 audit_log_end(ab);
@@ -831,8 +839,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
831 /* OK, here comes... */ 839 /* OK, here comes... */
832 err = audit_tag_tree(old, new); 840 err = audit_tag_tree(old, new);
833 841
834 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, 842 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
835 loginuid, sessionid, sid); 843 uid, loginuid, sessionid, sid);
836 844
837 audit_log_format(ab, " op=make_equiv old="); 845 audit_log_format(ab, " op=make_equiv old=");
838 audit_log_untrustedstring(ab, old); 846 audit_log_untrustedstring(ab, old);
@@ -857,41 +865,53 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
857 security_release_secctx(ctx, len); 865 security_release_secctx(ctx, len);
858 return -ENOMEM; 866 return -ENOMEM;
859 } 867 }
860 sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); 868 sig_data->uid = audit_sig_uid;
861 sig_data->pid = audit_sig_pid; 869 sig_data->pid = audit_sig_pid;
862 if (audit_sig_sid) { 870 if (audit_sig_sid) {
863 memcpy(sig_data->ctx, ctx, len); 871 memcpy(sig_data->ctx, ctx, len);
864 security_release_secctx(ctx, len); 872 security_release_secctx(ctx, len);
865 } 873 }
866 audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 874 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
867 0, 0, sig_data, sizeof(*sig_data) + len); 875 0, 0, sig_data, sizeof(*sig_data) + len);
868 kfree(sig_data); 876 kfree(sig_data);
869 break; 877 break;
870 case AUDIT_TTY_GET: { 878 case AUDIT_TTY_GET: {
871 struct audit_tty_status s; 879 struct audit_tty_status s;
872 struct task_struct *tsk = current; 880 struct task_struct *tsk;
873 881 unsigned long flags;
874 spin_lock_irq(&tsk->sighand->siglock); 882
875 s.enabled = tsk->signal->audit_tty != 0; 883 rcu_read_lock();
876 spin_unlock_irq(&tsk->sighand->siglock); 884 tsk = find_task_by_vpid(pid);
877 885 if (tsk && lock_task_sighand(tsk, &flags)) {
878 audit_send_reply(NETLINK_CB(skb).portid, seq, 886 s.enabled = tsk->signal->audit_tty != 0;
879 AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); 887 unlock_task_sighand(tsk, &flags);
888 } else
889 err = -ESRCH;
890 rcu_read_unlock();
891
892 if (!err)
893 audit_send_reply(NETLINK_CB(skb).pid, seq,
894 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
880 break; 895 break;
881 } 896 }
882 case AUDIT_TTY_SET: { 897 case AUDIT_TTY_SET: {
883 struct audit_tty_status *s; 898 struct audit_tty_status *s;
884 struct task_struct *tsk = current; 899 struct task_struct *tsk;
900 unsigned long flags;
885 901
886 if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) 902 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
887 return -EINVAL; 903 return -EINVAL;
888 s = data; 904 s = data;
889 if (s->enabled != 0 && s->enabled != 1) 905 if (s->enabled != 0 && s->enabled != 1)
890 return -EINVAL; 906 return -EINVAL;
891 907 rcu_read_lock();
892 spin_lock_irq(&tsk->sighand->siglock); 908 tsk = find_task_by_vpid(pid);
893 tsk->signal->audit_tty = s->enabled != 0; 909 if (tsk && lock_task_sighand(tsk, &flags)) {
894 spin_unlock_irq(&tsk->sighand->siglock); 910 tsk->signal->audit_tty = s->enabled != 0;
911 unlock_task_sighand(tsk, &flags);
912 } else
913 err = -ESRCH;
914 rcu_read_unlock();
895 break; 915 break;
896 } 916 }
897 default: 917 default:
@@ -941,16 +961,14 @@ static void audit_receive(struct sk_buff *skb)
941static int __init audit_init(void) 961static int __init audit_init(void)
942{ 962{
943 int i; 963 int i;
944 struct netlink_kernel_cfg cfg = {
945 .input = audit_receive,
946 };
947 964
948 if (audit_initialized == AUDIT_DISABLED) 965 if (audit_initialized == AUDIT_DISABLED)
949 return 0; 966 return 0;
950 967
951 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 968 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
952 audit_default ? "enabled" : "disabled"); 969 audit_default ? "enabled" : "disabled");
953 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); 970 audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
971 audit_receive, NULL, THIS_MODULE);
954 if (!audit_sock) 972 if (!audit_sock)
955 audit_panic("cannot initialize netlink socket"); 973 audit_panic("cannot initialize netlink socket");
956 else 974 else
@@ -1042,15 +1060,13 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
1042 1060
1043 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); 1061 ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
1044 if (!ab->skb) 1062 if (!ab->skb)
1045 goto err; 1063 goto nlmsg_failure;
1046 1064
1047 nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); 1065 nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
1048 if (!nlh)
1049 goto out_kfree_skb;
1050 1066
1051 return ab; 1067 return ab;
1052 1068
1053out_kfree_skb: 1069nlmsg_failure: /* Used by NLMSG_NEW */
1054 kfree_skb(ab->skb); 1070 kfree_skb(ab->skb);
1055 ab->skb = NULL; 1071 ab->skb = NULL;
1056err: 1072err:
@@ -1101,23 +1117,6 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1101 } 1117 }
1102} 1118}
1103 1119
1104/*
1105 * Wait for auditd to drain the queue a little
1106 */
1107static void wait_for_auditd(unsigned long sleep_time)
1108{
1109 DECLARE_WAITQUEUE(wait, current);
1110 set_current_state(TASK_INTERRUPTIBLE);
1111 add_wait_queue(&audit_backlog_wait, &wait);
1112
1113 if (audit_backlog_limit &&
1114 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1115 schedule_timeout(sleep_time);
1116
1117 __set_current_state(TASK_RUNNING);
1118 remove_wait_queue(&audit_backlog_wait, &wait);
1119}
1120
1121/* Obtain an audit buffer. This routine does locking to obtain the 1120/* Obtain an audit buffer. This routine does locking to obtain the
1122 * audit buffer, but then no locking is required for calls to 1121 * audit buffer, but then no locking is required for calls to
1123 * audit_log_*format. If the tsk is a task that is currently in a 1122 * audit_log_*format. If the tsk is a task that is currently in a
@@ -1163,13 +1162,20 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1163 1162
1164 while (audit_backlog_limit 1163 while (audit_backlog_limit
1165 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1164 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1166 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1165 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
1167 unsigned long sleep_time; 1166 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
1167
1168 /* Wait for auditd to drain the queue a little */
1169 DECLARE_WAITQUEUE(wait, current);
1170 set_current_state(TASK_INTERRUPTIBLE);
1171 add_wait_queue(&audit_backlog_wait, &wait);
1168 1172
1169 sleep_time = timeout_start + audit_backlog_wait_time - 1173 if (audit_backlog_limit &&
1170 jiffies; 1174 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1171 if ((long)sleep_time > 0) 1175 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
1172 wait_for_auditd(sleep_time); 1176
1177 __set_current_state(TASK_RUNNING);
1178 remove_wait_queue(&audit_backlog_wait, &wait);
1173 continue; 1179 continue;
1174 } 1180 }
1175 if (audit_rate_check() && printk_ratelimit()) 1181 if (audit_rate_check() && printk_ratelimit())
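The audit_log_start() hunk above differs only in where the backlog wait lives (a separate wait_for_auditd() helper versus open-coded in the loop), but both sides rest on the same wraparound-safe deadline arithmetic: sleep only while timeout_start + audit_backlog_wait_time is still ahead of jiffies. A minimal userspace sketch of that check, with a plain counter standing in for jiffies and hypothetical names throughout:

#include <stdio.h>

/* Mirrors the "(long)sleep_time > 0" test in the hunk above: subtract in
 * unsigned arithmetic, then reinterpret the difference as signed so the
 * comparison stays valid even after the tick counter wraps around. */
static long ticks_remaining(unsigned long now, unsigned long start, unsigned long wait)
{
	return (long)(start + wait - now);
}

int main(void)
{
	unsigned long start = 1000, wait = 60;

	printf("%ld\n", ticks_remaining(1020, start, wait));	/* 40: still worth sleeping */
	printf("%ld\n", ticks_remaining(1100, start, wait));	/* -40: deadline already passed */
	return 0;
}

The time_before() call on one side of the hunk is essentially this same signed-difference idiom packaged as a macro.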
@@ -1254,13 +1260,12 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1254 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1255 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1256 if (!avail) 1262 if (!avail)
1257 goto out_va_end; 1263 goto out;
1258 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1259 } 1265 }
1266 va_end(args2);
1260 if (len > 0) 1267 if (len > 0)
1261 skb_put(skb, len); 1268 skb_put(skb, len);
1262out_va_end:
1263 va_end(args2);
1264out: 1269out:
1265 return; 1270 return;
1266} 1271}
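The audit_log_vformat() hunk above is the usual vsnprintf retry pattern: format once, and if the buffer was too small, grow it and format again from a va_copy of the argument list, making sure that copy is va_end()ed on every exit path (which is what the out_va_end label on one side is for). A self-contained userspace sketch of the same pattern, with made-up names:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Format into a freshly allocated buffer; the second va_list copy exists
 * only for the second vsnprintf() pass and is ended exactly once. */
static char *format_alloc(const char *fmt, ...)
{
	va_list args, args2;
	char *buf;
	int len;

	va_start(args, fmt);
	va_copy(args2, args);
	len = vsnprintf(NULL, 0, fmt, args);	/* first pass: measure only */
	va_end(args);

	if (len < 0) {
		va_end(args2);
		return NULL;
	}

	buf = malloc(len + 1);
	if (buf)
		vsnprintf(buf, len + 1, fmt, args2);
	va_end(args2);				/* ended even if malloc() failed */
	return buf;
}

int main(void)
{
	char *s = format_alloc("pid=%d comm=%s", 1234, "auditd");

	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}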
@@ -1412,12 +1417,12 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1412 1417
1413/* This is a helper-function to print the escaped d_path */ 1418/* This is a helper-function to print the escaped d_path */
1414void audit_log_d_path(struct audit_buffer *ab, const char *prefix, 1419void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1415 const struct path *path) 1420 struct path *path)
1416{ 1421{
1417 char *p, *pathname; 1422 char *p, *pathname;
1418 1423
1419 if (prefix) 1424 if (prefix)
1420 audit_log_format(ab, "%s", prefix); 1425 audit_log_format(ab, " %s", prefix);
1421 1426
1422 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1427 /* We will allow 11 spaces for ' (deleted)' to be appended */
1423 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1428 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
@@ -1444,29 +1449,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
1444} 1449}
1445 1450
1446/** 1451/**
1447 * audit_log_link_denied - report a link restriction denial
1448 * @operation: specific link operation
1449 * @link: the path that triggered the restriction
1450 */
1451void audit_log_link_denied(const char *operation, struct path *link)
1452{
1453 struct audit_buffer *ab;
1454
1455 ab = audit_log_start(current->audit_context, GFP_KERNEL,
1456 AUDIT_ANOM_LINK);
1457 if (!ab)
1458 return;
1459 audit_log_format(ab, "op=%s action=denied", operation);
1460 audit_log_format(ab, " pid=%d comm=", current->pid);
1461 audit_log_untrustedstring(ab, current->comm);
1462 audit_log_d_path(ab, " path=", link);
1463 audit_log_format(ab, " dev=");
1464 audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
1465 audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
1466 audit_log_end(ab);
1467}
1468
1469/**
1470 * audit_log_end - end one audit record 1452 * audit_log_end - end one audit record
1471 * @ab: the audit_buffer 1453 * @ab: the audit_buffer
1472 * 1454 *
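The rename between NETLINK_CB(skb).portid and NETLINK_CB(skb).pid that runs through the audit.c hunks refers to the same value userspace sees as nl_pid on its own netlink socket: the kernel records the sender's netlink address so audit_send_reply() can answer it. A small userspace sketch of where that number comes from (creating and binding the socket needs no audit capability; actually issuing audit commands does):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	struct sockaddr_nl sa;
	socklen_t len = sizeof(sa);
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;		/* nl_pid left 0: kernel assigns a port id */

	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0 &&
	    getsockname(fd, (struct sockaddr *)&sa, &len) == 0)
		printf("our netlink port id: %u\n", sa.nl_pid);

	close(fd);
	return 0;
}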
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1..91e7071c4d2 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,8 +36,12 @@ enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context. 36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can 37 * No syscall-specific audit records can
38 * be generated. */ 38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
39 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, 43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
40 * and fill it in at syscall 44 * and always fill it in at syscall
41 * entry time. This makes a full 45 * entry time. This makes a full
42 * syscall record available if some 46 * syscall record available if some
43 * other part of the kernel decides it 47 * other part of the kernel decides it
@@ -74,15 +78,10 @@ static inline int audit_hash_ino(u32 ino)
74 return (ino & (AUDIT_INODE_BUCKETS-1)); 78 return (ino & (AUDIT_INODE_BUCKETS-1));
75} 79}
76 80
77/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1
79
80extern int audit_match_class(int class, unsigned syscall); 81extern int audit_match_class(int class, unsigned syscall);
81extern int audit_comparator(const u32 left, const u32 op, const u32 right); 82extern int audit_comparator(const u32 left, const u32 op, const u32 right);
82extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); 83extern int audit_compare_dname_path(const char *dname, const char *path,
83extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); 84 int *dirlen);
84extern int parent_len(const char *path);
85extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
86extern struct sk_buff * audit_make_reply(int pid, int seq, int type, 85extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
87 int done, int multi, 86 int done, int multi,
88 const void *payload, int size); 87 const void *payload, int size);
@@ -149,7 +148,7 @@ extern void audit_kill_trees(struct list_head *);
149extern char *audit_unpack_string(void **, size_t *, size_t); 148extern char *audit_unpack_string(void **, size_t *, size_t);
150 149
151extern pid_t audit_sig_pid; 150extern pid_t audit_sig_pid;
152extern kuid_t audit_sig_uid; 151extern uid_t audit_sig_uid;
153extern u32 audit_sig_sid; 152extern u32 audit_sig_sid;
154 153
155#ifdef CONFIG_AUDITSYSCALL 154#ifdef CONFIG_AUDITSYSCALL
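The audit.h hunk is where the two sides disagree on user-namespace-aware types: kuid_t for audit_sig_uid plus dedicated audit_uid_comparator()/audit_gid_comparator() helpers on one side, plain uid_t on the other. The point of the wrapper type, visible in the make_kuid()/from_kuid()/uid_eq() calls elsewhere in this diff, is that raw integers can no longer be assigned or compared by accident. A rough userspace illustration of the idea; k_uid, make_k() and from_k() are invented stand-ins, not kernel API:

#include <stdbool.h>
#include <stdio.h>

typedef struct { unsigned int val; } k_uid;	/* stand-in for kuid_t */

static k_uid make_k(unsigned int userspace_uid) { return (k_uid){ userspace_uid }; }
static unsigned int from_k(k_uid id)            { return id.val; }
static bool k_eq(k_uid a, k_uid b)              { return a.val == b.val; }

int main(void)
{
	k_uid a = make_k(1000), b = make_k(1000);

	printf("equal=%d raw=%u\n", k_eq(a, b), from_k(a));
	/* a == 1000 or a + 1 would not compile: the struct blocks raw integer
	 * comparisons, forcing every conversion through the helpers. */
	return 0;
}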
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d..5bf0790497e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,8 @@ static void untag_chunk(struct node *p)
249 list_del_rcu(&chunk->hash); 249 list_del_rcu(&chunk->hash);
250 spin_unlock(&hash_lock); 250 spin_unlock(&hash_lock);
251 spin_unlock(&entry->lock); 251 spin_unlock(&entry->lock);
252 fsnotify_destroy_mark(entry, audit_tree_group); 252 fsnotify_destroy_mark(entry);
253 fsnotify_put_mark(entry);
253 goto out; 254 goto out;
254 } 255 }
255 256
@@ -258,7 +259,7 @@ static void untag_chunk(struct node *p)
258 259
259 fsnotify_duplicate_mark(&new->mark, entry); 260 fsnotify_duplicate_mark(&new->mark, entry);
260 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { 261 if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
261 fsnotify_put_mark(&new->mark); 262 free_chunk(new);
262 goto Fallback; 263 goto Fallback;
263 } 264 }
264 265
@@ -291,8 +292,8 @@ static void untag_chunk(struct node *p)
291 owner->root = new; 292 owner->root = new;
292 spin_unlock(&hash_lock); 293 spin_unlock(&hash_lock);
293 spin_unlock(&entry->lock); 294 spin_unlock(&entry->lock);
294 fsnotify_destroy_mark(entry, audit_tree_group); 295 fsnotify_destroy_mark(entry);
295 fsnotify_put_mark(&new->mark); /* drop initial reference */ 296 fsnotify_put_mark(entry);
296 goto out; 297 goto out;
297 298
298Fallback: 299Fallback:
@@ -321,7 +322,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
321 322
322 entry = &chunk->mark; 323 entry = &chunk->mark;
323 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { 324 if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
324 fsnotify_put_mark(entry); 325 free_chunk(chunk);
325 return -ENOSPC; 326 return -ENOSPC;
326 } 327 }
327 328
@@ -331,7 +332,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
331 spin_unlock(&hash_lock); 332 spin_unlock(&hash_lock);
332 chunk->dead = 1; 333 chunk->dead = 1;
333 spin_unlock(&entry->lock); 334 spin_unlock(&entry->lock);
334 fsnotify_destroy_mark(entry, audit_tree_group); 335 fsnotify_destroy_mark(entry);
335 fsnotify_put_mark(entry); 336 fsnotify_put_mark(entry);
336 return 0; 337 return 0;
337 } 338 }
@@ -346,7 +347,6 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
346 insert_hash(chunk); 347 insert_hash(chunk);
347 spin_unlock(&hash_lock); 348 spin_unlock(&hash_lock);
348 spin_unlock(&entry->lock); 349 spin_unlock(&entry->lock);
349 fsnotify_put_mark(entry); /* drop initial reference */
350 return 0; 350 return 0;
351} 351}
352 352
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
396 fsnotify_duplicate_mark(chunk_entry, old_entry); 396 fsnotify_duplicate_mark(chunk_entry, old_entry);
397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { 397 if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
398 spin_unlock(&old_entry->lock); 398 spin_unlock(&old_entry->lock);
399 fsnotify_put_mark(chunk_entry); 399 free_chunk(chunk);
400 fsnotify_put_mark(old_entry); 400 fsnotify_put_mark(old_entry);
401 return -ENOSPC; 401 return -ENOSPC;
402 } 402 }
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
412 spin_unlock(&chunk_entry->lock); 412 spin_unlock(&chunk_entry->lock);
413 spin_unlock(&old_entry->lock); 413 spin_unlock(&old_entry->lock);
414 414
415 fsnotify_destroy_mark(chunk_entry, audit_tree_group); 415 fsnotify_destroy_mark(chunk_entry);
416 416
417 fsnotify_put_mark(chunk_entry); 417 fsnotify_put_mark(chunk_entry);
418 fsnotify_put_mark(old_entry); 418 fsnotify_put_mark(old_entry);
@@ -443,32 +443,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
443 spin_unlock(&hash_lock); 443 spin_unlock(&hash_lock);
444 spin_unlock(&chunk_entry->lock); 444 spin_unlock(&chunk_entry->lock);
445 spin_unlock(&old_entry->lock); 445 spin_unlock(&old_entry->lock);
446 fsnotify_destroy_mark(old_entry, audit_tree_group); 446 fsnotify_destroy_mark(old_entry);
447 fsnotify_put_mark(chunk_entry); /* drop initial reference */
448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ 447 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
448 fsnotify_put_mark(old_entry); /* and kill it */
449 return 0; 449 return 0;
450} 450}
451 451
452static void audit_log_remove_rule(struct audit_krule *rule)
453{
454 struct audit_buffer *ab;
455
456 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
457 if (unlikely(!ab))
458 return;
459 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule");
461 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey);
464 audit_log_format(ab, " list=%d res=1", rule->listnr);
465 audit_log_end(ab);
466}
467
468static void kill_rules(struct audit_tree *tree) 452static void kill_rules(struct audit_tree *tree)
469{ 453{
470 struct audit_krule *rule, *next; 454 struct audit_krule *rule, *next;
471 struct audit_entry *entry; 455 struct audit_entry *entry;
456 struct audit_buffer *ab;
472 457
473 list_for_each_entry_safe(rule, next, &tree->rules, rlist) { 458 list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
474 entry = container_of(rule, struct audit_entry, rule); 459 entry = container_of(rule, struct audit_entry, rule);
@@ -476,7 +461,14 @@ static void kill_rules(struct audit_tree *tree)
476 list_del_init(&rule->rlist); 461 list_del_init(&rule->rlist);
477 if (rule->tree) { 462 if (rule->tree) {
478 /* not a half-baked one */ 463 /* not a half-baked one */
479 audit_log_remove_rule(rule); 464 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
465 audit_log_format(ab, "op=");
466 audit_log_string(ab, "remove rule");
467 audit_log_format(ab, " dir=");
468 audit_log_untrustedstring(ab, rule->tree->pathname);
469 audit_log_key(ab, rule->filterkey);
470 audit_log_format(ab, " list=%d res=1", rule->listnr);
471 audit_log_end(ab);
480 rule->tree = NULL; 472 rule->tree = NULL;
481 list_del_rcu(&entry->list); 473 list_del_rcu(&entry->list);
482 list_del(&entry->rule.list); 474 list_del(&entry->rule.list);
@@ -603,7 +595,7 @@ void audit_trim_trees(void)
603 595
604 root_mnt = collect_mounts(&path); 596 root_mnt = collect_mounts(&path);
605 path_put(&path); 597 path_put(&path);
606 if (IS_ERR(root_mnt)) 598 if (!root_mnt)
607 goto skip_it; 599 goto skip_it;
608 600
609 spin_lock(&hash_lock); 601 spin_lock(&hash_lock);
@@ -677,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
677 goto Err; 669 goto Err;
678 mnt = collect_mounts(&path); 670 mnt = collect_mounts(&path);
679 path_put(&path); 671 path_put(&path);
680 if (IS_ERR(mnt)) { 672 if (!mnt) {
681 err = PTR_ERR(mnt); 673 err = -ENOMEM;
682 goto Err; 674 goto Err;
683 } 675 }
684 676
@@ -727,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
727 return err; 719 return err;
728 tagged = collect_mounts(&path2); 720 tagged = collect_mounts(&path2);
729 path_put(&path2); 721 path_put(&path2);
730 if (IS_ERR(tagged)) 722 if (!tagged)
731 return PTR_ERR(tagged); 723 return -ENOMEM;
732 724
733 err = kern_path(old, 0, &path1); 725 err = kern_path(old, 0, &path1);
734 if (err) { 726 if (err) {
@@ -924,12 +916,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
924 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); 916 struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
925 917
926 evict_chunk(chunk); 918 evict_chunk(chunk);
927 919 fsnotify_put_mark(entry);
928 /*
929 * We are guaranteed to have at least one reference to the mark from
930 * either the inode or the caller of fsnotify_destroy_mark().
931 */
932 BUG_ON(atomic_read(&entry->refcnt) < 1);
933} 920}
934 921
935static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, 922static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
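Most of the audit_tree.c delta is fsnotify mark reference counting, but the collect_mounts() hunks also switch between two error conventions: a plain NULL on failure versus an errno encoded in the returned pointer and tested with IS_ERR()/PTR_ERR(). A compact userspace rendering of the encoded-pointer convention, assuming (as the kernel does on its supported targets) that a pointer can hold a long:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long err)    { return (void *)err; }
static inline long ptr_err(const void *p) { return (long)p; }
static inline int is_err(const void *p)
{
	/* Errors occupy the top MAX_ERRNO values of the address space,
	 * which no valid object pointer ever uses. */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *collect(int simulate_failure)
{
	static int mount_object;
	return simulate_failure ? err_ptr(-ENOMEM) : (void *)&mount_object;
}

int main(void)
{
	void *m = collect(1);

	if (is_err(m))
		printf("collect failed: %ld\n", ptr_err(m));	/* prints -12 (ENOMEM) */
	return 0;
}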
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 22831c4d369..e683869365d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,10 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
240 if (audit_enabled) { 240 if (audit_enabled) {
241 struct audit_buffer *ab; 241 struct audit_buffer *ab;
242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
243 if (unlikely(!ab))
244 return;
245 audit_log_format(ab, "auid=%u ses=%u op=", 243 audit_log_format(ab, "auid=%u ses=%u op=",
246 from_kuid(&init_user_ns, audit_get_loginuid(current)), 244 audit_get_loginuid(current),
247 audit_get_sessionid(current)); 245 audit_get_sessionid(current));
248 audit_log_string(ab, op); 246 audit_log_string(ab, op);
249 audit_log_format(ab, " path="); 247 audit_log_format(ab, " path=");
@@ -267,8 +265,7 @@ static void audit_update_watch(struct audit_parent *parent,
267 /* Run all of the watches on this parent looking for the one that 265 /* Run all of the watches on this parent looking for the one that
268 * matches the given dname */ 266 * matches the given dname */
269 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 267 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
270 if (audit_compare_dname_path(dname, owatch->path, 268 if (audit_compare_dname_path(dname, owatch->path, NULL))
271 AUDIT_NAME_FULL))
272 continue; 269 continue;
273 270
274 /* If the update involves invalidating rules, do the inode-based 271 /* If the update involves invalidating rules, do the inode-based
@@ -352,21 +349,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
352 } 349 }
353 mutex_unlock(&audit_filter_mutex); 350 mutex_unlock(&audit_filter_mutex);
354 351
355 fsnotify_destroy_mark(&parent->mark, audit_watch_group); 352 fsnotify_destroy_mark(&parent->mark);
356} 353}
357 354
358/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
359static int audit_get_nd(struct audit_watch *watch, struct path *parent) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
360{ 357{
361 struct dentry *d = kern_path_locked(watch->path, parent); 358 struct nameidata nd;
362 if (IS_ERR(d)) 359 struct dentry *d;
360 int err;
361
362 err = kern_path_parent(watch->path, &nd);
363 if (err)
364 return err;
365
366 if (nd.last_type != LAST_NORM) {
367 path_put(&nd.path);
368 return -EINVAL;
369 }
370
371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 if (IS_ERR(d)) {
374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 path_put(&nd.path);
363 return PTR_ERR(d); 376 return PTR_ERR(d);
364 mutex_unlock(&parent->dentry->d_inode->i_mutex); 377 }
365 if (d->d_inode) { 378 if (d->d_inode) {
366 /* update watch filter fields */ 379 /* update watch filter fields */
367 watch->dev = d->d_inode->i_sb->s_dev; 380 watch->dev = d->d_inode->i_sb->s_dev;
368 watch->ino = d->d_inode->i_ino; 381 watch->ino = d->d_inode->i_ino;
369 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
384
385 *parent = nd.path;
370 dput(d); 386 dput(d);
371 return 0; 387 return 0;
372} 388}
@@ -459,7 +475,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
459 475
460 if (list_empty(&parent->watches)) { 476 if (list_empty(&parent->watches)) {
461 audit_get_parent(parent); 477 audit_get_parent(parent);
462 fsnotify_destroy_mark(&parent->mark, audit_watch_group); 478 fsnotify_destroy_mark(&parent->mark);
463 audit_put_parent(parent); 479 audit_put_parent(parent);
464 } 480 }
465 } 481 }
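Both audit_get_nd() variants above do the same conceptual job: turn the watch path into a held parent directory plus the final component to watch, either via kern_path_locked() or via kern_path_parent() and lookup_one_len(). Stripped of the VFS and locking, the split is just "everything before the last slash" and "what follows it". A simplified userspace sketch (no trailing-slash handling; the helper name is made up):

#include <stdio.h>
#include <string.h>

static void split_watch_path(const char *path, char *parent, size_t plen,
			     const char **name)
{
	const char *slash = strrchr(path, '/');

	if (!slash) {				/* no directory part at all */
		snprintf(parent, plen, ".");
		*name = path;
		return;
	}
	snprintf(parent, plen, "%.*s",
		 (int)(slash == path ? 1 : slash - path), path);
	*name = slash + 1;
}

int main(void)
{
	char parent[256];
	const char *name;

	split_watch_path("/etc/passwd", parent, sizeof(parent), &name);
	printf("parent=%s name=%s\n", parent, name);	/* parent=/etc name=passwd */
	return 0;
}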
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06..f8277c80d67 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,15 +235,13 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
235 switch(listnr) { 235 switch(listnr) {
236 default: 236 default:
237 goto exit_err; 237 goto exit_err;
238 case AUDIT_FILTER_USER:
239 case AUDIT_FILTER_TYPE:
238#ifdef CONFIG_AUDITSYSCALL 240#ifdef CONFIG_AUDITSYSCALL
239 case AUDIT_FILTER_ENTRY: 241 case AUDIT_FILTER_ENTRY:
240 if (rule->action == AUDIT_ALWAYS)
241 goto exit_err;
242 case AUDIT_FILTER_EXIT: 242 case AUDIT_FILTER_EXIT:
243 case AUDIT_FILTER_TASK: 243 case AUDIT_FILTER_TASK:
244#endif 244#endif
245 case AUDIT_FILTER_USER:
246 case AUDIT_FILTER_TYPE:
247 ; 245 ;
248 } 246 }
249 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 247 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -342,8 +340,6 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
342 340
343 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 341 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
344 f->val = rule->values[i]; 342 f->val = rule->values[i];
345 f->uid = INVALID_UID;
346 f->gid = INVALID_GID;
347 343
348 err = -EINVAL; 344 err = -EINVAL;
349 if (f->op == Audit_bad) 345 if (f->op == Audit_bad)
@@ -352,32 +348,16 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
352 switch(f->type) { 348 switch(f->type) {
353 default: 349 default:
354 goto exit_free; 350 goto exit_free;
351 case AUDIT_PID:
355 case AUDIT_UID: 352 case AUDIT_UID:
356 case AUDIT_EUID: 353 case AUDIT_EUID:
357 case AUDIT_SUID: 354 case AUDIT_SUID:
358 case AUDIT_FSUID: 355 case AUDIT_FSUID:
359 case AUDIT_LOGINUID:
360 /* bit ops not implemented for uid comparisons */
361 if (f->op == Audit_bitmask || f->op == Audit_bittest)
362 goto exit_free;
363
364 f->uid = make_kuid(current_user_ns(), f->val);
365 if (!uid_valid(f->uid))
366 goto exit_free;
367 break;
368 case AUDIT_GID: 356 case AUDIT_GID:
369 case AUDIT_EGID: 357 case AUDIT_EGID:
370 case AUDIT_SGID: 358 case AUDIT_SGID:
371 case AUDIT_FSGID: 359 case AUDIT_FSGID:
372 /* bit ops not implemented for gid comparisons */ 360 case AUDIT_LOGINUID:
373 if (f->op == Audit_bitmask || f->op == Audit_bittest)
374 goto exit_free;
375
376 f->gid = make_kgid(current_user_ns(), f->val);
377 if (!gid_valid(f->gid))
378 goto exit_free;
379 break;
380 case AUDIT_PID:
381 case AUDIT_PERS: 361 case AUDIT_PERS:
382 case AUDIT_MSGTYPE: 362 case AUDIT_MSGTYPE:
383 case AUDIT_PPID: 363 case AUDIT_PPID:
@@ -405,7 +385,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
405 goto exit_free; 385 goto exit_free;
406 break; 386 break;
407 case AUDIT_FILETYPE: 387 case AUDIT_FILETYPE:
408 if (f->val & ~S_IFMT) 388 if ((f->val & ~S_IFMT) > S_IFMT)
409 goto exit_free; 389 goto exit_free;
410 break; 390 break;
411 case AUDIT_INODE: 391 case AUDIT_INODE:
@@ -455,39 +435,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
455 435
456 f->type = data->fields[i]; 436 f->type = data->fields[i];
457 f->val = data->values[i]; 437 f->val = data->values[i];
458 f->uid = INVALID_UID;
459 f->gid = INVALID_GID;
460 f->lsm_str = NULL; 438 f->lsm_str = NULL;
461 f->lsm_rule = NULL; 439 f->lsm_rule = NULL;
462 switch(f->type) { 440 switch(f->type) {
441 case AUDIT_PID:
463 case AUDIT_UID: 442 case AUDIT_UID:
464 case AUDIT_EUID: 443 case AUDIT_EUID:
465 case AUDIT_SUID: 444 case AUDIT_SUID:
466 case AUDIT_FSUID: 445 case AUDIT_FSUID:
467 case AUDIT_LOGINUID:
468 case AUDIT_OBJ_UID:
469 /* bit ops not implemented for uid comparisons */
470 if (f->op == Audit_bitmask || f->op == Audit_bittest)
471 goto exit_free;
472
473 f->uid = make_kuid(current_user_ns(), f->val);
474 if (!uid_valid(f->uid))
475 goto exit_free;
476 break;
477 case AUDIT_GID: 446 case AUDIT_GID:
478 case AUDIT_EGID: 447 case AUDIT_EGID:
479 case AUDIT_SGID: 448 case AUDIT_SGID:
480 case AUDIT_FSGID: 449 case AUDIT_FSGID:
481 case AUDIT_OBJ_GID: 450 case AUDIT_LOGINUID:
482 /* bit ops not implemented for gid comparisons */
483 if (f->op == Audit_bitmask || f->op == Audit_bittest)
484 goto exit_free;
485
486 f->gid = make_kgid(current_user_ns(), f->val);
487 if (!gid_valid(f->gid))
488 goto exit_free;
489 break;
490 case AUDIT_PID:
491 case AUDIT_PERS: 451 case AUDIT_PERS:
492 case AUDIT_MSGTYPE: 452 case AUDIT_MSGTYPE:
493 case AUDIT_PPID: 453 case AUDIT_PPID:
@@ -562,6 +522,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
562 goto exit_free; 522 goto exit_free;
563 break; 523 break;
564 case AUDIT_FILTERKEY: 524 case AUDIT_FILTERKEY:
525 err = -EINVAL;
565 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) 526 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
566 goto exit_free; 527 goto exit_free;
567 str = audit_unpack_string(&bufp, &remain, f->val); 528 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -575,11 +536,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
575 goto exit_free; 536 goto exit_free;
576 break; 537 break;
577 case AUDIT_FILETYPE: 538 case AUDIT_FILETYPE:
578 if (f->val & ~S_IFMT) 539 if ((f->val & ~S_IFMT) > S_IFMT)
579 goto exit_free;
580 break;
581 case AUDIT_FIELD_COMPARE:
582 if (f->val > AUDIT_MAX_FIELD_COMPARE)
583 goto exit_free; 540 goto exit_free;
584 break; 541 break;
585 default: 542 default:
@@ -743,23 +700,6 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
743 if (strcmp(a->filterkey, b->filterkey)) 700 if (strcmp(a->filterkey, b->filterkey))
744 return 1; 701 return 1;
745 break; 702 break;
746 case AUDIT_UID:
747 case AUDIT_EUID:
748 case AUDIT_SUID:
749 case AUDIT_FSUID:
750 case AUDIT_LOGINUID:
751 case AUDIT_OBJ_UID:
752 if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
753 return 1;
754 break;
755 case AUDIT_GID:
756 case AUDIT_EGID:
757 case AUDIT_SGID:
758 case AUDIT_FSGID:
759 case AUDIT_OBJ_GID:
760 if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
761 return 1;
762 break;
763 default: 703 default:
764 if (a->fields[i].val != b->fields[i].val) 704 if (a->fields[i].val != b->fields[i].val)
765 return 1; 705 return 1;
@@ -1109,7 +1049,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1109} 1049}
1110 1050
1111/* Log rule additions and removals */ 1051/* Log rule additions and removals */
1112static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, 1052static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
1113 char *action, struct audit_krule *rule, 1053 char *action, struct audit_krule *rule,
1114 int res) 1054 int res)
1115{ 1055{
@@ -1121,8 +1061,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1121 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1061 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1122 if (!ab) 1062 if (!ab)
1123 return; 1063 return;
1124 audit_log_format(ab, "auid=%u ses=%u", 1064 audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
1125 from_kuid(&init_user_ns, loginuid), sessionid);
1126 if (sid) { 1065 if (sid) {
1127 char *ctx = NULL; 1066 char *ctx = NULL;
1128 u32 len; 1067 u32 len;
@@ -1144,6 +1083,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1144 * audit_receive_filter - apply all rules to the specified message type 1083 * audit_receive_filter - apply all rules to the specified message type
1145 * @type: audit message type 1084 * @type: audit message type
1146 * @pid: target pid for netlink audit messages 1085 * @pid: target pid for netlink audit messages
1086 * @uid: target uid for netlink audit messages
1147 * @seq: netlink audit message sequence (serial) number 1087 * @seq: netlink audit message sequence (serial) number
1148 * @data: payload data 1088 * @data: payload data
1149 * @datasz: size of payload data 1089 * @datasz: size of payload data
@@ -1151,8 +1091,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1151 * @sessionid: sessionid for netlink audit message 1091 * @sessionid: sessionid for netlink audit message
1152 * @sid: SE Linux Security ID of sender 1092 * @sid: SE Linux Security ID of sender
1153 */ 1093 */
1154int audit_receive_filter(int type, int pid, int seq, void *data, 1094int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1155 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) 1095 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
1156{ 1096{
1157 struct task_struct *tsk; 1097 struct task_struct *tsk;
1158 struct audit_netlink_list *dest; 1098 struct audit_netlink_list *dest;
@@ -1251,110 +1191,46 @@ int audit_comparator(u32 left, u32 op, u32 right)
1251 } 1191 }
1252} 1192}
1253 1193
1254int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) 1194/* Compare given dentry name with last component in given path,
1195 * return of 0 indicates a match. */
1196int audit_compare_dname_path(const char *dname, const char *path,
1197 int *dirlen)
1255{ 1198{
1256 switch (op) { 1199 int dlen, plen;
1257 case Audit_equal:
1258 return uid_eq(left, right);
1259 case Audit_not_equal:
1260 return !uid_eq(left, right);
1261 case Audit_lt:
1262 return uid_lt(left, right);
1263 case Audit_le:
1264 return uid_lte(left, right);
1265 case Audit_gt:
1266 return uid_gt(left, right);
1267 case Audit_ge:
1268 return uid_gte(left, right);
1269 case Audit_bitmask:
1270 case Audit_bittest:
1271 default:
1272 BUG();
1273 return 0;
1274 }
1275}
1276
1277int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
1278{
1279 switch (op) {
1280 case Audit_equal:
1281 return gid_eq(left, right);
1282 case Audit_not_equal:
1283 return !gid_eq(left, right);
1284 case Audit_lt:
1285 return gid_lt(left, right);
1286 case Audit_le:
1287 return gid_lte(left, right);
1288 case Audit_gt:
1289 return gid_gt(left, right);
1290 case Audit_ge:
1291 return gid_gte(left, right);
1292 case Audit_bitmask:
1293 case Audit_bittest:
1294 default:
1295 BUG();
1296 return 0;
1297 }
1298}
1299
1300/**
1301 * parent_len - find the length of the parent portion of a pathname
1302 * @path: pathname of which to determine length
1303 */
1304int parent_len(const char *path)
1305{
1306 int plen;
1307 const char *p; 1200 const char *p;
1308 1201
1309 plen = strlen(path); 1202 if (!dname || !path)
1203 return 1;
1310 1204
1311 if (plen == 0) 1205 dlen = strlen(dname);
1312 return plen; 1206 plen = strlen(path);
1207 if (plen < dlen)
1208 return 1;
1313 1209
1314 /* disregard trailing slashes */ 1210 /* disregard trailing slashes */
1315 p = path + plen - 1; 1211 p = path + plen - 1;
1316 while ((*p == '/') && (p > path)) 1212 while ((*p == '/') && (p > path))
1317 p--; 1213 p--;
1318 1214
1319 /* walk backward until we find the next slash or hit beginning */ 1215 /* find last path component */
1320 while ((*p != '/') && (p > path)) 1216 p = p - dlen + 1;
1321 p--; 1217 if (p < path)
1322
1323 /* did we find a slash? Then increment to include it in path */
1324 if (*p == '/')
1325 p++;
1326
1327 return p - path;
1328}
1329
1330/**
1331 * audit_compare_dname_path - compare given dentry name with last component in
1332 * given path. Return of 0 indicates a match.
1333 * @dname: dentry name that we're comparing
1334 * @path: full pathname that we're comparing
1335 * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
1336 * here indicates that we must compute this value.
1337 */
1338int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
1339{
1340 int dlen, pathlen;
1341 const char *p;
1342
1343 dlen = strlen(dname);
1344 pathlen = strlen(path);
1345 if (pathlen < dlen)
1346 return 1; 1218 return 1;
1219 else if (p > path) {
1220 if (*--p != '/')
1221 return 1;
1222 else
1223 p++;
1224 }
1347 1225
1348 parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; 1226 /* return length of path's directory component */
1349 if (pathlen - parentlen != dlen) 1227 if (dirlen)
1350 return 1; 1228 *dirlen = p - path;
1351
1352 p = path + parentlen;
1353
1354 return strncmp(p, dname, dlen); 1229 return strncmp(p, dname, dlen);
1355} 1230}
1356 1231
1357static int audit_filter_user_rules(struct audit_krule *rule, 1232static int audit_filter_user_rules(struct netlink_skb_parms *cb,
1233 struct audit_krule *rule,
1358 enum audit_state *state) 1234 enum audit_state *state)
1359{ 1235{
1360 int i; 1236 int i;
@@ -1366,17 +1242,17 @@ static int audit_filter_user_rules(struct audit_krule *rule,
1366 1242
1367 switch (f->type) { 1243 switch (f->type) {
1368 case AUDIT_PID: 1244 case AUDIT_PID:
1369 result = audit_comparator(task_pid_vnr(current), f->op, f->val); 1245 result = audit_comparator(cb->creds.pid, f->op, f->val);
1370 break; 1246 break;
1371 case AUDIT_UID: 1247 case AUDIT_UID:
1372 result = audit_uid_comparator(current_uid(), f->op, f->uid); 1248 result = audit_comparator(cb->creds.uid, f->op, f->val);
1373 break; 1249 break;
1374 case AUDIT_GID: 1250 case AUDIT_GID:
1375 result = audit_gid_comparator(current_gid(), f->op, f->gid); 1251 result = audit_comparator(cb->creds.gid, f->op, f->val);
1376 break; 1252 break;
1377 case AUDIT_LOGINUID: 1253 case AUDIT_LOGINUID:
1378 result = audit_uid_comparator(audit_get_loginuid(current), 1254 result = audit_comparator(audit_get_loginuid(current),
1379 f->op, f->uid); 1255 f->op, f->val);
1380 break; 1256 break;
1381 case AUDIT_SUBJ_USER: 1257 case AUDIT_SUBJ_USER:
1382 case AUDIT_SUBJ_ROLE: 1258 case AUDIT_SUBJ_ROLE:
@@ -1404,7 +1280,7 @@ static int audit_filter_user_rules(struct audit_krule *rule,
1404 return 1; 1280 return 1;
1405} 1281}
1406 1282
1407int audit_filter_user(void) 1283int audit_filter_user(struct netlink_skb_parms *cb)
1408{ 1284{
1409 enum audit_state state = AUDIT_DISABLED; 1285 enum audit_state state = AUDIT_DISABLED;
1410 struct audit_entry *e; 1286 struct audit_entry *e;
@@ -1412,7 +1288,7 @@ int audit_filter_user(void)
1412 1288
1413 rcu_read_lock(); 1289 rcu_read_lock();
1414 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { 1290 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
1415 if (audit_filter_user_rules(&e->rule, &state)) { 1291 if (audit_filter_user_rules(cb, &e->rule, &state)) {
1416 if (state == AUDIT_DISABLED) 1292 if (state == AUDIT_DISABLED)
1417 ret = 0; 1293 ret = 0;
1418 break; 1294 break;
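The largest auditfilter.c hunk carries both shapes of the dname-versus-last-path-component helper: parent_len() plus audit_compare_dname_path(..., parentlen) on one side, the single audit_compare_dname_path(dname, path, &dirlen) on the other. Since the full algorithm is visible above, here is a self-contained userspace rendering of the parent_len() flavour that can be compiled and exercised directly; it follows the logic shown in the diff rather than any exported kernel interface:

#include <stdio.h>
#include <string.h>

/* Length of the parent portion of a pathname (trailing slashes ignored). */
static int parent_len(const char *path)
{
	int plen = strlen(path);
	const char *p;

	if (plen == 0)
		return plen;

	p = path + plen - 1;
	while ((*p == '/') && (p > path))	/* disregard trailing slashes */
		p--;
	while ((*p != '/') && (p > path))	/* back up to the previous slash */
		p--;
	if (*p == '/')				/* include the slash itself */
		p++;
	return p - path;
}

/* 0 means dname matches the last component of path. */
static int compare_dname_path(const char *dname, const char *path)
{
	int dlen = strlen(dname), plen = strlen(path);
	int parentlen = parent_len(path);

	if (plen < dlen || plen - parentlen != dlen)
		return 1;
	return strncmp(path + parentlen, dname, dlen);
}

int main(void)
{
	printf("%d\n", compare_dname_path("passwd", "/etc/passwd"));	/* 0: match */
	printf("%d\n", compare_dname_path("shadow", "/etc/passwd"));	/* non-zero */
	return 0;
}

A zero return is what audit_update_watch() in the audit_watch.c hunk relies on when deciding which watch on a parent matches the changed dentry.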
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a..ce4b054acee 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/export.h> 51#include <linux/module.h>
52#include <linux/slab.h> 52#include <linux/slab.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/socket.h> 54#include <linux/socket.h>
@@ -67,19 +67,15 @@
67#include <linux/syscalls.h> 67#include <linux/syscalls.h>
68#include <linux/capability.h> 68#include <linux/capability.h>
69#include <linux/fs_struct.h> 69#include <linux/fs_struct.h>
70#include <linux/compat.h>
71 70
72#include "audit.h" 71#include "audit.h"
73 72
74/* flags stating the success for a syscall */
75#define AUDITSC_INVALID 0
76#define AUDITSC_SUCCESS 1
77#define AUDITSC_FAILURE 2
78
79/* AUDIT_NAMES is the number of slots we reserve in the audit_context 73/* AUDIT_NAMES is the number of slots we reserve in the audit_context
80 * for saving names from getname(). If we get more names we will allocate 74 * for saving names from getname(). */
81 * a name dynamically and also add those to the list anchored by names_list. */ 75#define AUDIT_NAMES 20
82#define AUDIT_NAMES 5 76
77/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1
83 79
84/* no execve audit message should be longer than this (userspace limits) */ 80/* no execve audit message should be longer than this (userspace limits) */
85#define MAX_EXECVE_AUDIT_LEN 7500 81#define MAX_EXECVE_AUDIT_LEN 7500
@@ -103,29 +99,20 @@ struct audit_cap_data {
103 * we don't let putname() free it (instead we free all of the saved 99 * we don't let putname() free it (instead we free all of the saved
104 * pointers at syscall exit time). 100 * pointers at syscall exit time).
105 * 101 *
106 * Further, in fs/namei.c:path_lookup() we store the inode and device. 102 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
107 */
108struct audit_names { 103struct audit_names {
109 struct list_head list; /* audit_context->names_list */ 104 const char *name;
110 struct filename *name; 105 int name_len; /* number of name's characters to log */
111 unsigned long ino; 106 unsigned name_put; /* call __putname() for this name */
112 dev_t dev; 107 unsigned long ino;
113 umode_t mode; 108 dev_t dev;
114 kuid_t uid; 109 umode_t mode;
115 kgid_t gid; 110 uid_t uid;
116 dev_t rdev; 111 gid_t gid;
117 u32 osid; 112 dev_t rdev;
118 struct audit_cap_data fcap; 113 u32 osid;
119 unsigned int fcap_ver; 114 struct audit_cap_data fcap;
120 int name_len; /* number of name's characters to log */ 115 unsigned int fcap_ver;
121 unsigned char type; /* record type */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
129}; 116};
130 117
131struct audit_aux_data { 118struct audit_aux_data {
@@ -148,8 +135,8 @@ struct audit_aux_data_execve {
148struct audit_aux_data_pids { 135struct audit_aux_data_pids {
149 struct audit_aux_data d; 136 struct audit_aux_data d;
150 pid_t target_pid[AUDIT_AUX_PIDS]; 137 pid_t target_pid[AUDIT_AUX_PIDS];
151 kuid_t target_auid[AUDIT_AUX_PIDS]; 138 uid_t target_auid[AUDIT_AUX_PIDS];
152 kuid_t target_uid[AUDIT_AUX_PIDS]; 139 uid_t target_uid[AUDIT_AUX_PIDS];
153 unsigned int target_sessionid[AUDIT_AUX_PIDS]; 140 unsigned int target_sessionid[AUDIT_AUX_PIDS];
154 u32 target_sid[AUDIT_AUX_PIDS]; 141 u32 target_sid[AUDIT_AUX_PIDS];
155 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; 142 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
@@ -187,33 +174,25 @@ struct audit_context {
187 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
188 u64 prio; 175 u64 prio;
189 int return_valid; /* return code is valid */ 176 int return_valid; /* return code is valid */
190 /* 177 int name_count;
191 * The names_list is the list of all audit_names collected during this 178 struct audit_names names[AUDIT_NAMES];
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
201 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
202 struct path pwd; 180 struct path pwd;
181 struct audit_context *previous; /* For nested syscalls */
203 struct audit_aux_data *aux; 182 struct audit_aux_data *aux;
204 struct audit_aux_data *aux_pids; 183 struct audit_aux_data *aux_pids;
205 struct sockaddr_storage *sockaddr; 184 struct sockaddr_storage *sockaddr;
206 size_t sockaddr_len; 185 size_t sockaddr_len;
207 /* Save things to print about task_struct */ 186 /* Save things to print about task_struct */
208 pid_t pid, ppid; 187 pid_t pid, ppid;
209 kuid_t uid, euid, suid, fsuid; 188 uid_t uid, euid, suid, fsuid;
210 kgid_t gid, egid, sgid, fsgid; 189 gid_t gid, egid, sgid, fsgid;
211 unsigned long personality; 190 unsigned long personality;
212 int arch; 191 int arch;
213 192
214 pid_t target_pid; 193 pid_t target_pid;
215 kuid_t target_auid; 194 uid_t target_auid;
216 kuid_t target_uid; 195 uid_t target_uid;
217 unsigned int target_sessionid; 196 unsigned int target_sessionid;
218 u32 target_sid; 197 u32 target_sid;
219 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
@@ -229,14 +208,14 @@ struct audit_context {
229 long args[6]; 208 long args[6];
230 } socketcall; 209 } socketcall;
231 struct { 210 struct {
232 kuid_t uid; 211 uid_t uid;
233 kgid_t gid; 212 gid_t gid;
234 umode_t mode; 213 mode_t mode;
235 u32 osid; 214 u32 osid;
236 int has_perm; 215 int has_perm;
237 uid_t perm_uid; 216 uid_t perm_uid;
238 gid_t perm_gid; 217 gid_t perm_gid;
239 umode_t perm_mode; 218 mode_t perm_mode;
240 unsigned long qbytes; 219 unsigned long qbytes;
241 } ipc; 220 } ipc;
242 struct { 221 struct {
@@ -255,7 +234,7 @@ struct audit_context {
255 } mq_sendrecv; 234 } mq_sendrecv;
256 struct { 235 struct {
257 int oflag; 236 int oflag;
258 umode_t mode; 237 mode_t mode;
259 struct mq_attr attr; 238 struct mq_attr attr;
260 } mq_open; 239 } mq_open;
261 struct { 240 struct {
@@ -326,21 +305,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
326 } 305 }
327} 306}
328 307
329static int audit_match_filetype(struct audit_context *ctx, int val) 308static int audit_match_filetype(struct audit_context *ctx, int which)
330{ 309{
331 struct audit_names *n; 310 unsigned index = which & ~S_IFMT;
332 umode_t mode = (umode_t)val; 311 mode_t mode = which & S_IFMT;
333 312
334 if (unlikely(!ctx)) 313 if (unlikely(!ctx))
335 return 0; 314 return 0;
336 315
337 list_for_each_entry(n, &ctx->names_list, list) { 316 if (index >= ctx->name_count)
338 if ((n->ino != -1) && 317 return 0;
339 ((n->mode & S_IFMT) == mode)) 318 if (ctx->names[index].ino == -1)
340 return 1; 319 return 0;
341 } 320 if ((ctx->names[index].mode ^ mode) & S_IFMT)
342 321 return 0;
343 return 0; 322 return 1;
344} 323}
345 324
346/* 325/*
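Both audit_match_filetype() variants above boil down to one test: mask the recorded inode mode to its S_IFMT bits and compare with the value carried by the rule. The same comparison in plain userspace terms (stat() is used here only as a convenient source of a mode; the path is an arbitrary example):

#include <stdio.h>
#include <sys/stat.h>

static int filetype_matches(mode_t recorded_mode, mode_t wanted)
{
	/* Only the file-type bits take part in the comparison. */
	return (recorded_mode & S_IFMT) == (wanted & S_IFMT);
}

int main(void)
{
	struct stat st;

	if (stat("/etc/passwd", &st) == 0)
		printf("regular file? %d\n", filetype_matches(st.st_mode, S_IFREG));
	return 0;
}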
@@ -462,126 +441,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
462 return 0; 441 return 0;
463} 442}
464 443
465static int audit_compare_uid(kuid_t uid,
466 struct audit_names *name,
467 struct audit_field *f,
468 struct audit_context *ctx)
469{
470 struct audit_names *n;
471 int rc;
472
473 if (name) {
474 rc = audit_uid_comparator(uid, f->op, name->uid);
475 if (rc)
476 return rc;
477 }
478
479 if (ctx) {
480 list_for_each_entry(n, &ctx->names_list, list) {
481 rc = audit_uid_comparator(uid, f->op, n->uid);
482 if (rc)
483 return rc;
484 }
485 }
486 return 0;
487}
488
489static int audit_compare_gid(kgid_t gid,
490 struct audit_names *name,
491 struct audit_field *f,
492 struct audit_context *ctx)
493{
494 struct audit_names *n;
495 int rc;
496
497 if (name) {
498 rc = audit_gid_comparator(gid, f->op, name->gid);
499 if (rc)
500 return rc;
501 }
502
503 if (ctx) {
504 list_for_each_entry(n, &ctx->names_list, list) {
505 rc = audit_gid_comparator(gid, f->op, n->gid);
506 if (rc)
507 return rc;
508 }
509 }
510 return 0;
511}
512
513static int audit_field_compare(struct task_struct *tsk,
514 const struct cred *cred,
515 struct audit_field *f,
516 struct audit_context *ctx,
517 struct audit_names *name)
518{
519 switch (f->val) {
520 /* process to file object comparisons */
521 case AUDIT_COMPARE_UID_TO_OBJ_UID:
522 return audit_compare_uid(cred->uid, name, f, ctx);
523 case AUDIT_COMPARE_GID_TO_OBJ_GID:
524 return audit_compare_gid(cred->gid, name, f, ctx);
525 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
526 return audit_compare_uid(cred->euid, name, f, ctx);
527 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
528 return audit_compare_gid(cred->egid, name, f, ctx);
529 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
530 return audit_compare_uid(tsk->loginuid, name, f, ctx);
531 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
532 return audit_compare_uid(cred->suid, name, f, ctx);
533 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
534 return audit_compare_gid(cred->sgid, name, f, ctx);
535 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
536 return audit_compare_uid(cred->fsuid, name, f, ctx);
537 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
538 return audit_compare_gid(cred->fsgid, name, f, ctx);
539 /* uid comparisons */
540 case AUDIT_COMPARE_UID_TO_AUID:
541 return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
542 case AUDIT_COMPARE_UID_TO_EUID:
543 return audit_uid_comparator(cred->uid, f->op, cred->euid);
544 case AUDIT_COMPARE_UID_TO_SUID:
545 return audit_uid_comparator(cred->uid, f->op, cred->suid);
546 case AUDIT_COMPARE_UID_TO_FSUID:
547 return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
548 /* auid comparisons */
549 case AUDIT_COMPARE_AUID_TO_EUID:
550 return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
551 case AUDIT_COMPARE_AUID_TO_SUID:
552 return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
553 case AUDIT_COMPARE_AUID_TO_FSUID:
554 return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
555 /* euid comparisons */
556 case AUDIT_COMPARE_EUID_TO_SUID:
557 return audit_uid_comparator(cred->euid, f->op, cred->suid);
558 case AUDIT_COMPARE_EUID_TO_FSUID:
559 return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
560 /* suid comparisons */
561 case AUDIT_COMPARE_SUID_TO_FSUID:
562 return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
563 /* gid comparisons */
564 case AUDIT_COMPARE_GID_TO_EGID:
565 return audit_gid_comparator(cred->gid, f->op, cred->egid);
566 case AUDIT_COMPARE_GID_TO_SGID:
567 return audit_gid_comparator(cred->gid, f->op, cred->sgid);
568 case AUDIT_COMPARE_GID_TO_FSGID:
569 return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
570 /* egid comparisons */
571 case AUDIT_COMPARE_EGID_TO_SGID:
572 return audit_gid_comparator(cred->egid, f->op, cred->sgid);
573 case AUDIT_COMPARE_EGID_TO_FSGID:
574 return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
575 /* sgid comparison */
576 case AUDIT_COMPARE_SGID_TO_FSGID:
577 return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
578 default:
579 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
580 return 0;
581 }
582 return 0;
583}
584
585/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
586/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
587 * otherwise. 446 * otherwise.
@@ -598,14 +457,13 @@ static int audit_filter_rules(struct task_struct *tsk,
598 bool task_creation) 457 bool task_creation)
599{ 458{
600 const struct cred *cred; 459 const struct cred *cred;
601 int i, need_sid = 1; 460 int i, j, need_sid = 1;
602 u32 sid; 461 u32 sid;
603 462
604 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); 463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
605 464
606 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
607 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
608 struct audit_names *n;
609 int result = 0; 467 int result = 0;
610 468
611 switch (f->type) { 469 switch (f->type) {
@@ -620,28 +478,28 @@ static int audit_filter_rules(struct task_struct *tsk,
620 } 478 }
621 break; 479 break;
622 case AUDIT_UID: 480 case AUDIT_UID:
623 result = audit_uid_comparator(cred->uid, f->op, f->uid); 481 result = audit_comparator(cred->uid, f->op, f->val);
624 break; 482 break;
625 case AUDIT_EUID: 483 case AUDIT_EUID:
626 result = audit_uid_comparator(cred->euid, f->op, f->uid); 484 result = audit_comparator(cred->euid, f->op, f->val);
627 break; 485 break;
628 case AUDIT_SUID: 486 case AUDIT_SUID:
629 result = audit_uid_comparator(cred->suid, f->op, f->uid); 487 result = audit_comparator(cred->suid, f->op, f->val);
630 break; 488 break;
631 case AUDIT_FSUID: 489 case AUDIT_FSUID:
632 result = audit_uid_comparator(cred->fsuid, f->op, f->uid); 490 result = audit_comparator(cred->fsuid, f->op, f->val);
633 break; 491 break;
634 case AUDIT_GID: 492 case AUDIT_GID:
635 result = audit_gid_comparator(cred->gid, f->op, f->gid); 493 result = audit_comparator(cred->gid, f->op, f->val);
636 break; 494 break;
637 case AUDIT_EGID: 495 case AUDIT_EGID:
638 result = audit_gid_comparator(cred->egid, f->op, f->gid); 496 result = audit_comparator(cred->egid, f->op, f->val);
639 break; 497 break;
640 case AUDIT_SGID: 498 case AUDIT_SGID:
641 result = audit_gid_comparator(cred->sgid, f->op, f->gid); 499 result = audit_comparator(cred->sgid, f->op, f->val);
642 break; 500 break;
643 case AUDIT_FSGID: 501 case AUDIT_FSGID:
644 result = audit_gid_comparator(cred->fsgid, f->op, f->gid); 502 result = audit_comparator(cred->fsgid, f->op, f->val);
645 break; 503 break;
646 case AUDIT_PERS: 504 case AUDIT_PERS:
647 result = audit_comparator(tsk->personality, f->op, f->val); 505 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -664,14 +522,12 @@ static int audit_filter_rules(struct task_struct *tsk,
664 } 522 }
665 break; 523 break;
666 case AUDIT_DEVMAJOR: 524 case AUDIT_DEVMAJOR:
667 if (name) { 525 if (name)
668 if (audit_comparator(MAJOR(name->dev), f->op, f->val) || 526 result = audit_comparator(MAJOR(name->dev),
669 audit_comparator(MAJOR(name->rdev), f->op, f->val)) 527 f->op, f->val);
670 ++result; 528 else if (ctx) {
671 } else if (ctx) { 529 for (j = 0; j < ctx->name_count; j++) {
672 list_for_each_entry(n, &ctx->names_list, list) { 530 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
673 if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
674 audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
675 ++result; 531 ++result;
676 break; 532 break;
677 } 533 }
@@ -679,14 +535,12 @@ static int audit_filter_rules(struct task_struct *tsk,
679 } 535 }
680 break; 536 break;
681 case AUDIT_DEVMINOR: 537 case AUDIT_DEVMINOR:
682 if (name) { 538 if (name)
683 if (audit_comparator(MINOR(name->dev), f->op, f->val) || 539 result = audit_comparator(MINOR(name->dev),
684 audit_comparator(MINOR(name->rdev), f->op, f->val)) 540 f->op, f->val);
685 ++result; 541 else if (ctx) {
686 } else if (ctx) { 542 for (j = 0; j < ctx->name_count; j++) {
687 list_for_each_entry(n, &ctx->names_list, list) { 543 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
688 if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
689 audit_comparator(MINOR(n->rdev), f->op, f->val)) {
690 ++result; 544 ++result;
691 break; 545 break;
692 } 546 }
@@ -697,32 +551,8 @@ static int audit_filter_rules(struct task_struct *tsk,
697 if (name) 551 if (name)
698 result = (name->ino == f->val); 552 result = (name->ino == f->val);
699 else if (ctx) { 553 else if (ctx) {
700 list_for_each_entry(n, &ctx->names_list, list) { 554 for (j = 0; j < ctx->name_count; j++) {
701 if (audit_comparator(n->ino, f->op, f->val)) { 555 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
702 ++result;
703 break;
704 }
705 }
706 }
707 break;
708 case AUDIT_OBJ_UID:
709 if (name) {
710 result = audit_uid_comparator(name->uid, f->op, f->uid);
711 } else if (ctx) {
712 list_for_each_entry(n, &ctx->names_list, list) {
713 if (audit_uid_comparator(n->uid, f->op, f->uid)) {
714 ++result;
715 break;
716 }
717 }
718 }
719 break;
720 case AUDIT_OBJ_GID:
721 if (name) {
722 result = audit_gid_comparator(name->gid, f->op, f->gid);
723 } else if (ctx) {
724 list_for_each_entry(n, &ctx->names_list, list) {
725 if (audit_gid_comparator(n->gid, f->op, f->gid)) {
726 ++result; 556 ++result;
727 break; 557 break;
728 } 558 }
@@ -740,7 +570,7 @@ static int audit_filter_rules(struct task_struct *tsk,
740 case AUDIT_LOGINUID: 570 case AUDIT_LOGINUID:
741 result = 0; 571 result = 0;
742 if (ctx) 572 if (ctx)
743 result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); 573 result = audit_comparator(tsk->loginuid, f->op, f->val);
744 break; 574 break;
745 case AUDIT_SUBJ_USER: 575 case AUDIT_SUBJ_USER:
746 case AUDIT_SUBJ_ROLE: 576 case AUDIT_SUBJ_ROLE:
@@ -777,10 +607,11 @@ static int audit_filter_rules(struct task_struct *tsk,
777 name->osid, f->type, f->op, 607 name->osid, f->type, f->op,
778 f->lsm_rule, ctx); 608 f->lsm_rule, ctx);
779 } else if (ctx) { 609 } else if (ctx) {
780 list_for_each_entry(n, &ctx->names_list, list) { 610 for (j = 0; j < ctx->name_count; j++) {
781 if (security_audit_rule_match(n->osid, f->type, 611 if (security_audit_rule_match(
782 f->op, f->lsm_rule, 612 ctx->names[j].osid,
783 ctx)) { 613 f->type, f->op,
614 f->lsm_rule, ctx)) {
784 ++result; 615 ++result;
785 break; 616 break;
786 } 617 }
@@ -812,10 +643,8 @@ static int audit_filter_rules(struct task_struct *tsk,
812 case AUDIT_FILETYPE: 643 case AUDIT_FILETYPE:
813 result = audit_match_filetype(ctx, f->val); 644 result = audit_match_filetype(ctx, f->val);
814 break; 645 break;
815 case AUDIT_FIELD_COMPARE:
816 result = audit_field_compare(tsk, cred, f, ctx, name);
817 break;
818 } 646 }
647
819 if (!result) 648 if (!result)
820 return 0; 649 return 0;
821 } 650 }
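
The hunk above keeps the AND semantics of audit_filter_rules(): every field of a rule is run through its comparator, and a single non-match (the "if (!result) return 0;" at the bottom of the loop) rejects the whole rule. A minimal userspace sketch of that evaluation loop follows; the field layout and the comparator are simplified stand-ins, not the kernel's types.

/*
 * Toy model of the per-field AND evaluation in audit_filter_rules().
 * Every field must match; the first comparator that returns 0 rejects
 * the rule.  Build with: cc -std=c99 rule_and.c
 */
#include <stdio.h>

enum audit_op { OP_EQ, OP_GT, OP_LT };

struct field {
	long value;        /* value taken from the task/context */
	enum audit_op op;  /* comparison requested by the rule  */
	long rule_val;     /* value stored in the rule          */
};

static int audit_comparator(long left, enum audit_op op, long right)
{
	switch (op) {
	case OP_EQ: return left == right;
	case OP_GT: return left >  right;
	case OP_LT: return left <  right;
	}
	return 0;
}

/* Returns 1 only if every field of the rule matches. */
static int rule_matches(const struct field *f, int nfields)
{
	int i;

	for (i = 0; i < nfields; i++) {
		int result = audit_comparator(f[i].value, f[i].op,
					      f[i].rule_val);
		if (!result)
			return 0;	/* one miss rejects the whole rule */
	}
	return 1;
}

int main(void)
{
	struct field rule[] = {
		{ .value = 1000, .op = OP_EQ, .rule_val = 1000 }, /* uid   */
		{ .value = 42,   .op = OP_EQ, .rule_val = 42   }, /* major */
	};

	printf("rule matches: %d\n", rule_matches(rule, 2));
	return 0;
}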
@@ -893,53 +722,40 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
893 return AUDIT_BUILD_CONTEXT; 722 return AUDIT_BUILD_CONTEXT;
894} 723}
895 724
896/* 725/* At syscall exit time, this filter is called if any audit_names[] have been
897 * Given an audit_name, check the inode hash table to see if it matches.
898 * Called holding the rcu read lock to protect the use of audit_inode_hash
899 */
900static int audit_filter_inode_name(struct task_struct *tsk,
901 struct audit_names *n,
902 struct audit_context *ctx) {
903 int word, bit;
904 int h = audit_hash_ino((u32)n->ino);
905 struct list_head *list = &audit_inode_hash[h];
906 struct audit_entry *e;
907 enum audit_state state;
908
909 word = AUDIT_WORD(ctx->major);
910 bit = AUDIT_BIT(ctx->major);
911
912 if (list_empty(list))
913 return 0;
914
915 list_for_each_entry_rcu(e, list, list) {
916 if ((e->rule.mask[word] & bit) == bit &&
917 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
918 ctx->current_state = state;
919 return 1;
920 }
921 }
922
923 return 0;
924}
925
926/* At syscall exit time, this filter is called if any audit_names have been
927 * collected during syscall processing. We only check rules in sublists at hash 726 * collected during syscall processing. We only check rules in sublists at hash
928 * buckets applicable to the inode numbers in audit_names. 727 * buckets applicable to the inode numbers in audit_names[].
929 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 728 * Regarding audit_state, same rules apply as for audit_filter_syscall().
930 */ 729 */
931void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) 730void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
932{ 731{
933 struct audit_names *n; 732 int i;
733 struct audit_entry *e;
734 enum audit_state state;
934 735
935 if (audit_pid && tsk->tgid == audit_pid) 736 if (audit_pid && tsk->tgid == audit_pid)
936 return; 737 return;
937 738
938 rcu_read_lock(); 739 rcu_read_lock();
740 for (i = 0; i < ctx->name_count; i++) {
741 int word = AUDIT_WORD(ctx->major);
742 int bit = AUDIT_BIT(ctx->major);
743 struct audit_names *n = &ctx->names[i];
744 int h = audit_hash_ino((u32)n->ino);
745 struct list_head *list = &audit_inode_hash[h];
939 746
940 list_for_each_entry(n, &ctx->names_list, list) { 747 if (list_empty(list))
941 if (audit_filter_inode_name(tsk, n, ctx)) 748 continue;
942 break; 749
750 list_for_each_entry_rcu(e, list, list) {
751 if ((e->rule.mask[word] & bit) == bit &&
752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
754 rcu_read_unlock();
755 ctx->current_state = state;
756 return;
757 }
758 }
943 } 759 }
944 rcu_read_unlock(); 760 rcu_read_unlock();
945} 761}
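
audit_filter_inodes() above only walks the rule sublist whose hash bucket corresponds to each collected inode number, skipping empty buckets entirely. A small standalone sketch of that bucketing idea; audit_hash_ino() and the bucket count below are simplified assumptions, not the kernel's definitions.

/*
 * Toy model of inode-number bucketing: each inode selects one of a
 * power-of-two number of buckets, and only that bucket's rules would
 * be scanned.
 */
#include <stdio.h>

#define AUDIT_INODE_BUCKETS 32	/* assumed bucket count */

static unsigned int audit_hash_ino(unsigned long ino)
{
	return (unsigned int)(ino & (AUDIT_INODE_BUCKETS - 1));
}

int main(void)
{
	unsigned long inodes[] = { 12345, 67890, 4096 };
	unsigned long i;

	for (i = 0; i < sizeof(inodes) / sizeof(inodes[0]); i++)
		printf("inode %lu -> bucket %u\n",
		       inodes[i], audit_hash_ino(inodes[i]));
	return 0;
}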
@@ -950,7 +766,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
950{ 766{
951 struct audit_context *context = tsk->audit_context; 767 struct audit_context *context = tsk->audit_context;
952 768
953 if (!context) 769 if (likely(!context))
954 return NULL; 770 return NULL;
955 context->return_valid = return_valid; 771 context->return_valid = return_valid;
956 772
@@ -983,7 +799,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
983 799
984static inline void audit_free_names(struct audit_context *context) 800static inline void audit_free_names(struct audit_context *context)
985{ 801{
986 struct audit_names *n, *next; 802 int i;
987 803
988#if AUDIT_DEBUG == 2 804#if AUDIT_DEBUG == 2
989 if (context->put_count + context->ino_count != context->name_count) { 805 if (context->put_count + context->ino_count != context->name_count) {
@@ -994,9 +810,10 @@ static inline void audit_free_names(struct audit_context *context)
994 context->serial, context->major, context->in_syscall, 810 context->serial, context->major, context->in_syscall,
995 context->name_count, context->put_count, 811 context->name_count, context->put_count,
996 context->ino_count); 812 context->ino_count);
997 list_for_each_entry(n, &context->names_list, list) { 813 for (i = 0; i < context->name_count; i++) {
998 printk(KERN_ERR "names[%d] = %p = %s\n", i, 814 printk(KERN_ERR "names[%d] = %p = %s\n", i,
999 n->name, n->name->name ?: "(null)"); 815 context->names[i].name,
816 context->names[i].name ?: "(null)");
1000 } 817 }
1001 dump_stack(); 818 dump_stack();
1002 return; 819 return;
@@ -1007,12 +824,9 @@ static inline void audit_free_names(struct audit_context *context)
1007 context->ino_count = 0; 824 context->ino_count = 0;
1008#endif 825#endif
1009 826
1010 list_for_each_entry_safe(n, next, &context->names_list, list) { 827 for (i = 0; i < context->name_count; i++) {
1011 list_del(&n->list); 828 if (context->names[i].name && context->names[i].name_put)
1012 if (n->name && n->name_put) 829 __putname(context->names[i].name);
1013 __putname(n->name);
1014 if (n->should_free)
1015 kfree(n);
1016 } 830 }
1017 context->name_count = 0; 831 context->name_count = 0;
1018 path_put(&context->pwd); 832 path_put(&context->pwd);
@@ -1050,7 +864,6 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
1050 return NULL; 864 return NULL;
1051 audit_zero_context(context, state); 865 audit_zero_context(context, state);
1052 INIT_LIST_HEAD(&context->killed_trees); 866 INIT_LIST_HEAD(&context->killed_trees);
1053 INIT_LIST_HEAD(&context->names_list);
1054 return context; 867 return context;
1055} 868}
1056 869
@@ -1073,7 +886,7 @@ int audit_alloc(struct task_struct *tsk)
1073 return 0; /* Return if not auditing. */ 886 return 0; /* Return if not auditing. */
1074 887
1075 state = audit_filter_task(tsk, &key); 888 state = audit_filter_task(tsk, &key);
1076 if (state == AUDIT_DISABLED) 889 if (likely(state == AUDIT_DISABLED))
1077 return 0; 890 return 0;
1078 891
1079 if (!(context = audit_alloc_context(state))) { 892 if (!(context = audit_alloc_context(state))) {
@@ -1090,13 +903,29 @@ int audit_alloc(struct task_struct *tsk)
1090 903
1091static inline void audit_free_context(struct audit_context *context) 904static inline void audit_free_context(struct audit_context *context)
1092{ 905{
1093 audit_free_names(context); 906 struct audit_context *previous;
1094 unroll_tree_refs(context, NULL, 0); 907 int count = 0;
1095 free_tree_refs(context); 908
1096 audit_free_aux(context); 909 do {
1097 kfree(context->filterkey); 910 previous = context->previous;
1098 kfree(context->sockaddr); 911 if (previous || (count && count < 10)) {
1099 kfree(context); 912 ++count;
913 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
914 " freeing multiple contexts (%d)\n",
915 context->serial, context->major,
916 context->name_count, count);
917 }
918 audit_free_names(context);
919 unroll_tree_refs(context, NULL, 0);
920 free_tree_refs(context);
921 audit_free_aux(context);
922 kfree(context->filterkey);
923 kfree(context->sockaddr);
924 kfree(context);
925 context = previous;
926 } while (context);
927 if (count >= 10)
928 printk(KERN_ERR "audit: freed %d contexts\n", count);
1100} 929}
1101 930
1102void audit_log_task_context(struct audit_buffer *ab) 931void audit_log_task_context(struct audit_buffer *ab)
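
In the older code on the right-hand side, audit_free_context() walks a chain of contexts linked through ->previous and warns when more than one is freed. A toy userspace model of that chain walk; struct ctx and the message format are invented for illustration.

/*
 * Free a singly linked chain of contexts, complaining when nesting is
 * detected (more than one context on the chain).
 */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	int serial;
	struct ctx *previous;
};

static void free_context_chain(struct ctx *c)
{
	int count = 0;

	while (c) {
		struct ctx *prev = c->previous;

		if (prev || count)	/* more than one context is unusual */
			fprintf(stderr, "freeing nested context %d (#%d)\n",
				c->serial, ++count);
		free(c);
		c = prev;
	}
}

int main(void)
{
	/* Build a small chain: inner -> outer. */
	struct ctx *outer = calloc(1, sizeof(*outer));
	struct ctx *inner = calloc(1, sizeof(*inner));

	if (!outer || !inner)
		return 1;
	outer->serial = 1;
	inner->serial = 2;
	inner->previous = outer;

	free_context_chain(inner);
	return 0;
}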
@@ -1128,43 +957,13 @@ error_path:
1128 957
1129EXPORT_SYMBOL(audit_log_task_context); 958EXPORT_SYMBOL(audit_log_task_context);
1130 959
1131void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 960static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1132{ 961{
1133 const struct cred *cred;
1134 char name[sizeof(tsk->comm)]; 962 char name[sizeof(tsk->comm)];
1135 struct mm_struct *mm = tsk->mm; 963 struct mm_struct *mm = tsk->mm;
1136 char *tty; 964 struct vm_area_struct *vma;
1137
1138 if (!ab)
1139 return;
1140 965
1141 /* tsk == current */ 966 /* tsk == current */
1142 cred = current_cred();
1143
1144 spin_lock_irq(&tsk->sighand->siglock);
1145 if (tsk->signal && tsk->signal->tty)
1146 tty = tsk->signal->tty->name;
1147 else
1148 tty = "(none)";
1149 spin_unlock_irq(&tsk->sighand->siglock);
1150
1151
1152 audit_log_format(ab,
1153 " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
1154 " euid=%u suid=%u fsuid=%u"
1155 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
1156 sys_getppid(),
1157 tsk->pid,
1158 from_kuid(&init_user_ns, tsk->loginuid),
1159 from_kuid(&init_user_ns, cred->uid),
1160 from_kgid(&init_user_ns, cred->gid),
1161 from_kuid(&init_user_ns, cred->euid),
1162 from_kuid(&init_user_ns, cred->suid),
1163 from_kuid(&init_user_ns, cred->fsuid),
1164 from_kgid(&init_user_ns, cred->egid),
1165 from_kgid(&init_user_ns, cred->sgid),
1166 from_kgid(&init_user_ns, cred->fsgid),
1167 tsk->sessionid, tty);
1168 967
1169 get_task_comm(name, tsk); 968 get_task_comm(name, tsk);
1170 audit_log_format(ab, " comm="); 969 audit_log_format(ab, " comm=");
@@ -1172,17 +971,23 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1172 971
1173 if (mm) { 972 if (mm) {
1174 down_read(&mm->mmap_sem); 973 down_read(&mm->mmap_sem);
1175 if (mm->exe_file) 974 vma = mm->mmap;
1176 audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); 975 while (vma) {
976 if ((vma->vm_flags & VM_EXECUTABLE) &&
977 vma->vm_file) {
978 audit_log_d_path(ab, "exe=",
979 &vma->vm_file->f_path);
980 break;
981 }
982 vma = vma->vm_next;
983 }
1177 up_read(&mm->mmap_sem); 984 up_read(&mm->mmap_sem);
1178 } 985 }
1179 audit_log_task_context(ab); 986 audit_log_task_context(ab);
1180} 987}
1181 988
1182EXPORT_SYMBOL(audit_log_task_info);
1183
1184static int audit_log_pid_context(struct audit_context *context, pid_t pid, 989static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1185 kuid_t auid, kuid_t uid, unsigned int sessionid, 990 uid_t auid, uid_t uid, unsigned int sessionid,
1186 u32 sid, char *comm) 991 u32 sid, char *comm)
1187{ 992{
1188 struct audit_buffer *ab; 993 struct audit_buffer *ab;
@@ -1194,9 +999,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
1194 if (!ab) 999 if (!ab)
1195 return rc; 1000 return rc;
1196 1001
1197 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, 1002 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
1198 from_kuid(&init_user_ns, auid), 1003 uid, sessionid);
1199 from_kuid(&init_user_ns, uid), sessionid);
1200 if (security_secid_to_secctx(sid, &ctx, &len)) { 1004 if (security_secid_to_secctx(sid, &ctx, &len)) {
1201 audit_log_format(ab, " obj=(none)"); 1005 audit_log_format(ab, " obj=(none)");
1202 rc = 1; 1006 rc = 1;
@@ -1362,8 +1166,8 @@ static void audit_log_execve_info(struct audit_context *context,
1362 struct audit_buffer **ab, 1166 struct audit_buffer **ab,
1363 struct audit_aux_data_execve *axi) 1167 struct audit_aux_data_execve *axi)
1364{ 1168{
1365 int i, len; 1169 int i;
1366 size_t len_sent = 0; 1170 size_t len, len_sent = 0;
1367 const char __user *p; 1171 const char __user *p;
1368 char *buf; 1172 char *buf;
1369 1173
@@ -1445,10 +1249,8 @@ static void show_special(struct audit_context *context, int *call_panic)
1445 case AUDIT_IPC: { 1249 case AUDIT_IPC: {
1446 u32 osid = context->ipc.osid; 1250 u32 osid = context->ipc.osid;
1447 1251
1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", 1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
1449 from_kuid(&init_user_ns, context->ipc.uid), 1253 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1450 from_kgid(&init_user_ns, context->ipc.gid),
1451 context->ipc.mode);
1452 if (osid) { 1254 if (osid) {
1453 char *ctx = NULL; 1255 char *ctx = NULL;
1454 u32 len; 1256 u32 len;
@@ -1464,19 +1266,19 @@ static void show_special(struct audit_context *context, int *call_panic)
1464 audit_log_end(ab); 1266 audit_log_end(ab);
1465 ab = audit_log_start(context, GFP_KERNEL, 1267 ab = audit_log_start(context, GFP_KERNEL,
1466 AUDIT_IPC_SET_PERM); 1268 AUDIT_IPC_SET_PERM);
1467 if (unlikely(!ab))
1468 return;
1469 audit_log_format(ab, 1269 audit_log_format(ab,
1470 "qbytes=%lx ouid=%u ogid=%u mode=%#ho", 1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1471 context->ipc.qbytes, 1271 context->ipc.qbytes,
1472 context->ipc.perm_uid, 1272 context->ipc.perm_uid,
1473 context->ipc.perm_gid, 1273 context->ipc.perm_gid,
1474 context->ipc.perm_mode); 1274 context->ipc.perm_mode);
1275 if (!ab)
1276 return;
1475 } 1277 }
1476 break; } 1278 break; }
1477 case AUDIT_MQ_OPEN: { 1279 case AUDIT_MQ_OPEN: {
1478 audit_log_format(ab, 1280 audit_log_format(ab,
1479 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " 1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1480 "mq_msgsize=%ld mq_curmsgs=%ld", 1282 "mq_msgsize=%ld mq_curmsgs=%ld",
1481 context->mq_open.oflag, context->mq_open.mode, 1283 context->mq_open.oflag, context->mq_open.mode,
1482 context->mq_open.attr.mq_flags, 1284 context->mq_open.attr.mq_flags,
@@ -1522,76 +1324,27 @@ static void show_special(struct audit_context *context, int *call_panic)
1522 audit_log_end(ab); 1324 audit_log_end(ab);
1523} 1325}
1524 1326
1525static void audit_log_name(struct audit_context *context, struct audit_names *n,
1526 int record_num, int *call_panic)
1527{
1528 struct audit_buffer *ab;
1529 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1530 if (!ab)
1531 return; /* audit_panic has been called */
1532
1533 audit_log_format(ab, "item=%d", record_num);
1534
1535 if (n->name) {
1536 switch (n->name_len) {
1537 case AUDIT_NAME_FULL:
1538 /* log the full path */
1539 audit_log_format(ab, " name=");
1540 audit_log_untrustedstring(ab, n->name->name);
1541 break;
1542 case 0:
1543 /* name was specified as a relative path and the
1544 * directory component is the cwd */
1545 audit_log_d_path(ab, " name=", &context->pwd);
1546 break;
1547 default:
1548 /* log the name's directory component */
1549 audit_log_format(ab, " name=");
1550 audit_log_n_untrustedstring(ab, n->name->name,
1551 n->name_len);
1552 }
1553 } else
1554 audit_log_format(ab, " name=(null)");
1555
1556 if (n->ino != (unsigned long)-1) {
1557 audit_log_format(ab, " inode=%lu"
1558 " dev=%02x:%02x mode=%#ho"
1559 " ouid=%u ogid=%u rdev=%02x:%02x",
1560 n->ino,
1561 MAJOR(n->dev),
1562 MINOR(n->dev),
1563 n->mode,
1564 from_kuid(&init_user_ns, n->uid),
1565 from_kgid(&init_user_ns, n->gid),
1566 MAJOR(n->rdev),
1567 MINOR(n->rdev));
1568 }
1569 if (n->osid != 0) {
1570 char *ctx = NULL;
1571 u32 len;
1572 if (security_secid_to_secctx(
1573 n->osid, &ctx, &len)) {
1574 audit_log_format(ab, " osid=%u", n->osid);
1575 *call_panic = 2;
1576 } else {
1577 audit_log_format(ab, " obj=%s", ctx);
1578 security_release_secctx(ctx, len);
1579 }
1580 }
1581
1582 audit_log_fcaps(ab, n);
1583
1584 audit_log_end(ab);
1585}
1586
1587static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1327static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1588{ 1328{
1329 const struct cred *cred;
1589 int i, call_panic = 0; 1330 int i, call_panic = 0;
1590 struct audit_buffer *ab; 1331 struct audit_buffer *ab;
1591 struct audit_aux_data *aux; 1332 struct audit_aux_data *aux;
1592 struct audit_names *n; 1333 const char *tty;
1593 1334
1594 /* tsk == current */ 1335 /* tsk == current */
1336 context->pid = tsk->pid;
1337 if (!context->ppid)
1338 context->ppid = sys_getppid();
1339 cred = current_cred();
1340 context->uid = cred->uid;
1341 context->gid = cred->gid;
1342 context->euid = cred->euid;
1343 context->suid = cred->suid;
1344 context->fsuid = cred->fsuid;
1345 context->egid = cred->egid;
1346 context->sgid = cred->sgid;
1347 context->fsgid = cred->fsgid;
1595 context->personality = tsk->personality; 1348 context->personality = tsk->personality;
1596 1349
1597 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1350 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1606,13 +1359,32 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1606 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1359 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1607 context->return_code); 1360 context->return_code);
1608 1361
1362 spin_lock_irq(&tsk->sighand->siglock);
1363 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1364 tty = tsk->signal->tty->name;
1365 else
1366 tty = "(none)";
1367 spin_unlock_irq(&tsk->sighand->siglock);
1368
1609 audit_log_format(ab, 1369 audit_log_format(ab,
1610 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", 1370 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
1611 context->argv[0], 1371 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
1612 context->argv[1], 1372 " euid=%u suid=%u fsuid=%u"
1613 context->argv[2], 1373 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1614 context->argv[3], 1374 context->argv[0],
1615 context->name_count); 1375 context->argv[1],
1376 context->argv[2],
1377 context->argv[3],
1378 context->name_count,
1379 context->ppid,
1380 context->pid,
1381 tsk->loginuid,
1382 context->uid,
1383 context->gid,
1384 context->euid, context->suid, context->fsuid,
1385 context->egid, context->sgid, context->fsgid, tty,
1386 tsk->sessionid);
1387
1616 1388
1617 audit_log_task_info(ab, tsk); 1389 audit_log_task_info(ab, tsk);
1618 audit_log_key(ab, context->filterkey); 1390 audit_log_key(ab, context->filterkey);
@@ -1694,14 +1466,70 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1694 if (context->pwd.dentry && context->pwd.mnt) { 1466 if (context->pwd.dentry && context->pwd.mnt) {
1695 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); 1467 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
1696 if (ab) { 1468 if (ab) {
1697 audit_log_d_path(ab, " cwd=", &context->pwd); 1469 audit_log_d_path(ab, "cwd=", &context->pwd);
1698 audit_log_end(ab); 1470 audit_log_end(ab);
1699 } 1471 }
1700 } 1472 }
1473 for (i = 0; i < context->name_count; i++) {
1474 struct audit_names *n = &context->names[i];
1475
1476 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1477 if (!ab)
1478 continue; /* audit_panic has been called */
1479
1480 audit_log_format(ab, "item=%d", i);
1481
1482 if (n->name) {
1483 switch(n->name_len) {
1484 case AUDIT_NAME_FULL:
1485 /* log the full path */
1486 audit_log_format(ab, " name=");
1487 audit_log_untrustedstring(ab, n->name);
1488 break;
1489 case 0:
1490 /* name was specified as a relative path and the
1491 * directory component is the cwd */
1492 audit_log_d_path(ab, "name=", &context->pwd);
1493 break;
1494 default:
1495 /* log the name's directory component */
1496 audit_log_format(ab, " name=");
1497 audit_log_n_untrustedstring(ab, n->name,
1498 n->name_len);
1499 }
1500 } else
1501 audit_log_format(ab, " name=(null)");
1502
1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o"
1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino,
1508 MAJOR(n->dev),
1509 MINOR(n->dev),
1510 n->mode,
1511 n->uid,
1512 n->gid,
1513 MAJOR(n->rdev),
1514 MINOR(n->rdev));
1515 }
1516 if (n->osid != 0) {
1517 char *ctx = NULL;
1518 u32 len;
1519 if (security_secid_to_secctx(
1520 n->osid, &ctx, &len)) {
1521 audit_log_format(ab, " osid=%u", n->osid);
1522 call_panic = 2;
1523 } else {
1524 audit_log_format(ab, " obj=%s", ctx);
1525 security_release_secctx(ctx, len);
1526 }
1527 }
1528
1529 audit_log_fcaps(ab, n);
1701 1530
1702 i = 0; 1531 audit_log_end(ab);
1703 list_for_each_entry(n, &context->names_list, list) 1532 }
1704 audit_log_name(context, n, i++, &call_panic);
1705 1533
1706 /* Send end of event record to help user space know we are finished */ 1534 /* Send end of event record to help user space know we are finished */
1707 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1535 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
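
The loop restored in this hunk emits one PATH record per collected name, tagged item=N, falling back to name=(null) for anonymous entries. A rough userspace approximation of that per-name output; the struct and the printf format are simplified assumptions, not real audit_log_format() output.

/* Emit one "item=N" line per recorded name, as the PATH loop does. */
#include <stdio.h>

struct name_entry {
	const char   *name;	/* may be NULL for anonymous entries */
	unsigned long ino;
};

int main(void)
{
	struct name_entry names[] = {
		{ "/etc/passwd", 1234 },
		{ NULL,          5678 },	/* inode with no name */
	};
	int i, count = sizeof(names) / sizeof(names[0]);

	for (i = 0; i < count; i++)
		printf("type=PATH item=%d name=%s inode=%lu\n",
		       i,
		       names[i].name ? names[i].name : "(null)",
		       names[i].ino);
	return 0;
}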
@@ -1717,12 +1545,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1717 * 1545 *
1718 * Called from copy_process and do_exit 1546 * Called from copy_process and do_exit
1719 */ 1547 */
1720void __audit_free(struct task_struct *tsk) 1548void audit_free(struct task_struct *tsk)
1721{ 1549{
1722 struct audit_context *context; 1550 struct audit_context *context;
1723 1551
1724 context = audit_get_context(tsk, 0, 0); 1552 context = audit_get_context(tsk, 0, 0);
1725 if (!context) 1553 if (likely(!context))
1726 return; 1554 return;
1727 1555
1728 /* Check for system calls that do not go through the exit 1556 /* Check for system calls that do not go through the exit
@@ -1755,7 +1583,7 @@ void __audit_free(struct task_struct *tsk)
1755 * will only be written if another part of the kernel requests that it 1583 * will only be written if another part of the kernel requests that it
1756 * be written). 1584 * be written).
1757 */ 1585 */
1758void __audit_syscall_entry(int arch, int major, 1586void audit_syscall_entry(int arch, int major,
1759 unsigned long a1, unsigned long a2, 1587 unsigned long a1, unsigned long a2,
1760 unsigned long a3, unsigned long a4) 1588 unsigned long a3, unsigned long a4)
1761{ 1589{
@@ -1763,9 +1591,45 @@ void __audit_syscall_entry(int arch, int major,
1763 struct audit_context *context = tsk->audit_context; 1591 struct audit_context *context = tsk->audit_context;
1764 enum audit_state state; 1592 enum audit_state state;
1765 1593
1766 if (!context) 1594 if (unlikely(!context))
1767 return; 1595 return;
1768 1596
1597 /*
1598 * This happens only on certain architectures that make system
1599 * calls in kernel_thread via the entry.S interface, instead of
1600 * with direct calls. (If you are porting to a new
1601 * architecture, hitting this condition can indicate that you
1602 * got the _exit/_leave calls backward in entry.S.)
1603 *
1604 * i386 no
1605 * x86_64 no
1606 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
1607 *
1608 * This also happens with vm86 emulation in a non-nested manner
1609 * (entries without exits), so this case must be caught.
1610 */
1611 if (context->in_syscall) {
1612 struct audit_context *newctx;
1613
1614#if AUDIT_DEBUG
1615 printk(KERN_ERR
1616 "audit(:%d) pid=%d in syscall=%d;"
1617 " entering syscall=%d\n",
1618 context->serial, tsk->pid, context->major, major);
1619#endif
1620 newctx = audit_alloc_context(context->state);
1621 if (newctx) {
1622 newctx->previous = context;
1623 context = newctx;
1624 tsk->audit_context = newctx;
1625 } else {
1626 /* If we can't alloc a new context, the best we
1627 * can do is to leak memory (any pending putname
1628 * will be lost). The only other alternative is
1629 * to abandon auditing. */
1630 audit_zero_context(context, context->state);
1631 }
1632 }
1769 BUG_ON(context->in_syscall || context->name_count); 1633 BUG_ON(context->in_syscall || context->name_count);
1770 1634
1771 if (!audit_enabled) 1635 if (!audit_enabled)
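
The re-entry handling restored above pushes a fresh context and links the live one through ->previous instead of overwriting it, falling back to reusing the current context if the allocation fails. A simplified userspace model of that push; the types and helper name are invented for illustration.

/* Push a new context when a syscall entry arrives while one is live. */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	int in_syscall;
	int major;
	struct ctx *previous;
};

static struct ctx *enter_syscall(struct ctx *cur, int major)
{
	if (cur->in_syscall) {
		struct ctx *fresh = calloc(1, sizeof(*fresh));

		if (!fresh) {
			/* Allocation failed: reuse the current context. */
			cur->major = major;
			return cur;
		}
		fresh->previous = cur;	/* stack the old context */
		cur = fresh;
	}
	cur->in_syscall = 1;
	cur->major = major;
	return cur;
}

int main(void)
{
	struct ctx *c = calloc(1, sizeof(*c));

	if (!c)
		return 1;
	c = enter_syscall(c, 2);	/* first syscall */
	c = enter_syscall(c, 11);	/* nested entry  */
	printf("current major=%d, previous major=%d\n",
	       c->major, c->previous ? c->previous->major : -1);

	while (c) {			/* free the whole chain */
		struct ctx *prev = c->previous;
		free(c);
		c = prev;
	}
	return 0;
}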
@@ -1784,7 +1648,7 @@ void __audit_syscall_entry(int arch, int major,
1784 context->prio = 0; 1648 context->prio = 0;
1785 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1649 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1786 } 1650 }
1787 if (state == AUDIT_DISABLED) 1651 if (likely(state == AUDIT_DISABLED))
1788 return; 1652 return;
1789 1653
1790 context->serial = 0; 1654 context->serial = 0;
@@ -1794,29 +1658,45 @@ void __audit_syscall_entry(int arch, int major,
1794 context->ppid = 0; 1658 context->ppid = 0;
1795} 1659}
1796 1660
1661void audit_finish_fork(struct task_struct *child)
1662{
1663 struct audit_context *ctx = current->audit_context;
1664 struct audit_context *p = child->audit_context;
1665 if (!p || !ctx)
1666 return;
1667 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1668 return;
1669 p->arch = ctx->arch;
1670 p->major = ctx->major;
1671 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1672 p->ctime = ctx->ctime;
1673 p->dummy = ctx->dummy;
1674 p->in_syscall = ctx->in_syscall;
1675 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1676 p->ppid = current->pid;
1677 p->prio = ctx->prio;
1678 p->current_state = ctx->current_state;
1679}
1680
1797/** 1681/**
1798 * audit_syscall_exit - deallocate audit context after a system call 1682 * audit_syscall_exit - deallocate audit context after a system call
1799 * @success: success value of the syscall 1683 * @valid: success/failure flag
1800 * @return_code: return value of the syscall 1684 * @return_code: syscall return value
1801 * 1685 *
1802 * Tear down after system call. If the audit context has been marked as 1686 * Tear down after system call. If the audit context has been marked as
1803 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1687 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1804 * filtering, or because some other part of the kernel wrote an audit 1688 * filtering, or because some other part of the kernel write an audit
1805 * message), then write out the syscall information. In call cases, 1689 * message), then write out the syscall information. In call cases,
1806 * free the names stored from getname(). 1690 * free the names stored from getname().
1807 */ 1691 */
1808void __audit_syscall_exit(int success, long return_code) 1692void audit_syscall_exit(int valid, long return_code)
1809{ 1693{
1810 struct task_struct *tsk = current; 1694 struct task_struct *tsk = current;
1811 struct audit_context *context; 1695 struct audit_context *context;
1812 1696
1813 if (success) 1697 context = audit_get_context(tsk, valid, return_code);
1814 success = AUDITSC_SUCCESS;
1815 else
1816 success = AUDITSC_FAILURE;
1817 1698
1818 context = audit_get_context(tsk, success, return_code); 1699 if (likely(!context))
1819 if (!context)
1820 return; 1700 return;
1821 1701
1822 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1702 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1828,21 +1708,28 @@ void __audit_syscall_exit(int success, long return_code)
1828 if (!list_empty(&context->killed_trees)) 1708 if (!list_empty(&context->killed_trees))
1829 audit_kill_trees(&context->killed_trees); 1709 audit_kill_trees(&context->killed_trees);
1830 1710
1831 audit_free_names(context); 1711 if (context->previous) {
1832 unroll_tree_refs(context, NULL, 0); 1712 struct audit_context *new_context = context->previous;
1833 audit_free_aux(context); 1713 context->previous = NULL;
1834 context->aux = NULL; 1714 audit_free_context(context);
1835 context->aux_pids = NULL; 1715 tsk->audit_context = new_context;
1836 context->target_pid = 0; 1716 } else {
1837 context->target_sid = 0; 1717 audit_free_names(context);
1838 context->sockaddr_len = 0; 1718 unroll_tree_refs(context, NULL, 0);
1839 context->type = 0; 1719 audit_free_aux(context);
1840 context->fds[0] = -1; 1720 context->aux = NULL;
1841 if (context->state != AUDIT_RECORD_CONTEXT) { 1721 context->aux_pids = NULL;
1842 kfree(context->filterkey); 1722 context->target_pid = 0;
1843 context->filterkey = NULL; 1723 context->target_sid = 0;
1724 context->sockaddr_len = 0;
1725 context->type = 0;
1726 context->fds[0] = -1;
1727 if (context->state != AUDIT_RECORD_CONTEXT) {
1728 kfree(context->filterkey);
1729 context->filterkey = NULL;
1730 }
1731 tsk->audit_context = context;
1844 } 1732 }
1845 tsk->audit_context = context;
1846} 1733}
1847 1734
1848static inline void handle_one(const struct inode *inode) 1735static inline void handle_one(const struct inode *inode)
@@ -1934,55 +1821,6 @@ retry:
1934#endif 1821#endif
1935} 1822}
1936 1823
1937static struct audit_names *audit_alloc_name(struct audit_context *context,
1938 unsigned char type)
1939{
1940 struct audit_names *aname;
1941
1942 if (context->name_count < AUDIT_NAMES) {
1943 aname = &context->preallocated_names[context->name_count];
1944 memset(aname, 0, sizeof(*aname));
1945 } else {
1946 aname = kzalloc(sizeof(*aname), GFP_NOFS);
1947 if (!aname)
1948 return NULL;
1949 aname->should_free = true;
1950 }
1951
1952 aname->ino = (unsigned long)-1;
1953 aname->type = type;
1954 list_add_tail(&aname->list, &context->names_list);
1955
1956 context->name_count++;
1957#if AUDIT_DEBUG
1958 context->ino_count++;
1959#endif
1960 return aname;
1961}
1962
1963/**
1964 * audit_reusename - fill out filename with info from existing entry
1965 * @uptr: userland ptr to pathname
1966 *
1967 * Search the audit_names list for the current audit context. If there is an
1968 * existing entry with a matching "uptr" then return the filename
1969 * associated with that audit_name. If not, return NULL.
1970 */
1971struct filename *
1972__audit_reusename(const __user char *uptr)
1973{
1974 struct audit_context *context = current->audit_context;
1975 struct audit_names *n;
1976
1977 list_for_each_entry(n, &context->names_list, list) {
1978 if (!n->name)
1979 continue;
1980 if (n->name->uptr == uptr)
1981 return n->name;
1982 }
1983 return NULL;
1984}
1985
1986/** 1824/**
1987 * audit_getname - add a name to the list 1825 * audit_getname - add a name to the list
1988 * @name: name to add 1826 * @name: name to add
@@ -1990,10 +1828,12 @@ __audit_reusename(const __user char *uptr)
1990 * Add a name to the list of audit names for this context. 1828 * Add a name to the list of audit names for this context.
1991 * Called from fs/namei.c:getname(). 1829 * Called from fs/namei.c:getname().
1992 */ 1830 */
1993void __audit_getname(struct filename *name) 1831void __audit_getname(const char *name)
1994{ 1832{
1995 struct audit_context *context = current->audit_context; 1833 struct audit_context *context = current->audit_context;
1996 struct audit_names *n; 1834
1835 if (IS_ERR(name) || !name)
1836 return;
1997 1837
1998 if (!context->in_syscall) { 1838 if (!context->in_syscall) {
1999#if AUDIT_DEBUG == 2 1839#if AUDIT_DEBUG == 2
@@ -2003,21 +1843,13 @@ void __audit_getname(struct filename *name)
2003#endif 1843#endif
2004 return; 1844 return;
2005 } 1845 }
2006 1846 BUG_ON(context->name_count >= AUDIT_NAMES);
2007#if AUDIT_DEBUG 1847 context->names[context->name_count].name = name;
2008 /* The filename _must_ have a populated ->name */ 1848 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
2009 BUG_ON(!name->name); 1849 context->names[context->name_count].name_put = 1;
2010#endif 1850 context->names[context->name_count].ino = (unsigned long)-1;
2011 1851 context->names[context->name_count].osid = 0;
2012 n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); 1852 ++context->name_count;
2013 if (!n)
2014 return;
2015
2016 n->name = name;
2017 n->name_len = AUDIT_NAME_FULL;
2018 n->name_put = true;
2019 name->aname = n;
2020
2021 if (!context->pwd.dentry) 1853 if (!context->pwd.dentry)
2022 get_fs_pwd(current->fs, &context->pwd); 1854 get_fs_pwd(current->fs, &context->pwd);
2023} 1855}
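
The right-hand side of this hunk returns to a bounded names[] table: each getname() result occupies one of AUDIT_NAMES slots together with a name_put flag that tells the free path whether to release the string. A self-contained sketch of that scheme, using strdup()/free() in place of the kernel's getname()/__putname(); all names and sizes here are assumptions.

/* Bounded name table with a per-slot "put" flag. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define AUDIT_NAMES 20

struct name_slot {
	char *name;
	int   name_put;		/* free this name at syscall exit? */
};

struct toy_context {
	struct name_slot names[AUDIT_NAMES];
	int name_count;
};

static int record_name(struct toy_context *ctx, const char *name)
{
	if (ctx->name_count >= AUDIT_NAMES)
		return -1;	/* table full: the kernel warns and drops */

	ctx->names[ctx->name_count].name = strdup(name);
	ctx->names[ctx->name_count].name_put = 1;
	ctx->name_count++;
	return 0;
}

static void free_names(struct toy_context *ctx)
{
	int i;

	for (i = 0; i < ctx->name_count; i++)
		if (ctx->names[i].name && ctx->names[i].name_put)
			free(ctx->names[i].name);
	ctx->name_count = 0;
}

int main(void)
{
	struct toy_context ctx = { .name_count = 0 };

	record_name(&ctx, "/etc/hosts");
	printf("recorded %d name(s)\n", ctx.name_count);
	free_names(&ctx);
	return 0;
}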
@@ -2029,7 +1861,7 @@ void __audit_getname(struct filename *name)
2029 * then we delay the putname until syscall exit. 1861 * then we delay the putname until syscall exit.
2030 * Called from include/linux/fs.h:putname(). 1862 * Called from include/linux/fs.h:putname().
2031 */ 1863 */
2032void audit_putname(struct filename *name) 1864void audit_putname(const char *name)
2033{ 1865{
2034 struct audit_context *context = current->audit_context; 1866 struct audit_context *context = current->audit_context;
2035 1867
@@ -2039,13 +1871,12 @@ void audit_putname(struct filename *name)
2039 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 1871 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
2040 __FILE__, __LINE__, context->serial, name); 1872 __FILE__, __LINE__, context->serial, name);
2041 if (context->name_count) { 1873 if (context->name_count) {
2042 struct audit_names *n;
2043 int i; 1874 int i;
2044 1875 for (i = 0; i < context->name_count; i++)
2045 list_for_each_entry(n, &context->names_list, list)
2046 printk(KERN_ERR "name[%d] = %p = %s\n", i, 1876 printk(KERN_ERR "name[%d] = %p = %s\n", i,
2047 n->name, n->name->name ?: "(null)"); 1877 context->names[i].name,
2048 } 1878 context->names[i].name ?: "(null)");
1879 }
2049#endif 1880#endif
2050 __putname(name); 1881 __putname(name);
2051 } 1882 }
@@ -2058,19 +1889,47 @@ void audit_putname(struct filename *name)
2058 " put_count=%d\n", 1889 " put_count=%d\n",
2059 __FILE__, __LINE__, 1890 __FILE__, __LINE__,
2060 context->serial, context->major, 1891 context->serial, context->major,
2061 context->in_syscall, name->name, 1892 context->in_syscall, name, context->name_count,
2062 context->name_count, context->put_count); 1893 context->put_count);
2063 dump_stack(); 1894 dump_stack();
2064 } 1895 }
2065 } 1896 }
2066#endif 1897#endif
2067} 1898}
2068 1899
1900static int audit_inc_name_count(struct audit_context *context,
1901 const struct inode *inode)
1902{
1903 if (context->name_count >= AUDIT_NAMES) {
1904 if (inode)
1905 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1906 "dev=%02x:%02x, inode=%lu\n",
1907 MAJOR(inode->i_sb->s_dev),
1908 MINOR(inode->i_sb->s_dev),
1909 inode->i_ino);
1910
1911 else
1912 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1913 return 1;
1914 }
1915 context->name_count++;
1916#if AUDIT_DEBUG
1917 context->ino_count++;
1918#endif
1919 return 0;
1920}
1921
1922
2069static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) 1923static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
2070{ 1924{
2071 struct cpu_vfs_cap_data caps; 1925 struct cpu_vfs_cap_data caps;
2072 int rc; 1926 int rc;
2073 1927
1928 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1929 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1930 name->fcap.fE = 0;
1931 name->fcap_ver = 0;
1932
2074 if (!dentry) 1933 if (!dentry)
2075 return 0; 1934 return 0;
2076 1935
@@ -2102,84 +1961,44 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
2102} 1961}
2103 1962
2104/** 1963/**
2105 * __audit_inode - store the inode and device from a lookup 1964 * audit_inode - store the inode and device from a lookup
2106 * @name: name being audited 1965 * @name: name being audited
2107 * @dentry: dentry being audited 1966 * @dentry: dentry being audited
2108 * @parent: does this dentry represent the parent? 1967 *
1968 * Called from fs/namei.c:path_lookup().
2109 */ 1969 */
2110void __audit_inode(struct filename *name, const struct dentry *dentry, 1970void __audit_inode(const char *name, const struct dentry *dentry)
2111 unsigned int parent)
2112{ 1971{
1972 int idx;
2113 struct audit_context *context = current->audit_context; 1973 struct audit_context *context = current->audit_context;
2114 const struct inode *inode = dentry->d_inode; 1974 const struct inode *inode = dentry->d_inode;
2115 struct audit_names *n;
2116 1975
2117 if (!context->in_syscall) 1976 if (!context->in_syscall)
2118 return; 1977 return;
2119 1978 if (context->name_count
2120 if (!name) 1979 && context->names[context->name_count-1].name
2121 goto out_alloc; 1980 && context->names[context->name_count-1].name == name)
2122 1981 idx = context->name_count - 1;
2123#if AUDIT_DEBUG 1982 else if (context->name_count > 1
2124 /* The struct filename _must_ have a populated ->name */ 1983 && context->names[context->name_count-2].name
2125 BUG_ON(!name->name); 1984 && context->names[context->name_count-2].name == name)
2126#endif 1985 idx = context->name_count - 2;
2127 /* 1986 else {
2128 * If we have a pointer to an audit_names entry already, then we can 1987 /* FIXME: how much do we care about inodes that have no
2129 * just use it directly if the type is correct. 1988 * associated name? */
2130 */ 1989 if (audit_inc_name_count(context, inode))
2131 n = name->aname; 1990 return;
2132 if (n) { 1991 idx = context->name_count - 1;
2133 if (parent) { 1992 context->names[idx].name = NULL;
2134 if (n->type == AUDIT_TYPE_PARENT ||
2135 n->type == AUDIT_TYPE_UNKNOWN)
2136 goto out;
2137 } else {
2138 if (n->type != AUDIT_TYPE_PARENT)
2139 goto out;
2140 }
2141 }
2142
2143 list_for_each_entry_reverse(n, &context->names_list, list) {
2144 /* does the name pointer match? */
2145 if (!n->name || n->name->name != name->name)
2146 continue;
2147
2148 /* match the correct record type */
2149 if (parent) {
2150 if (n->type == AUDIT_TYPE_PARENT ||
2151 n->type == AUDIT_TYPE_UNKNOWN)
2152 goto out;
2153 } else {
2154 if (n->type != AUDIT_TYPE_PARENT)
2155 goto out;
2156 }
2157 }
2158
2159out_alloc:
2160 /* unable to find the name from a previous getname(). Allocate a new
2161 * anonymous entry.
2162 */
2163 n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);
2164 if (!n)
2165 return;
2166out:
2167 if (parent) {
2168 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
2169 n->type = AUDIT_TYPE_PARENT;
2170 } else {
2171 n->name_len = AUDIT_NAME_FULL;
2172 n->type = AUDIT_TYPE_NORMAL;
2173 } 1993 }
2174 handle_path(dentry); 1994 handle_path(dentry);
2175 audit_copy_inode(n, dentry, inode); 1995 audit_copy_inode(&context->names[idx], dentry, inode);
2176} 1996}
2177 1997
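
In the older __audit_inode() shown here, the inode data is attached to the most recently recorded name slot whose name pointer matches (only the last two entries are checked), with a fresh anonymous slot as the fallback. A simplified model of that slot-reuse check; find_name_slot() is a made-up helper, not a kernel function.

/* Attach inode info to a matching recent slot, or a new anonymous one. */
#include <stdio.h>

#define MAX_NAMES 4

struct slot {
	const char   *name;
	unsigned long ino;
};

struct toy_ctx {
	struct slot names[MAX_NAMES];
	int count;
};

static int find_name_slot(struct toy_ctx *ctx, const char *name)
{
	/* Only the last two entries are checked, as in the old code. */
	if (ctx->count &&
	    ctx->names[ctx->count - 1].name == name)
		return ctx->count - 1;
	if (ctx->count > 1 &&
	    ctx->names[ctx->count - 2].name == name)
		return ctx->count - 2;
	if (ctx->count >= MAX_NAMES)
		return -1;			/* table full */
	ctx->names[ctx->count].name = NULL;	/* anonymous slot */
	return ctx->count++;
}

int main(void)
{
	struct toy_ctx ctx = { .count = 0 };
	const char *path = "/tmp/file";
	int idx;

	ctx.names[ctx.count].name = path;	/* as getname recording would */
	ctx.count++;

	idx = find_name_slot(&ctx, path);
	if (idx >= 0) {
		ctx.names[idx].ino = 42;
		printf("inode stored in slot %d\n", idx);
	}
	return 0;
}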
2178/** 1998/**
2179 * __audit_inode_child - collect inode info for created/removed objects 1999 * audit_inode_child - collect inode info for created/removed objects
2180 * @parent: inode of dentry parent
2181 * @dentry: dentry being audited 2000 * @dentry: dentry being audited
2182 * @type: AUDIT_TYPE_* value that we're looking for 2001 * @parent: inode of dentry parent
2183 * 2002 *
2184 * For syscalls that create or remove filesystem objects, audit_inode 2003 * For syscalls that create or remove filesystem objects, audit_inode
2185 * can only collect information for the filesystem object's parent. 2004 * can only collect information for the filesystem object's parent.
@@ -2189,14 +2008,15 @@ out:
2189 * must be hooked prior, in order to capture the target inode during 2008 * must be hooked prior, in order to capture the target inode during
2190 * unsuccessful attempts. 2009 * unsuccessful attempts.
2191 */ 2010 */
2192void __audit_inode_child(const struct inode *parent, 2011void __audit_inode_child(const struct dentry *dentry,
2193 const struct dentry *dentry, 2012 const struct inode *parent)
2194 const unsigned char type)
2195{ 2013{
2014 int idx;
2196 struct audit_context *context = current->audit_context; 2015 struct audit_context *context = current->audit_context;
2016 const char *found_parent = NULL, *found_child = NULL;
2197 const struct inode *inode = dentry->d_inode; 2017 const struct inode *inode = dentry->d_inode;
2198 const char *dname = dentry->d_name.name; 2018 const char *dname = dentry->d_name.name;
2199 struct audit_names *n, *found_parent = NULL, *found_child = NULL; 2019 int dirlen = 0;
2200 2020
2201 if (!context->in_syscall) 2021 if (!context->in_syscall)
2202 return; 2022 return;
@@ -2204,65 +2024,71 @@ void __audit_inode_child(const struct inode *parent,
2204 if (inode) 2024 if (inode)
2205 handle_one(inode); 2025 handle_one(inode);
2206 2026
2207 /* look for a parent entry first */ 2027 /* parent is more likely, look for it first */
2208 list_for_each_entry(n, &context->names_list, list) { 2028 for (idx = 0; idx < context->name_count; idx++) {
2209 if (!n->name || n->type != AUDIT_TYPE_PARENT) 2029 struct audit_names *n = &context->names[idx];
2030
2031 if (!n->name)
2210 continue; 2032 continue;
2211 2033
2212 if (n->ino == parent->i_ino && 2034 if (n->ino == parent->i_ino &&
2213 !audit_compare_dname_path(dname, n->name->name, n->name_len)) { 2035 !audit_compare_dname_path(dname, n->name, &dirlen)) {
2214 found_parent = n; 2036 n->name_len = dirlen; /* update parent data in place */
2215 break; 2037 found_parent = n->name;
2038 goto add_names;
2216 } 2039 }
2217 } 2040 }
2218 2041
2219 /* is there a matching child entry? */ 2042 /* no matching parent, look for matching child */
2220 list_for_each_entry(n, &context->names_list, list) { 2043 for (idx = 0; idx < context->name_count; idx++) {
2221 /* can only match entries that have a name */ 2044 struct audit_names *n = &context->names[idx];
2222 if (!n->name || n->type != type)
2223 continue;
2224 2045
2225 /* if we found a parent, make sure this one is a child of it */ 2046 if (!n->name)
2226 if (found_parent && (n->name != found_parent->name))
2227 continue; 2047 continue;
2228 2048
2229 if (!strcmp(dname, n->name->name) || 2049 /* strcmp() is the more likely scenario */
2230 !audit_compare_dname_path(dname, n->name->name, 2050 if (!strcmp(dname, n->name) ||
2231 found_parent ? 2051 !audit_compare_dname_path(dname, n->name, &dirlen)) {
2232 found_parent->name_len : 2052 if (inode)
2233 AUDIT_NAME_FULL)) { 2053 audit_copy_inode(n, NULL, inode);
2234 found_child = n; 2054 else
2235 break; 2055 n->ino = (unsigned long)-1;
2056 found_child = n->name;
2057 goto add_names;
2236 } 2058 }
2237 } 2059 }
2238 2060
2061add_names:
2239 if (!found_parent) { 2062 if (!found_parent) {
2240 /* create a new, "anonymous" parent record */ 2063 if (audit_inc_name_count(context, parent))
2241 n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
2242 if (!n)
2243 return; 2064 return;
2244 audit_copy_inode(n, NULL, parent); 2065 idx = context->name_count - 1;
2066 context->names[idx].name = NULL;
2067 audit_copy_inode(&context->names[idx], NULL, parent);
2245 } 2068 }
2246 2069
2247 if (!found_child) { 2070 if (!found_child) {
2248 found_child = audit_alloc_name(context, type); 2071 if (audit_inc_name_count(context, inode))
2249 if (!found_child)
2250 return; 2072 return;
2073 idx = context->name_count - 1;
2251 2074
2252 /* Re-use the name belonging to the slot for a matching parent 2075 /* Re-use the name belonging to the slot for a matching parent
2253 * directory. All names for this context are relinquished in 2076 * directory. All names for this context are relinquished in
2254 * audit_free_names() */ 2077 * audit_free_names() */
2255 if (found_parent) { 2078 if (found_parent) {
2256 found_child->name = found_parent->name; 2079 context->names[idx].name = found_parent;
2257 found_child->name_len = AUDIT_NAME_FULL; 2080 context->names[idx].name_len = AUDIT_NAME_FULL;
2258 /* don't call __putname() */ 2081 /* don't call __putname() */
2259 found_child->name_put = false; 2082 context->names[idx].name_put = 0;
2083 } else {
2084 context->names[idx].name = NULL;
2260 } 2085 }
2086
2087 if (inode)
2088 audit_copy_inode(&context->names[idx], NULL, inode);
2089 else
2090 context->names[idx].ino = (unsigned long)-1;
2261 } 2091 }
2262 if (inode)
2263 audit_copy_inode(found_child, dentry, inode);
2264 else
2265 found_child->ino = (unsigned long)-1;
2266} 2092}
2267EXPORT_SYMBOL_GPL(__audit_inode_child); 2093EXPORT_SYMBOL_GPL(__audit_inode_child);
2268 2094
@@ -2295,28 +2121,19 @@ int auditsc_get_stamp(struct audit_context *ctx,
2295static atomic_t session_id = ATOMIC_INIT(0); 2121static atomic_t session_id = ATOMIC_INIT(0);
2296 2122
2297/** 2123/**
2298 * audit_set_loginuid - set current task's audit_context loginuid 2124 * audit_set_loginuid - set a task's audit_context loginuid
2125 * @task: task whose audit context is being modified
2299 * @loginuid: loginuid value 2126 * @loginuid: loginuid value
2300 * 2127 *
2301 * Returns 0. 2128 * Returns 0.
2302 * 2129 *
2303 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2130 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2304 */ 2131 */
2305int audit_set_loginuid(kuid_t loginuid) 2132int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2306{ 2133{
2307 struct task_struct *task = current; 2134 unsigned int sessionid = atomic_inc_return(&session_id);
2308 struct audit_context *context = task->audit_context; 2135 struct audit_context *context = task->audit_context;
2309 unsigned int sessionid;
2310 2136
2311#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2312 if (uid_valid(task->loginuid))
2313 return -EPERM;
2314#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2315 if (!capable(CAP_AUDIT_CONTROL))
2316 return -EPERM;
2317#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2318
2319 sessionid = atomic_inc_return(&session_id);
2320 if (context && context->in_syscall) { 2137 if (context && context->in_syscall) {
2321 struct audit_buffer *ab; 2138 struct audit_buffer *ab;
2322 2139
@@ -2325,10 +2142,8 @@ int audit_set_loginuid(kuid_t loginuid)
2325 audit_log_format(ab, "login pid=%d uid=%u " 2142 audit_log_format(ab, "login pid=%d uid=%u "
2326 "old auid=%u new auid=%u" 2143 "old auid=%u new auid=%u"
2327 " old ses=%u new ses=%u", 2144 " old ses=%u new ses=%u",
2328 task->pid, 2145 task->pid, task_uid(task),
2329 from_kuid(&init_user_ns, task_uid(task)), 2146 task->loginuid, loginuid,
2330 from_kuid(&init_user_ns, task->loginuid),
2331 from_kuid(&init_user_ns, loginuid),
2332 task->sessionid, sessionid); 2147 task->sessionid, sessionid);
2333 audit_log_end(ab); 2148 audit_log_end(ab);
2334 } 2149 }
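
audit_set_loginuid() draws a new session id from a global counter with atomic_inc_return(). The same idea expressed with C11 atomics in userspace; the counter name is reused here only for illustration.

/* Allocate monotonically increasing session ids from an atomic counter. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint session_id;

static unsigned int new_session(void)
{
	return atomic_fetch_add(&session_id, 1) + 1;	/* inc-and-return */
}

int main(void)
{
	printf("ses=%u\n", new_session());
	printf("ses=%u\n", new_session());
	return 0;
}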
@@ -2345,7 +2160,7 @@ int audit_set_loginuid(kuid_t loginuid)
2345 * @attr: queue attributes 2160 * @attr: queue attributes
2346 * 2161 *
2347 */ 2162 */
2348void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) 2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
2349{ 2164{
2350 struct audit_context *context = current->audit_context; 2165 struct audit_context *context = current->audit_context;
2351 2166
@@ -2445,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2445 * 2260 *
2446 * Called only after audit_ipc_obj(). 2261 * Called only after audit_ipc_obj().
2447 */ 2262 */
2448void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) 2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
2449{ 2264{
2450 struct audit_context *context = current->audit_context; 2265 struct audit_context *context = current->audit_context;
2451 2266
@@ -2456,11 +2271,14 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
2456 context->ipc.has_perm = 1; 2271 context->ipc.has_perm = 1;
2457} 2272}
2458 2273
2459int __audit_bprm(struct linux_binprm *bprm) 2274int audit_bprm(struct linux_binprm *bprm)
2460{ 2275{
2461 struct audit_aux_data_execve *ax; 2276 struct audit_aux_data_execve *ax;
2462 struct audit_context *context = current->audit_context; 2277 struct audit_context *context = current->audit_context;
2463 2278
2279 if (likely(!audit_enabled || !context || context->dummy))
2280 return 0;
2281
2464 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2282 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2465 if (!ax) 2283 if (!ax)
2466 return -ENOMEM; 2284 return -ENOMEM;
@@ -2481,10 +2299,13 @@ int __audit_bprm(struct linux_binprm *bprm)
2481 * @args: args array 2299 * @args: args array
2482 * 2300 *
2483 */ 2301 */
2484void __audit_socketcall(int nargs, unsigned long *args) 2302void audit_socketcall(int nargs, unsigned long *args)
2485{ 2303{
2486 struct audit_context *context = current->audit_context; 2304 struct audit_context *context = current->audit_context;
2487 2305
2306 if (likely(!context || context->dummy))
2307 return;
2308
2488 context->type = AUDIT_SOCKETCALL; 2309 context->type = AUDIT_SOCKETCALL;
2489 context->socketcall.nargs = nargs; 2310 context->socketcall.nargs = nargs;
2490 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2311 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2510,10 +2331,13 @@ void __audit_fd_pair(int fd1, int fd2)
2510 * 2331 *
2511 * Returns 0 for success or NULL context or < 0 on error. 2332 * Returns 0 for success or NULL context or < 0 on error.
2512 */ 2333 */
2513int __audit_sockaddr(int len, void *a) 2334int audit_sockaddr(int len, void *a)
2514{ 2335{
2515 struct audit_context *context = current->audit_context; 2336 struct audit_context *context = current->audit_context;
2516 2337
2338 if (likely(!context || context->dummy))
2339 return 0;
2340
2517 if (!context->sockaddr) { 2341 if (!context->sockaddr) {
2518 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); 2342 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2519 if (!p) 2343 if (!p)
@@ -2551,12 +2375,12 @@ int __audit_signal_info(int sig, struct task_struct *t)
2551 struct audit_aux_data_pids *axp; 2375 struct audit_aux_data_pids *axp;
2552 struct task_struct *tsk = current; 2376 struct task_struct *tsk = current;
2553 struct audit_context *ctx = tsk->audit_context; 2377 struct audit_context *ctx = tsk->audit_context;
2554 kuid_t uid = current_uid(), t_uid = task_uid(t); 2378 uid_t uid = current_uid(), t_uid = task_uid(t);
2555 2379
2556 if (audit_pid && t->tgid == audit_pid) { 2380 if (audit_pid && t->tgid == audit_pid) {
2557 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2381 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2558 audit_sig_pid = tsk->pid; 2382 audit_sig_pid = tsk->pid;
2559 if (uid_valid(tsk->loginuid)) 2383 if (tsk->loginuid != -1)
2560 audit_sig_uid = tsk->loginuid; 2384 audit_sig_uid = tsk->loginuid;
2561 else 2385 else
2562 audit_sig_uid = uid; 2386 audit_sig_uid = uid;
@@ -2675,33 +2499,6 @@ void __audit_mmap_fd(int fd, int flags)
2675 context->type = AUDIT_MMAP; 2499 context->type = AUDIT_MMAP;
2676} 2500}
2677 2501
2678static void audit_log_task(struct audit_buffer *ab)
2679{
2680 kuid_t auid, uid;
2681 kgid_t gid;
2682 unsigned int sessionid;
2683
2684 auid = audit_get_loginuid(current);
2685 sessionid = audit_get_sessionid(current);
2686 current_uid_gid(&uid, &gid);
2687
2688 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2689 from_kuid(&init_user_ns, auid),
2690 from_kuid(&init_user_ns, uid),
2691 from_kgid(&init_user_ns, gid),
2692 sessionid);
2693 audit_log_task_context(ab);
2694 audit_log_format(ab, " pid=%d comm=", current->pid);
2695 audit_log_untrustedstring(ab, current->comm);
2696}
2697
2698static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2699{
2700 audit_log_task(ab);
2701 audit_log_format(ab, " reason=");
2702 audit_log_string(ab, reason);
2703 audit_log_format(ab, " sig=%ld", signr);
2704}
2705/** 2502/**
2706 * audit_core_dumps - record information about processes that end abnormally 2503 * audit_core_dumps - record information about processes that end abnormally
2707 * @signr: signal value 2504 * @signr: signal value
@@ -2712,6 +2509,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2712void audit_core_dumps(long signr) 2509void audit_core_dumps(long signr)
2713{ 2510{
2714 struct audit_buffer *ab; 2511 struct audit_buffer *ab;
2512 u32 sid;
2513 uid_t auid = audit_get_loginuid(current), uid;
2514 gid_t gid;
2515 unsigned int sessionid = audit_get_sessionid(current);
2715 2516
2716 if (!audit_enabled) 2517 if (!audit_enabled)
2717 return; 2518 return;
@@ -2720,25 +2521,24 @@ void audit_core_dumps(long signr)
2720 return; 2521 return;
2721 2522
2722 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2523 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2723 if (unlikely(!ab)) 2524 current_uid_gid(&uid, &gid);
2724 return; 2525 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2725 audit_log_abend(ab, "memory violation", signr); 2526 auid, uid, gid, sessionid);
2726 audit_log_end(ab); 2527 security_task_getsecid(current, &sid);
2727} 2528 if (sid) {
2728 2529 char *ctx = NULL;
2729void __audit_seccomp(unsigned long syscall, long signr, int code) 2530 u32 len;
2730{
2731 struct audit_buffer *ab;
2732 2531
2733 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP); 2532 if (security_secid_to_secctx(sid, &ctx, &len))
2734 if (unlikely(!ab)) 2533 audit_log_format(ab, " ssid=%u", sid);
2735 return; 2534 else {
2736 audit_log_task(ab); 2535 audit_log_format(ab, " subj=%s", ctx);
2536 security_release_secctx(ctx, len);
2537 }
2538 }
2539 audit_log_format(ab, " pid=%d comm=", current->pid);
2540 audit_log_untrustedstring(ab, current->comm);
2737 audit_log_format(ab, " sig=%ld", signr); 2541 audit_log_format(ab, " sig=%ld", signr);
2738 audit_log_format(ab, " syscall=%ld", syscall);
2739 audit_log_format(ab, " compat=%d", is_compat_task());
2740 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
2741 audit_log_format(ab, " code=0x%x", code);
2742 audit_log_end(ab); 2542 audit_log_end(ab);
2743} 2543}
2744 2544
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d9725948..283c529f8b1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
10#include <linux/audit.h> 10#include <linux/audit.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/export.h> 13#include <linux/module.h>
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
@@ -287,84 +287,74 @@ error:
287} 287}
288 288
289/** 289/**
290 * has_ns_capability - Does a task have a capability in a specific user ns 290 * has_capability - Does a task have a capability in init_user_ns
291 * @t: The task in question 291 * @t: The task in question
292 * @ns: target user namespace
293 * @cap: The capability to be tested for 292 * @cap: The capability to be tested for
294 * 293 *
295 * Return true if the specified task has the given superior capability 294 * Return true if the specified task has the given superior capability
296 * currently in effect to the specified user namespace, false if not. 295 * currently in effect to the initial user namespace, false if not.
297 * 296 *
298 * Note that this does not set PF_SUPERPRIV on the task. 297 * Note that this does not set PF_SUPERPRIV on the task.
299 */ 298 */
300bool has_ns_capability(struct task_struct *t, 299bool has_capability(struct task_struct *t, int cap)
301 struct user_namespace *ns, int cap)
302{ 300{
303 int ret; 301 int ret = security_real_capable(t, &init_user_ns, cap);
304
305 rcu_read_lock();
306 ret = security_capable(__task_cred(t), ns, cap);
307 rcu_read_unlock();
308 302
309 return (ret == 0); 303 return (ret == 0);
310} 304}
311 305
312/** 306/**
313 * has_capability - Does a task have a capability in init_user_ns 307 * has_capability - Does a task have a capability in a specific user ns
314 * @t: The task in question 308 * @t: The task in question
309 * @ns: target user namespace
315 * @cap: The capability to be tested for 310 * @cap: The capability to be tested for
316 * 311 *
317 * Return true if the specified task has the given superior capability 312 * Return true if the specified task has the given superior capability
318 * currently in effect to the initial user namespace, false if not. 313 * currently in effect to the specified user namespace, false if not.
319 * 314 *
320 * Note that this does not set PF_SUPERPRIV on the task. 315 * Note that this does not set PF_SUPERPRIV on the task.
321 */ 316 */
322bool has_capability(struct task_struct *t, int cap) 317bool has_ns_capability(struct task_struct *t,
318 struct user_namespace *ns, int cap)
323{ 319{
324 return has_ns_capability(t, &init_user_ns, cap); 320 int ret = security_real_capable(t, ns, cap);
321
322 return (ret == 0);
325} 323}
326 324
327/** 325/**
328 * has_ns_capability_noaudit - Does a task have a capability (unaudited) 326 * has_capability_noaudit - Does a task have a capability (unaudited)
329 * in a specific user ns.
330 * @t: The task in question 327 * @t: The task in question
331 * @ns: target user namespace
332 * @cap: The capability to be tested for 328 * @cap: The capability to be tested for
333 * 329 *
334 * Return true if the specified task has the given superior capability 330 * Return true if the specified task has the given superior capability
335 * currently in effect to the specified user namespace, false if not. 331 * currently in effect to init_user_ns, false if not. Don't write an
336 * Do not write an audit message for the check. 332 * audit message for the check.
337 * 333 *
338 * Note that this does not set PF_SUPERPRIV on the task. 334 * Note that this does not set PF_SUPERPRIV on the task.
339 */ 335 */
340bool has_ns_capability_noaudit(struct task_struct *t, 336bool has_capability_noaudit(struct task_struct *t, int cap)
341 struct user_namespace *ns, int cap)
342{ 337{
343 int ret; 338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
344
345 rcu_read_lock();
346 ret = security_capable_noaudit(__task_cred(t), ns, cap);
347 rcu_read_unlock();
348 339
349 return (ret == 0); 340 return (ret == 0);
350} 341}
351 342
352/** 343/**
353 * has_capability_noaudit - Does a task have a capability (unaudited) in the 344 * capable - Determine if the current task has a superior capability in effect
354 * initial user ns
355 * @t: The task in question
356 * @cap: The capability to be tested for 345 * @cap: The capability to be tested for
357 * 346 *
358 * Return true if the specified task has the given superior capability 347 * Return true if the current task has the given superior capability currently
359 * currently in effect to init_user_ns, false if not. Don't write an 348 * available for use, false if not.
360 * audit message for the check.
361 * 349 *
362 * Note that this does not set PF_SUPERPRIV on the task. 350 * This sets PF_SUPERPRIV on the task if the capability is available on the
351 * assumption that it's about to be used.
363 */ 352 */
364bool has_capability_noaudit(struct task_struct *t, int cap) 353bool capable(int cap)
365{ 354{
366 return has_ns_capability_noaudit(t, &init_user_ns, cap); 355 return ns_capable(&init_user_ns, cap);
367} 356}
357EXPORT_SYMBOL(capable);
368 358
369/** 359/**
370 * ns_capable - Determine if the current task has a superior capability in effect 360 * ns_capable - Determine if the current task has a superior capability in effect
@@ -384,7 +374,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
384 BUG(); 374 BUG();
385 } 375 }
386 376
387 if (security_capable(current_cred(), ns, cap) == 0) { 377 if (security_capable(ns, current_cred(), cap) == 0) {
388 current->flags |= PF_SUPERPRIV; 378 current->flags |= PF_SUPERPRIV;
389 return true; 379 return true;
390 } 380 }
@@ -393,20 +383,18 @@ bool ns_capable(struct user_namespace *ns, int cap)
393EXPORT_SYMBOL(ns_capable); 383EXPORT_SYMBOL(ns_capable);
394 384
395/** 385/**
396 * capable - Determine if the current task has a superior capability in effect 386 * task_ns_capable - Determine whether current task has a superior
397 * @cap: The capability to be tested for 387 * capability targeted at a specific task's user namespace.
388 * @t: The task whose user namespace is targeted.
389 * @cap: The capability in question.
398 * 390 *
399 * Return true if the current task has the given superior capability currently 391 * Return true if it does, false otherwise.
400 * available for use, false if not.
401 *
402 * This sets PF_SUPERPRIV on the task if the capability is available on the
403 * assumption that it's about to be used.
404 */ 392 */
405bool capable(int cap) 393bool task_ns_capable(struct task_struct *t, int cap)
406{ 394{
407 return ns_capable(&init_user_ns, cap); 395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
408} 396}
409EXPORT_SYMBOL(capable); 397EXPORT_SYMBOL(task_ns_capable);
410 398
411/** 399/**
412 * nsown_capable - Check superior capability to one's own user_ns 400 * nsown_capable - Check superior capability to one's own user_ns
@@ -419,24 +407,3 @@ bool nsown_capable(int cap)
419{ 407{
420 return ns_capable(current_user_ns(), cap); 408 return ns_capable(current_user_ns(), cap);
421} 409}
422
423/**
424 * inode_capable - Check superior capability over inode
425 * @inode: The inode in question
426 * @cap: The capability in question
427 *
428 * Return true if the current task has the given superior capability
429 * targeted at it's own user namespace and that the given inode is owned
430 * by the current user namespace or a child namespace.
431 *
432 * Currently we check to see if an inode is owned by the current
433 * user namespace by seeing if the inode's owner maps into the
434 * current user namespace.
435 *
436 */
437bool inode_capable(const struct inode *inode, int cap)
438{
439 struct user_namespace *ns = current_user_ns();
440
441 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
442}
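The kernel-doc comments in this hunk describe a strictly layered API: capable() checks the current task in the initial user namespace, ns_capable() takes an explicit namespace, and both sides of the diff ultimately defer to a security hook. A minimal userspace model of that layering follows, with the LSM call stubbed out and the PF_SUPERPRIV bookkeeping omitted; the struct and helper names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the layering shown above. The stub stands in
 * for security_capable()/security_real_capable(); 0 means "granted",
 * matching the (ret == 0) convention in the diff.
 */
struct user_namespace { const char *name; };
static struct user_namespace init_user_ns = { "init_user_ns" };

static int security_capable_stub(struct user_namespace *ns, int cap)
{
        (void)ns;
        return cap == 21 ? 0 : -1;   /* pretend only CAP_SYS_ADMIN (21) is granted */
}

static bool ns_capable_model(struct user_namespace *ns, int cap)
{
        return security_capable_stub(ns, cap) == 0;
}

static bool capable_model(int cap)
{
        /* capable() is just ns_capable() pinned to the initial namespace */
        return ns_capable_model(&init_user_ns, cap);
}

int main(void)
{
        printf("capable(21) -> %d\n", capable_model(21));
        printf("capable(7)  -> %d\n", capable_model(7));
        return 0;
}

The has_capability()/has_ns_capability() pair in the hunk follows the same shape, only for an arbitrary task instead of current.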
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798f..54a36fe288f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,40 +60,18 @@
60#include <linux/eventfd.h> 60#include <linux/eventfd.h>
61#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
63#include <linux/kthread.h>
64 63
65#include <linux/atomic.h> 64#include <linux/atomic.h>
66 65
67/* css deactivation bias, makes css->refcnt negative to deny new trygets */
68#define CSS_DEACT_BIAS INT_MIN
69
70/*
71 * cgroup_mutex is the master lock. Any modification to cgroup or its
72 * hierarchy must be performed while holding it.
73 *
74 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
75 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
76 * release_agent_path and so on. Modifying requires both cgroup_mutex and
77 * cgroup_root_mutex. Readers can acquire either of the two. This is to
78 * break the following locking order cycle.
79 *
80 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
81 * B. namespace_sem -> cgroup_mutex
82 *
83 * B happens only through cgroup_show_options() and using cgroup_root_mutex
84 * breaks it.
85 */
86static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
87static DEFINE_MUTEX(cgroup_root_mutex);
88 67
89/* 68/*
90 * Generate an array of cgroup subsystem pointers. At boot time, this is 69 * Generate an array of cgroup subsystem pointers. At boot time, this is
91 * populated with the built in subsystems, and modular subsystems are 70 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
92 * registered after that. The mutable section of this array is protected by 71 * registered after that. The mutable section of this array is protected by
93 * cgroup_mutex. 72 * cgroup_mutex.
94 */ 73 */
95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 74#define SUBSYS(_x) &_x ## _subsys,
96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 75static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 76#include <linux/cgroup_subsys.h>
99}; 77};
@@ -112,13 +90,13 @@ struct cgroupfs_root {
112 * The bitmask of subsystems intended to be attached to this 90 * The bitmask of subsystems intended to be attached to this
113 * hierarchy 91 * hierarchy
114 */ 92 */
115 unsigned long subsys_mask; 93 unsigned long subsys_bits;
116 94
117 /* Unique id for this hierarchy. */ 95 /* Unique id for this hierarchy. */
118 int hierarchy_id; 96 int hierarchy_id;
119 97
120 /* The bitmask of subsystems currently attached to this hierarchy */ 98 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask; 99 unsigned long actual_subsys_bits;
122 100
123 /* A list running through the attached subsystems */ 101 /* A list running through the attached subsystems */
124 struct list_head subsys_list; 102 struct list_head subsys_list;
@@ -132,15 +110,9 @@ struct cgroupfs_root {
132 /* A list running through the active hierarchies */ 110 /* A list running through the active hierarchies */
133 struct list_head root_list; 111 struct list_head root_list;
134 112
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */ 113 /* Hierarchy-specific flags */
139 unsigned long flags; 114 unsigned long flags;
140 115
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */ 116 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX]; 117 char release_agent_path[PATH_MAX];
146 118
@@ -156,15 +128,6 @@ struct cgroupfs_root {
156static struct cgroupfs_root rootnode; 128static struct cgroupfs_root rootnode;
157 129
158/* 130/*
159 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
160 */
161struct cfent {
162 struct list_head node;
163 struct dentry *dentry;
164 struct cftype *type;
165};
166
167/*
168 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 131 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
169 * cgroup_subsys->use_id != 0. 132 * cgroup_subsys->use_id != 0.
170 */ 133 */
@@ -174,8 +137,8 @@ struct css_id {
174 * The css to which this ID points. This pointer is set to valid value 137 * The css to which this ID points. This pointer is set to valid value
175 * after cgroup is populated. If cgroup is removed, this will be NULL. 138 * after cgroup is populated. If cgroup is removed, this will be NULL.
176 * This pointer is expected to be RCU-safe because destroy() 139 * This pointer is expected to be RCU-safe because destroy()
177 * is called after synchronize_rcu(). But for safe use, css_tryget() 140 * is called after synchronize_rcu(). But for safe use, css_is_removed()
178 * should be used for avoiding race. 141 * css_tryget() should be used for avoiding race.
179 */ 142 */
180 struct cgroup_subsys_state __rcu *css; 143 struct cgroup_subsys_state __rcu *css;
181 /* 144 /*
@@ -245,10 +208,6 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
245 */ 208 */
246static int need_forkexit_callback __read_mostly; 209static int need_forkexit_callback __read_mostly;
247 210
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
252#ifdef CONFIG_PROVE_LOCKING 211#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void) 212int cgroup_lock_is_held(void)
254{ 213{
@@ -263,19 +222,6 @@ int cgroup_lock_is_held(void)
263 222
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 223EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265 224
266static int css_unbias_refcnt(int refcnt)
267{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
269}
270
271/* the current nr of refs, always >= 0 whether @css is deactivated or not */
272static int css_refcnt(struct cgroup_subsys_state *css)
273{
274 int v = atomic_read(&css->refcnt);
275
276 return css_unbias_refcnt(v);
277}
278
279/* convenient tests for these bits */ 225/* convenient tests for these bits */
280inline int cgroup_is_removed(const struct cgroup *cgrp) 226inline int cgroup_is_removed(const struct cgroup *cgrp)
281{ 227{
@@ -284,8 +230,7 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
284 230
285/* bits in struct cgroupfs_root flags field */ 231/* bits in struct cgroupfs_root flags field */
286enum { 232enum {
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
288 ROOT_XATTR, /* supports extended attributes */
289}; 234};
290 235
291static int cgroup_is_releasable(const struct cgroup *cgrp) 236static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -301,6 +246,11 @@ static int notify_on_release(const struct cgroup *cgrp)
301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 246 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
302} 247}
303 248
249static int clone_children(const struct cgroup *cgrp)
250{
251 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
252}
253
304/* 254/*
305 * for_each_subsys() allows you to iterate on each subsystem attached to 255 * for_each_subsys() allows you to iterate on each subsystem attached to
306 * an active hierarchy 256 * an active hierarchy
@@ -312,29 +262,41 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
312#define for_each_active_root(_root) \ 262#define for_each_active_root(_root) \
313list_for_each_entry(_root, &roots, root_list) 263list_for_each_entry(_root, &roots, root_list)
314 264
315static inline struct cgroup *__d_cgrp(struct dentry *dentry) 265/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */
267static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp);
272
273/*
274 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
275 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
276 * reference to css->refcnt. In general, this refcnt is expected to goes down
277 * to zero, soon.
278 *
279 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
280 */
281DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
282
283static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
316{ 284{
317 return dentry->d_fsdata; 285 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
286 wake_up_all(&cgroup_rmdir_waitq);
318} 287}
319 288
320static inline struct cfent *__d_cfe(struct dentry *dentry) 289void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
321{ 290{
322 return dentry->d_fsdata; 291 css_get(css);
323} 292}
324 293
325static inline struct cftype *__d_cft(struct dentry *dentry) 294void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
326{ 295{
327 return __d_cfe(dentry)->type; 296 cgroup_wakeup_rmdir_waiter(css->cgroup);
297 css_put(css);
328} 298}
329 299
330/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */
332static LIST_HEAD(release_list);
333static DEFINE_RAW_SPINLOCK(release_list_lock);
334static void cgroup_release_agent(struct work_struct *work);
335static DECLARE_WORK(release_agent_work, cgroup_release_agent);
336static void check_for_release(struct cgroup *cgrp);
337
338/* Link structure for associating css_set objects with cgroups */ 300/* Link structure for associating css_set objects with cgroups */
339struct cg_cgroup_link { 301struct cg_cgroup_link {
340 /* 302 /*
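The block restored here documents a small handshake: an rmdir() caller that finds the cgroup still pinned sets CGRP_WAIT_ON_RMDIR and sleeps on cgroup_rmdir_waitq, and whoever drops the last pinning reference clears the bit and wakes all waiters. A hedged userspace analog of that handshake, with a condition variable standing in for the waitqueue; the flag, counter and function names are invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rmdir_waitq = PTHREAD_COND_INITIALIZER;
static bool wait_on_rmdir;      /* plays the role of CGRP_WAIT_ON_RMDIR */
static int busy_refs = 1;       /* the reference that keeps rmdir() waiting */

static void *release_side(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        busy_refs--;                            /* last reference goes away */
        if (wait_on_rmdir) {                    /* test_and_clear_bit(...) */
                wait_on_rmdir = false;
                pthread_cond_broadcast(&rmdir_waitq);   /* wake_up_all() */
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_mutex_lock(&lock);
        wait_on_rmdir = true;                   /* rmdir() decides it must wait */
        pthread_create(&t, NULL, release_side, NULL);
        while (busy_refs > 0)
                pthread_cond_wait(&rmdir_waitq, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        printf("rmdir can proceed\n");
        return 0;
}

Broadcasting rather than signalling mirrors wake_up_all(): every parked rmdir() attempt gets to re-check its condition.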
@@ -394,52 +356,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
394 return &css_set_table[index]; 356 return &css_set_table[index];
395} 357}
396 358
397/* We don't maintain the lists running through each css_set to its 359static void free_css_set_work(struct work_struct *work)
398 * task until after the first call to cgroup_iter_start(). This
399 * reduces the fork()/exit() overhead for people who have cgroups
400 * compiled into their kernel but not actually in use */
401static int use_task_css_set_links __read_mostly;
402
403static void __put_css_set(struct css_set *cg, int taskexit)
404{ 360{
361 struct css_set *cg = container_of(work, struct css_set, work);
405 struct cg_cgroup_link *link; 362 struct cg_cgroup_link *link;
406 struct cg_cgroup_link *saved_link; 363 struct cg_cgroup_link *saved_link;
407 /*
408 * Ensure that the refcount doesn't hit zero while any readers
409 * can see it. Similar to atomic_dec_and_lock(), but for an
410 * rwlock
411 */
412 if (atomic_add_unless(&cg->refcount, -1, 1))
413 return;
414 write_lock(&css_set_lock);
415 if (!atomic_dec_and_test(&cg->refcount)) {
416 write_unlock(&css_set_lock);
417 return;
418 }
419
420 /* This css_set is dead. unlink it and release cgroup refcounts */
421 hlist_del(&cg->hlist);
422 css_set_count--;
423 364
365 write_lock(&css_set_lock);
424 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 366 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
425 cg_link_list) { 367 cg_link_list) {
426 struct cgroup *cgrp = link->cgrp; 368 struct cgroup *cgrp = link->cgrp;
427 list_del(&link->cg_link_list); 369 list_del(&link->cg_link_list);
428 list_del(&link->cgrp_link_list); 370 list_del(&link->cgrp_link_list);
429 if (atomic_dec_and_test(&cgrp->count) && 371 if (atomic_dec_and_test(&cgrp->count)) {
430 notify_on_release(cgrp)) {
431 if (taskexit)
432 set_bit(CGRP_RELEASABLE, &cgrp->flags);
433 check_for_release(cgrp); 372 check_for_release(cgrp);
373 cgroup_wakeup_rmdir_waiter(cgrp);
434 } 374 }
435
436 kfree(link); 375 kfree(link);
437 } 376 }
438
439 write_unlock(&css_set_lock); 377 write_unlock(&css_set_lock);
440 kfree_rcu(cg, rcu_head); 378
379 kfree(cg);
380}
381
382static void free_css_set_rcu(struct rcu_head *obj)
383{
384 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
385
386 INIT_WORK(&cg->work, free_css_set_work);
387 schedule_work(&cg->work);
441} 388}
442 389
390/* We don't maintain the lists running through each css_set to its
391 * task until after the first call to cgroup_iter_start(). This
392 * reduces the fork()/exit() overhead for people who have cgroups
393 * compiled into their kernel but not actually in use */
394static int use_task_css_set_links __read_mostly;
395
443/* 396/*
444 * refcounted get/put for css_set objects 397 * refcounted get/put for css_set objects
445 */ 398 */
@@ -448,14 +401,26 @@ static inline void get_css_set(struct css_set *cg)
448 atomic_inc(&cg->refcount); 401 atomic_inc(&cg->refcount);
449} 402}
450 403
451static inline void put_css_set(struct css_set *cg) 404static void put_css_set(struct css_set *cg)
452{ 405{
453 __put_css_set(cg, 0); 406 /*
454} 407 * Ensure that the refcount doesn't hit zero while any readers
408 * can see it. Similar to atomic_dec_and_lock(), but for an
409 * rwlock
410 */
411 if (atomic_add_unless(&cg->refcount, -1, 1))
412 return;
413 write_lock(&css_set_lock);
414 if (!atomic_dec_and_test(&cg->refcount)) {
415 write_unlock(&css_set_lock);
416 return;
417 }
455 418
456static inline void put_css_set_taskexit(struct css_set *cg) 419 hlist_del(&cg->hlist);
457{ 420 css_set_count--;
458 __put_css_set(cg, 1); 421
422 write_unlock(&css_set_lock);
423 call_rcu(&cg->rcu_head, free_css_set_rcu);
459} 424}
460 425
461/* 426/*
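put_css_set() spells the idiom out in its comment: drop references without css_set_lock as long as the count cannot reach zero, and take the writer lock only for the decrement that might, so a reader holding the lock never sees a linked css_set with a zero count. Below is a runnable userspace sketch of that dec-and-lock pattern, assuming C11 atomics and a pthread rwlock; the struct and its fields are invented for the example.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_uint refcount;
        pthread_rwlock_t lock;  /* stands in for css_set_lock */
        bool linked;            /* stands in for hash-list membership */
};

/* returns true if this call dropped the last reference */
static bool put_obj(struct obj *o)
{
        unsigned int old = atomic_load(&o->refcount);

        /* fast path, analogous to atomic_add_unless(&cg->refcount, -1, 1) */
        while (old > 1) {
                if (atomic_compare_exchange_weak(&o->refcount, &old, old - 1))
                        return false;           /* not the last reference */
        }

        /* slow path: the final decrement happens under the writer lock */
        pthread_rwlock_wrlock(&o->lock);
        if (atomic_fetch_sub(&o->refcount, 1) != 1) {
                /* someone re-took a reference in the meantime */
                pthread_rwlock_unlock(&o->lock);
                return false;
        }
        o->linked = false;                      /* hlist_del() equivalent */
        pthread_rwlock_unlock(&o->lock);
        return true;                            /* caller may now free o */
}

int main(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        atomic_init(&o->refcount, 2);
        pthread_rwlock_init(&o->lock, NULL);
        o->linked = true;

        printf("first put frees?  %d\n", put_obj(o));    /* 0 */
        printf("second put frees? %d\n", put_obj(o));    /* 1 */
        free(o);
        return 0;
}

The fast path plays the role of atomic_add_unless(&cg->refcount, -1, 1); the slow path mirrors the write_lock()/atomic_dec_and_test() pair in the hunk.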
@@ -560,7 +525,7 @@ static struct css_set *find_existing_css_set(
560 * won't change, so no need for locking. 525 * won't change, so no need for locking.
561 */ 526 */
562 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 527 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
563 if (root->subsys_mask & (1UL << i)) { 528 if (root->subsys_bits & (1UL << i)) {
564 /* Subsystem is in this hierarchy. So we want 529 /* Subsystem is in this hierarchy. So we want
565 * the subsystem state from the new 530 * the subsystem state from the new
566 * cgroup */ 531 * cgroup */
@@ -784,12 +749,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
784 * The task_lock() exception 749 * The task_lock() exception
785 * 750 *
786 * The need for this exception arises from the action of 751 * The need for this exception arises from the action of
787 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 752 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
788 * another. It does so using cgroup_mutex, however there are 753 * another. It does so using cgroup_mutex, however there are
789 * several performance critical places that need to reference 754 * several performance critical places that need to reference
790 * task->cgroup without the expense of grabbing a system global 755 * task->cgroups without the expense of grabbing a system global
791 * mutex. Therefore except as noted below, when dereferencing or, as 756 * mutex. Therefore except as noted below, when dereferencing or, as
792 * in cgroup_attach_task(), modifying a task's cgroup pointer we use 757 * in cgroup_attach_task(), modifying a task's cgroups pointer we use
793 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 758 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
794 * the task_struct routinely used for such matters. 759 * the task_struct routinely used for such matters.
795 * 760 *
@@ -825,11 +790,10 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
825 * -> cgroup_mkdir. 790 * -> cgroup_mkdir.
826 */ 791 */
827 792
828static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 793static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
829static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 794static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
830static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 795static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
831static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 796static int cgroup_populate_dir(struct cgroup *cgrp);
832 unsigned long subsys_mask);
833static const struct inode_operations cgroup_dir_inode_operations; 797static const struct inode_operations cgroup_dir_inode_operations;
834static const struct file_operations proc_cgroupstats_operations; 798static const struct file_operations proc_cgroupstats_operations;
835 799
@@ -841,7 +805,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
841static int alloc_css_id(struct cgroup_subsys *ss, 805static int alloc_css_id(struct cgroup_subsys *ss,
842 struct cgroup *parent, struct cgroup *child); 806 struct cgroup *parent, struct cgroup *child);
843 807
844static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 808static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
845{ 809{
846 struct inode *inode = new_inode(sb); 810 struct inode *inode = new_inode(sb);
847 811
@@ -856,6 +820,25 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
856 return inode; 820 return inode;
857} 821}
858 822
823/*
824 * Call subsys's pre_destroy handler.
825 * This is called before css refcnt check.
826 */
827static int cgroup_call_pre_destroy(struct cgroup *cgrp)
828{
829 struct cgroup_subsys *ss;
830 int ret = 0;
831
832 for_each_subsys(cgrp->root, ss)
833 if (ss->pre_destroy) {
834 ret = ss->pre_destroy(ss, cgrp);
835 if (ret)
836 break;
837 }
838
839 return ret;
840}
841
859static void cgroup_diput(struct dentry *dentry, struct inode *inode) 842static void cgroup_diput(struct dentry *dentry, struct inode *inode)
860{ 843{
861 /* is dentry a directory ? if so, kfree() associated cgroup */ 844 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -876,7 +859,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
876 * Release the subsystem state objects. 859 * Release the subsystem state objects.
877 */ 860 */
878 for_each_subsys(cgrp->root, ss) 861 for_each_subsys(cgrp->root, ss)
879 ss->css_free(cgrp); 862 ss->destroy(ss, cgrp);
880 863
881 cgrp->root->number_of_cgroups--; 864 cgrp->root->number_of_cgroups--;
882 mutex_unlock(&cgroup_mutex); 865 mutex_unlock(&cgroup_mutex);
@@ -893,20 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
893 */ 876 */
894 BUG_ON(!list_empty(&cgrp->pidlists)); 877 BUG_ON(!list_empty(&cgrp->pidlists));
895 878
896 simple_xattrs_free(&cgrp->xattrs);
897
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
899 kfree_rcu(cgrp, rcu_head); 879 kfree_rcu(cgrp, rcu_head);
900 } else {
901 struct cfent *cfe = __d_cfe(dentry);
902 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
903 struct cftype *cft = cfe->type;
904
905 WARN_ONCE(!list_empty(&cfe->node) &&
906 cgrp != &cgrp->root->top_cgroup,
907 "cfe still linked for %s\n", cfe->type->name);
908 kfree(cfe);
909 simple_xattrs_free(&cft->xattrs);
910 } 880 }
911 iput(inode); 881 iput(inode);
912} 882}
@@ -925,53 +895,34 @@ static void remove_dir(struct dentry *d)
925 dput(parent); 895 dput(parent);
926} 896}
927 897
928static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 898static void cgroup_clear_directory(struct dentry *dentry)
929{ 899{
930 struct cfent *cfe; 900 struct list_head *node;
931 901
932 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 902 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
933 lockdep_assert_held(&cgroup_mutex); 903 spin_lock(&dentry->d_lock);
934 904 node = dentry->d_subdirs.next;
935 list_for_each_entry(cfe, &cgrp->files, node) { 905 while (node != &dentry->d_subdirs) {
936 struct dentry *d = cfe->dentry; 906 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
937 907
938 if (cft && cfe->type != cft) 908 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
939 continue; 909 list_del_init(node);
940 910 if (d->d_inode) {
941 dget(d); 911 /* This should never be called on a cgroup
942 d_delete(d); 912 * directory with child cgroups */
943 simple_unlink(cgrp->dentry->d_inode, d); 913 BUG_ON(d->d_inode->i_mode & S_IFDIR);
944 list_del_init(&cfe->node); 914 dget_dlock(d);
945 dput(d); 915 spin_unlock(&d->d_lock);
946 916 spin_unlock(&dentry->d_lock);
947 return 0; 917 d_delete(d);
948 } 918 simple_unlink(dentry->d_inode, d);
949 return -ENOENT; 919 dput(d);
950} 920 spin_lock(&dentry->d_lock);
951 921 } else
952/** 922 spin_unlock(&d->d_lock);
953 * cgroup_clear_directory - selective removal of base and subsystem files 923 node = dentry->d_subdirs.next;
954 * @dir: directory containing the files
955 * @base_files: true if the base files should be removed
956 * @subsys_mask: mask of the subsystem ids whose files should be removed
957 */
958static void cgroup_clear_directory(struct dentry *dir, bool base_files,
959 unsigned long subsys_mask)
960{
961 struct cgroup *cgrp = __d_cgrp(dir);
962 struct cgroup_subsys *ss;
963
964 for_each_subsys(cgrp->root, ss) {
965 struct cftype_set *set;
966 if (!test_bit(ss->subsys_id, &subsys_mask))
967 continue;
968 list_for_each_entry(set, &ss->cftsets, node)
969 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
970 }
971 if (base_files) {
972 while (!list_empty(&cgrp->files))
973 cgroup_rm_file(cgrp, NULL);
974 } 924 }
925 spin_unlock(&dentry->d_lock);
975} 926}
976 927
977/* 928/*
@@ -980,9 +931,8 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
980static void cgroup_d_remove_dir(struct dentry *dentry) 931static void cgroup_d_remove_dir(struct dentry *dentry)
981{ 932{
982 struct dentry *parent; 933 struct dentry *parent;
983 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
984 934
985 cgroup_clear_directory(dentry, true, root->subsys_mask); 935 cgroup_clear_directory(dentry);
986 936
987 parent = dentry->d_parent; 937 parent = dentry->d_parent;
988 spin_lock(&parent->d_lock); 938 spin_lock(&parent->d_lock);
@@ -999,22 +949,21 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
999 * returns an error, no reference counts are touched. 949 * returns an error, no reference counts are touched.
1000 */ 950 */
1001static int rebind_subsystems(struct cgroupfs_root *root, 951static int rebind_subsystems(struct cgroupfs_root *root,
1002 unsigned long final_subsys_mask) 952 unsigned long final_bits)
1003{ 953{
1004 unsigned long added_mask, removed_mask; 954 unsigned long added_bits, removed_bits;
1005 struct cgroup *cgrp = &root->top_cgroup; 955 struct cgroup *cgrp = &root->top_cgroup;
1006 int i; 956 int i;
1007 957
1008 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 958 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1009 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1010 959
1011 removed_mask = root->actual_subsys_mask & ~final_subsys_mask; 960 removed_bits = root->actual_subsys_bits & ~final_bits;
1012 added_mask = final_subsys_mask & ~root->actual_subsys_mask; 961 added_bits = final_bits & ~root->actual_subsys_bits;
1013 /* Check that any added subsystems are currently free */ 962 /* Check that any added subsystems are currently free */
1014 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 963 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1015 unsigned long bit = 1UL << i; 964 unsigned long bit = 1UL << i;
1016 struct cgroup_subsys *ss = subsys[i]; 965 struct cgroup_subsys *ss = subsys[i];
1017 if (!(bit & added_mask)) 966 if (!(bit & added_bits))
1018 continue; 967 continue;
1019 /* 968 /*
1020 * Nobody should tell us to do a subsys that doesn't exist: 969 * Nobody should tell us to do a subsys that doesn't exist:
@@ -1039,33 +988,37 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 988 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 struct cgroup_subsys *ss = subsys[i]; 989 struct cgroup_subsys *ss = subsys[i];
1041 unsigned long bit = 1UL << i; 990 unsigned long bit = 1UL << i;
1042 if (bit & added_mask) { 991 if (bit & added_bits) {
1043 /* We're binding this subsystem to this hierarchy */ 992 /* We're binding this subsystem to this hierarchy */
1044 BUG_ON(ss == NULL); 993 BUG_ON(ss == NULL);
1045 BUG_ON(cgrp->subsys[i]); 994 BUG_ON(cgrp->subsys[i]);
1046 BUG_ON(!dummytop->subsys[i]); 995 BUG_ON(!dummytop->subsys[i]);
1047 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 996 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
997 mutex_lock(&ss->hierarchy_mutex);
1048 cgrp->subsys[i] = dummytop->subsys[i]; 998 cgrp->subsys[i] = dummytop->subsys[i];
1049 cgrp->subsys[i]->cgroup = cgrp; 999 cgrp->subsys[i]->cgroup = cgrp;
1050 list_move(&ss->sibling, &root->subsys_list); 1000 list_move(&ss->sibling, &root->subsys_list);
1051 ss->root = root; 1001 ss->root = root;
1052 if (ss->bind) 1002 if (ss->bind)
1053 ss->bind(cgrp); 1003 ss->bind(ss, cgrp);
1004 mutex_unlock(&ss->hierarchy_mutex);
1054 /* refcount was already taken, and we're keeping it */ 1005 /* refcount was already taken, and we're keeping it */
1055 } else if (bit & removed_mask) { 1006 } else if (bit & removed_bits) {
1056 /* We're removing this subsystem */ 1007 /* We're removing this subsystem */
1057 BUG_ON(ss == NULL); 1008 BUG_ON(ss == NULL);
1058 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1009 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1010 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1011 mutex_lock(&ss->hierarchy_mutex);
1060 if (ss->bind) 1012 if (ss->bind)
1061 ss->bind(dummytop); 1013 ss->bind(ss, dummytop);
1062 dummytop->subsys[i]->cgroup = dummytop; 1014 dummytop->subsys[i]->cgroup = dummytop;
1063 cgrp->subsys[i] = NULL; 1015 cgrp->subsys[i] = NULL;
1064 subsys[i]->root = &rootnode; 1016 subsys[i]->root = &rootnode;
1065 list_move(&ss->sibling, &rootnode.subsys_list); 1017 list_move(&ss->sibling, &rootnode.subsys_list);
1018 mutex_unlock(&ss->hierarchy_mutex);
1066 /* subsystem is now free - drop reference on module */ 1019 /* subsystem is now free - drop reference on module */
1067 module_put(ss->module); 1020 module_put(ss->module);
1068 } else if (bit & final_subsys_mask) { 1021 } else if (bit & final_bits) {
1069 /* Subsystem state should already exist */ 1022 /* Subsystem state should already exist */
1070 BUG_ON(ss == NULL); 1023 BUG_ON(ss == NULL);
1071 BUG_ON(!cgrp->subsys[i]); 1024 BUG_ON(!cgrp->subsys[i]);
@@ -1082,39 +1035,37 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1082 BUG_ON(cgrp->subsys[i]); 1035 BUG_ON(cgrp->subsys[i]);
1083 } 1036 }
1084 } 1037 }
1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1038 root->subsys_bits = root->actual_subsys_bits = final_bits;
1086 synchronize_rcu(); 1039 synchronize_rcu();
1087 1040
1088 return 0; 1041 return 0;
1089} 1042}
1090 1043
1091static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1044static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1092{ 1045{
1093 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1046 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
1094 struct cgroup_subsys *ss; 1047 struct cgroup_subsys *ss;
1095 1048
1096 mutex_lock(&cgroup_root_mutex); 1049 mutex_lock(&cgroup_mutex);
1097 for_each_subsys(root, ss) 1050 for_each_subsys(root, ss)
1098 seq_printf(seq, ",%s", ss->name); 1051 seq_printf(seq, ",%s", ss->name);
1099 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1052 if (test_bit(ROOT_NOPREFIX, &root->flags))
1100 seq_puts(seq, ",noprefix"); 1053 seq_puts(seq, ",noprefix");
1101 if (test_bit(ROOT_XATTR, &root->flags))
1102 seq_puts(seq, ",xattr");
1103 if (strlen(root->release_agent_path)) 1054 if (strlen(root->release_agent_path))
1104 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1055 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1105 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1056 if (clone_children(&root->top_cgroup))
1106 seq_puts(seq, ",clone_children"); 1057 seq_puts(seq, ",clone_children");
1107 if (strlen(root->name)) 1058 if (strlen(root->name))
1108 seq_printf(seq, ",name=%s", root->name); 1059 seq_printf(seq, ",name=%s", root->name);
1109 mutex_unlock(&cgroup_root_mutex); 1060 mutex_unlock(&cgroup_mutex);
1110 return 0; 1061 return 0;
1111} 1062}
1112 1063
1113struct cgroup_sb_opts { 1064struct cgroup_sb_opts {
1114 unsigned long subsys_mask; 1065 unsigned long subsys_bits;
1115 unsigned long flags; 1066 unsigned long flags;
1116 char *release_agent; 1067 char *release_agent;
1117 bool cpuset_clone_children; 1068 bool clone_children;
1118 char *name; 1069 char *name;
1119 /* User explicitly requested empty subsystem */ 1070 /* User explicitly requested empty subsystem */
1120 bool none; 1071 bool none;
@@ -1165,11 +1116,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1165 continue; 1116 continue;
1166 } 1117 }
1167 if (!strcmp(token, "clone_children")) { 1118 if (!strcmp(token, "clone_children")) {
1168 opts->cpuset_clone_children = true; 1119 opts->clone_children = true;
1169 continue;
1170 }
1171 if (!strcmp(token, "xattr")) {
1172 set_bit(ROOT_XATTR, &opts->flags);
1173 continue; 1120 continue;
1174 } 1121 }
1175 if (!strncmp(token, "release_agent=", 14)) { 1122 if (!strncmp(token, "release_agent=", 14)) {
@@ -1220,7 +1167,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1220 /* Mutually exclusive option 'all' + subsystem name */ 1167 /* Mutually exclusive option 'all' + subsystem name */
1221 if (all_ss) 1168 if (all_ss)
1222 return -EINVAL; 1169 return -EINVAL;
1223 set_bit(i, &opts->subsys_mask); 1170 set_bit(i, &opts->subsys_bits);
1224 one_ss = true; 1171 one_ss = true;
1225 1172
1226 break; 1173 break;
@@ -1241,7 +1188,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1241 continue; 1188 continue;
1242 if (ss->disabled) 1189 if (ss->disabled)
1243 continue; 1190 continue;
1244 set_bit(i, &opts->subsys_mask); 1191 set_bit(i, &opts->subsys_bits);
1245 } 1192 }
1246 } 1193 }
1247 1194
@@ -1253,19 +1200,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1253 * the cpuset subsystem. 1200 * the cpuset subsystem.
1254 */ 1201 */
1255 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1202 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
1256 (opts->subsys_mask & mask)) 1203 (opts->subsys_bits & mask))
1257 return -EINVAL; 1204 return -EINVAL;
1258 1205
1259 1206
1260 /* Can't specify "none" and some subsystems */ 1207 /* Can't specify "none" and some subsystems */
1261 if (opts->subsys_mask && opts->none) 1208 if (opts->subsys_bits && opts->none)
1262 return -EINVAL; 1209 return -EINVAL;
1263 1210
1264 /* 1211 /*
1265 * We either have to specify by name or by subsystems. (So all 1212 * We either have to specify by name or by subsystems. (So all
1266 * empty hierarchies must have a name). 1213 * empty hierarchies must have a name).
1267 */ 1214 */
1268 if (!opts->subsys_mask && !opts->name) 1215 if (!opts->subsys_bits && !opts->name)
1269 return -EINVAL; 1216 return -EINVAL;
1270 1217
1271 /* 1218 /*
@@ -1274,10 +1221,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1274 * take duplicate reference counts on a subsystem that's already used, 1221 * take duplicate reference counts on a subsystem that's already used,
1275 * but rebind_subsystems handles this case. 1222 * but rebind_subsystems handles this case.
1276 */ 1223 */
1277 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1224 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1278 unsigned long bit = 1UL << i; 1225 unsigned long bit = 1UL << i;
1279 1226
1280 if (!(bit & opts->subsys_mask)) 1227 if (!(bit & opts->subsys_bits))
1281 continue; 1228 continue;
1282 if (!try_module_get(subsys[i]->module)) { 1229 if (!try_module_get(subsys[i]->module)) {
1283 module_pin_failed = true; 1230 module_pin_failed = true;
@@ -1290,11 +1237,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1290 * raced with a module_delete call, and to the user this is 1237 * raced with a module_delete call, and to the user this is
1291 * essentially a "subsystem doesn't exist" case. 1238 * essentially a "subsystem doesn't exist" case.
1292 */ 1239 */
1293 for (i--; i >= 0; i--) { 1240 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1294 /* drop refcounts only on the ones we took */ 1241 /* drop refcounts only on the ones we took */
1295 unsigned long bit = 1UL << i; 1242 unsigned long bit = 1UL << i;
1296 1243
1297 if (!(bit & opts->subsys_mask)) 1244 if (!(bit & opts->subsys_bits))
1298 continue; 1245 continue;
1299 module_put(subsys[i]->module); 1246 module_put(subsys[i]->module);
1300 } 1247 }
@@ -1304,13 +1251,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1304 return 0; 1251 return 0;
1305} 1252}
1306 1253
1307static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1254static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1308{ 1255{
1309 int i; 1256 int i;
1310 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1257 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1311 unsigned long bit = 1UL << i; 1258 unsigned long bit = 1UL << i;
1312 1259
1313 if (!(bit & subsys_mask)) 1260 if (!(bit & subsys_bits))
1314 continue; 1261 continue;
1315 module_put(subsys[i]->module); 1262 module_put(subsys[i]->module);
1316 } 1263 }
@@ -1322,56 +1269,37 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1322 struct cgroupfs_root *root = sb->s_fs_info; 1269 struct cgroupfs_root *root = sb->s_fs_info;
1323 struct cgroup *cgrp = &root->top_cgroup; 1270 struct cgroup *cgrp = &root->top_cgroup;
1324 struct cgroup_sb_opts opts; 1271 struct cgroup_sb_opts opts;
1325 unsigned long added_mask, removed_mask;
1326 1272
1327 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1273 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1328 mutex_lock(&cgroup_mutex); 1274 mutex_lock(&cgroup_mutex);
1329 mutex_lock(&cgroup_root_mutex);
1330 1275
1331 /* See what subsystems are wanted */ 1276 /* See what subsystems are wanted */
1332 ret = parse_cgroupfs_options(data, &opts); 1277 ret = parse_cgroupfs_options(data, &opts);
1333 if (ret) 1278 if (ret)
1334 goto out_unlock; 1279 goto out_unlock;
1335 1280
1336 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1337 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1338 task_tgid_nr(current), current->comm);
1339
1340 added_mask = opts.subsys_mask & ~root->subsys_mask;
1341 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1342
1343 /* Don't allow flags or name to change at remount */ 1281 /* Don't allow flags or name to change at remount */
1344 if (opts.flags != root->flags || 1282 if (opts.flags != root->flags ||
1345 (opts.name && strcmp(opts.name, root->name))) { 1283 (opts.name && strcmp(opts.name, root->name))) {
1346 ret = -EINVAL; 1284 ret = -EINVAL;
1347 drop_parsed_module_refcounts(opts.subsys_mask); 1285 drop_parsed_module_refcounts(opts.subsys_bits);
1348 goto out_unlock; 1286 goto out_unlock;
1349 } 1287 }
1350 1288
1351 /* 1289 ret = rebind_subsystems(root, opts.subsys_bits);
1352 * Clear out the files of subsystems that should be removed, do
1353 * this before rebind_subsystems, since rebind_subsystems may
1354 * change this hierarchy's subsys_list.
1355 */
1356 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1357
1358 ret = rebind_subsystems(root, opts.subsys_mask);
1359 if (ret) { 1290 if (ret) {
1360 /* rebind_subsystems failed, re-populate the removed files */ 1291 drop_parsed_module_refcounts(opts.subsys_bits);
1361 cgroup_populate_dir(cgrp, false, removed_mask);
1362 drop_parsed_module_refcounts(opts.subsys_mask);
1363 goto out_unlock; 1292 goto out_unlock;
1364 } 1293 }
1365 1294
1366 /* re-populate subsystem files */ 1295 /* (re)populate subsystem files */
1367 cgroup_populate_dir(cgrp, false, added_mask); 1296 cgroup_populate_dir(cgrp);
1368 1297
1369 if (opts.release_agent) 1298 if (opts.release_agent)
1370 strcpy(root->release_agent_path, opts.release_agent); 1299 strcpy(root->release_agent_path, opts.release_agent);
1371 out_unlock: 1300 out_unlock:
1372 kfree(opts.release_agent); 1301 kfree(opts.release_agent);
1373 kfree(opts.name); 1302 kfree(opts.name);
1374 mutex_unlock(&cgroup_root_mutex);
1375 mutex_unlock(&cgroup_mutex); 1303 mutex_unlock(&cgroup_mutex);
1376 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1304 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1377 return ret; 1305 return ret;
@@ -1388,29 +1316,23 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1388{ 1316{
1389 INIT_LIST_HEAD(&cgrp->sibling); 1317 INIT_LIST_HEAD(&cgrp->sibling);
1390 INIT_LIST_HEAD(&cgrp->children); 1318 INIT_LIST_HEAD(&cgrp->children);
1391 INIT_LIST_HEAD(&cgrp->files);
1392 INIT_LIST_HEAD(&cgrp->css_sets); 1319 INIT_LIST_HEAD(&cgrp->css_sets);
1393 INIT_LIST_HEAD(&cgrp->allcg_node);
1394 INIT_LIST_HEAD(&cgrp->release_list); 1320 INIT_LIST_HEAD(&cgrp->release_list);
1395 INIT_LIST_HEAD(&cgrp->pidlists); 1321 INIT_LIST_HEAD(&cgrp->pidlists);
1396 mutex_init(&cgrp->pidlist_mutex); 1322 mutex_init(&cgrp->pidlist_mutex);
1397 INIT_LIST_HEAD(&cgrp->event_list); 1323 INIT_LIST_HEAD(&cgrp->event_list);
1398 spin_lock_init(&cgrp->event_list_lock); 1324 spin_lock_init(&cgrp->event_list_lock);
1399 simple_xattrs_init(&cgrp->xattrs);
1400} 1325}
1401 1326
1402static void init_cgroup_root(struct cgroupfs_root *root) 1327static void init_cgroup_root(struct cgroupfs_root *root)
1403{ 1328{
1404 struct cgroup *cgrp = &root->top_cgroup; 1329 struct cgroup *cgrp = &root->top_cgroup;
1405
1406 INIT_LIST_HEAD(&root->subsys_list); 1330 INIT_LIST_HEAD(&root->subsys_list);
1407 INIT_LIST_HEAD(&root->root_list); 1331 INIT_LIST_HEAD(&root->root_list);
1408 INIT_LIST_HEAD(&root->allcg_list);
1409 root->number_of_cgroups = 1; 1332 root->number_of_cgroups = 1;
1410 cgrp->root = root; 1333 cgrp->root = root;
1411 cgrp->top_cgroup = cgrp; 1334 cgrp->top_cgroup = cgrp;
1412 init_cgroup_housekeeping(cgrp); 1335 init_cgroup_housekeeping(cgrp);
1413 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1414} 1336}
1415 1337
1416static bool init_root_id(struct cgroupfs_root *root) 1338static bool init_root_id(struct cgroupfs_root *root)
@@ -1451,8 +1373,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)
1451 * If we asked for subsystems (or explicitly for no 1373 * If we asked for subsystems (or explicitly for no
1452 * subsystems) then they must match 1374 * subsystems) then they must match
1453 */ 1375 */
1454 if ((opts->subsys_mask || opts->none) 1376 if ((opts->subsys_bits || opts->none)
1455 && (opts->subsys_mask != root->subsys_mask)) 1377 && (opts->subsys_bits != root->subsys_bits))
1456 return 0; 1378 return 0;
1457 1379
1458 return 1; 1380 return 1;
@@ -1462,7 +1384,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1462{ 1384{
1463 struct cgroupfs_root *root; 1385 struct cgroupfs_root *root;
1464 1386
1465 if (!opts->subsys_mask && !opts->none) 1387 if (!opts->subsys_bits && !opts->none)
1466 return NULL; 1388 return NULL;
1467 1389
1468 root = kzalloc(sizeof(*root), GFP_KERNEL); 1390 root = kzalloc(sizeof(*root), GFP_KERNEL);
@@ -1475,15 +1397,14 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1475 } 1397 }
1476 init_cgroup_root(root); 1398 init_cgroup_root(root);
1477 1399
1478 root->subsys_mask = opts->subsys_mask; 1400 root->subsys_bits = opts->subsys_bits;
1479 root->flags = opts->flags; 1401 root->flags = opts->flags;
1480 ida_init(&root->cgroup_ida);
1481 if (opts->release_agent) 1402 if (opts->release_agent)
1482 strcpy(root->release_agent_path, opts->release_agent); 1403 strcpy(root->release_agent_path, opts->release_agent);
1483 if (opts->name) 1404 if (opts->name)
1484 strcpy(root->name, opts->name); 1405 strcpy(root->name, opts->name);
1485 if (opts->cpuset_clone_children) 1406 if (opts->clone_children)
1486 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1407 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
1487 return root; 1408 return root;
1488} 1409}
1489 1410
@@ -1496,7 +1417,6 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1496 spin_lock(&hierarchy_id_lock); 1417 spin_lock(&hierarchy_id_lock);
1497 ida_remove(&hierarchy_ida, root->hierarchy_id); 1418 ida_remove(&hierarchy_ida, root->hierarchy_id);
1498 spin_unlock(&hierarchy_id_lock); 1419 spin_unlock(&hierarchy_id_lock);
1499 ida_destroy(&root->cgroup_ida);
1500 kfree(root); 1420 kfree(root);
1501} 1421}
1502 1422
@@ -1509,7 +1429,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
1509 if (!opts->new_root) 1429 if (!opts->new_root)
1510 return -EINVAL; 1430 return -EINVAL;
1511 1431
1512 BUG_ON(!opts->subsys_mask && !opts->none); 1432 BUG_ON(!opts->subsys_bits && !opts->none);
1513 1433
1514 ret = set_anon_super(sb, NULL); 1434 ret = set_anon_super(sb, NULL);
1515 if (ret) 1435 if (ret)
@@ -1535,6 +1455,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1535 1455
1536 struct inode *inode = 1456 struct inode *inode =
1537 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1458 struct dentry *dentry;
1538 1459
1539 if (!inode) 1460 if (!inode)
1540 return -ENOMEM; 1461 return -ENOMEM;
@@ -1543,9 +1464,12 @@ static int cgroup_get_rootdir(struct super_block *sb)
1543 inode->i_op = &cgroup_dir_inode_operations; 1464 inode->i_op = &cgroup_dir_inode_operations;
1544 /* directories start off with i_nlink == 2 (for "." entry) */ 1465 /* directories start off with i_nlink == 2 (for "." entry) */
1545 inc_nlink(inode); 1466 inc_nlink(inode);
1546 sb->s_root = d_make_root(inode); 1467 dentry = d_alloc_root(inode);
1547 if (!sb->s_root) 1468 if (!dentry) {
1469 iput(inode);
1548 return -ENOMEM; 1470 return -ENOMEM;
1471 }
1472 sb->s_root = dentry;
1549 /* for everything else we want ->d_op set */ 1473 /* for everything else we want ->d_op set */
1550 sb->s_d_op = &cgroup_dops; 1474 sb->s_d_op = &cgroup_dops;
1551 return 0; 1475 return 0;
@@ -1560,7 +1484,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1560 int ret = 0; 1484 int ret = 0;
1561 struct super_block *sb; 1485 struct super_block *sb;
1562 struct cgroupfs_root *new_root; 1486 struct cgroupfs_root *new_root;
1563 struct inode *inode;
1564 1487
1565 /* First find the desired set of subsystems */ 1488 /* First find the desired set of subsystems */
1566 mutex_lock(&cgroup_mutex); 1489 mutex_lock(&cgroup_mutex);
@@ -1581,7 +1504,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1581 opts.new_root = new_root; 1504 opts.new_root = new_root;
1582 1505
1583 /* Locate an existing or new sb for this hierarchy */ 1506 /* Locate an existing or new sb for this hierarchy */
1584 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1585 if (IS_ERR(sb)) { 1508 if (IS_ERR(sb)) {
1586 ret = PTR_ERR(sb); 1509 ret = PTR_ERR(sb);
1587 cgroup_drop_root(opts.new_root); 1510 cgroup_drop_root(opts.new_root);
@@ -1594,6 +1517,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1594 /* We used the new root structure, so this is a new hierarchy */ 1517 /* We used the new root structure, so this is a new hierarchy */
1595 struct list_head tmp_cg_links; 1518 struct list_head tmp_cg_links;
1596 struct cgroup *root_cgrp = &root->top_cgroup; 1519 struct cgroup *root_cgrp = &root->top_cgroup;
1520 struct inode *inode;
1597 struct cgroupfs_root *existing_root; 1521 struct cgroupfs_root *existing_root;
1598 const struct cred *cred; 1522 const struct cred *cred;
1599 int i; 1523 int i;
@@ -1607,14 +1531,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1607 1531
1608 mutex_lock(&inode->i_mutex); 1532 mutex_lock(&inode->i_mutex);
1609 mutex_lock(&cgroup_mutex); 1533 mutex_lock(&cgroup_mutex);
1610 mutex_lock(&cgroup_root_mutex);
1611 1534
1612 /* Check for name clashes with existing mounts */ 1535 if (strlen(root->name)) {
1613 ret = -EBUSY; 1536 /* Check for name clashes with existing mounts */
1614 if (strlen(root->name)) 1537 for_each_active_root(existing_root) {
1615 for_each_active_root(existing_root) 1538 if (!strcmp(existing_root->name, root->name)) {
1616 if (!strcmp(existing_root->name, root->name)) 1539 ret = -EBUSY;
1617 goto unlock_drop; 1540 mutex_unlock(&cgroup_mutex);
1541 mutex_unlock(&inode->i_mutex);
1542 goto drop_new_super;
1543 }
1544 }
1545 }
1618 1546
1619 /* 1547 /*
1620 * We're accessing css_set_count without locking 1548 * We're accessing css_set_count without locking
@@ -1624,13 +1552,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1624 * have some link structures left over 1552 * have some link structures left over
1625 */ 1553 */
1626 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1554 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1627 if (ret) 1555 if (ret) {
1628 goto unlock_drop; 1556 mutex_unlock(&cgroup_mutex);
1557 mutex_unlock(&inode->i_mutex);
1558 goto drop_new_super;
1559 }
1629 1560
1630 ret = rebind_subsystems(root, root->subsys_mask); 1561 ret = rebind_subsystems(root, root->subsys_bits);
1631 if (ret == -EBUSY) { 1562 if (ret == -EBUSY) {
1563 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&inode->i_mutex);
1632 free_cg_links(&tmp_cg_links); 1565 free_cg_links(&tmp_cg_links);
1633 goto unlock_drop; 1566 goto drop_new_super;
1634 } 1567 }
1635 /* 1568 /*
1636 * There must be no failure case after here, since rebinding 1569 * There must be no failure case after here, since rebinding
@@ -1662,13 +1595,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1662 1595
1663 free_cg_links(&tmp_cg_links); 1596 free_cg_links(&tmp_cg_links);
1664 1597
1598 BUG_ON(!list_empty(&root_cgrp->sibling));
1665 BUG_ON(!list_empty(&root_cgrp->children)); 1599 BUG_ON(!list_empty(&root_cgrp->children));
1666 BUG_ON(root->number_of_cgroups != 1); 1600 BUG_ON(root->number_of_cgroups != 1);
1667 1601
1668 cred = override_creds(&init_cred); 1602 cred = override_creds(&init_cred);
1669 cgroup_populate_dir(root_cgrp, true, root->subsys_mask); 1603 cgroup_populate_dir(root_cgrp);
1670 revert_creds(cred); 1604 revert_creds(cred);
1671 mutex_unlock(&cgroup_root_mutex);
1672 mutex_unlock(&cgroup_mutex); 1605 mutex_unlock(&cgroup_mutex);
1673 mutex_unlock(&inode->i_mutex); 1606 mutex_unlock(&inode->i_mutex);
1674 } else { 1607 } else {
@@ -1678,21 +1611,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1678 */ 1611 */
1679 cgroup_drop_root(opts.new_root); 1612 cgroup_drop_root(opts.new_root);
1680 /* no subsys rebinding, so refcounts don't change */ 1613 /* no subsys rebinding, so refcounts don't change */
1681 drop_parsed_module_refcounts(opts.subsys_mask); 1614 drop_parsed_module_refcounts(opts.subsys_bits);
1682 } 1615 }
1683 1616
1684 kfree(opts.release_agent); 1617 kfree(opts.release_agent);
1685 kfree(opts.name); 1618 kfree(opts.name);
1686 return dget(sb->s_root); 1619 return dget(sb->s_root);
1687 1620
1688 unlock_drop:
1689 mutex_unlock(&cgroup_root_mutex);
1690 mutex_unlock(&cgroup_mutex);
1691 mutex_unlock(&inode->i_mutex);
1692 drop_new_super: 1621 drop_new_super:
1693 deactivate_locked_super(sb); 1622 deactivate_locked_super(sb);
1694 drop_modules: 1623 drop_modules:
1695 drop_parsed_module_refcounts(opts.subsys_mask); 1624 drop_parsed_module_refcounts(opts.subsys_bits);
1696 out_err: 1625 out_err:
1697 kfree(opts.release_agent); 1626 kfree(opts.release_agent);
1698 kfree(opts.name); 1627 kfree(opts.name);
@@ -1710,9 +1639,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1710 1639
1711 BUG_ON(root->number_of_cgroups != 1); 1640 BUG_ON(root->number_of_cgroups != 1);
1712 BUG_ON(!list_empty(&cgrp->children)); 1641 BUG_ON(!list_empty(&cgrp->children));
1642 BUG_ON(!list_empty(&cgrp->sibling));
1713 1643
1714 mutex_lock(&cgroup_mutex); 1644 mutex_lock(&cgroup_mutex);
1715 mutex_lock(&cgroup_root_mutex);
1716 1645
1717 /* Rebind all subsystems back to the default hierarchy */ 1646 /* Rebind all subsystems back to the default hierarchy */
1718 ret = rebind_subsystems(root, 0); 1647 ret = rebind_subsystems(root, 0);
@@ -1738,11 +1667,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1738 root_count--; 1667 root_count--;
1739 } 1668 }
1740 1669
1741 mutex_unlock(&cgroup_root_mutex);
1742 mutex_unlock(&cgroup_mutex); 1670 mutex_unlock(&cgroup_mutex);
1743 1671
1744 simple_xattrs_free(&cgrp->xattrs);
1745
1746 kill_litter_super(sb); 1672 kill_litter_super(sb);
1747 cgroup_drop_root(root); 1673 cgroup_drop_root(root);
1748} 1674}
@@ -1755,6 +1681,16 @@ static struct file_system_type cgroup_fs_type = {
1755 1681
1756static struct kobject *cgroup_kobj; 1682static struct kobject *cgroup_kobj;
1757 1683
1684static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1685{
1686 return dentry->d_fsdata;
1687}
1688
1689static inline struct cftype *__d_cft(struct dentry *dentry)
1690{
1691 return dentry->d_fsdata;
1692}
1693
1758/** 1694/**
1759 * cgroup_path - generate the path of a cgroup 1695 * cgroup_path - generate the path of a cgroup
1760 * @cgrp: the cgroup in question 1696 * @cgrp: the cgroup in question
@@ -1767,11 +1703,9 @@ static struct kobject *cgroup_kobj;
1767 */ 1703 */
1768int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1704int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1769{ 1705{
1770 struct dentry *dentry = cgrp->dentry;
1771 char *start; 1706 char *start;
1772 1707 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1708 cgroup_lock_is_held());
1774 "cgroup_path() called without proper locking");
1775 1709
1776 if (!dentry || cgrp == dummytop) { 1710 if (!dentry || cgrp == dummytop) {
1777 /* 1711 /*
@@ -1782,9 +1716,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1782 return 0; 1716 return 0;
1783 } 1717 }
1784 1718
1785 start = buf + buflen - 1; 1719 start = buf + buflen;
1786 1720
1787 *start = '\0'; 1721 *--start = '\0';
1788 for (;;) { 1722 for (;;) {
1789 int len = dentry->d_name.len; 1723 int len = dentry->d_name.len;
1790 1724
@@ -1795,7 +1729,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1795 if (!cgrp) 1729 if (!cgrp)
1796 break; 1730 break;
1797 1731
1798 dentry = cgrp->dentry; 1732 dentry = rcu_dereference_check(cgrp->dentry,
1733 cgroup_lock_is_held());
1799 if (!cgrp->parent) 1734 if (!cgrp->parent)
1800 continue; 1735 continue;
1801 if (--start < buf) 1736 if (--start < buf)
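cgroup_path() fills its buffer from the tail: place the terminating NUL at the end, then prepend "/name" components while walking toward the root, so no intermediate copies are needed. The same strategy on a plain parent-linked structure, as a runnable sketch; node names are made up, and -1 stands in for the kernel's -ENAMETOOLONG.

#include <stdio.h>
#include <string.h>

struct node {
        const char *name;
        struct node *parent;
};

static int build_path(const struct node *n, char *buf, int buflen)
{
        char *start = buf + buflen - 1;

        if (!n->parent) {                       /* root: just "/" */
                snprintf(buf, buflen, "/");
                return 0;
        }

        *start = '\0';
        for (; n && n->parent; n = n->parent) {
                int len = strlen(n->name);

                start -= len;
                if (start < buf + 1)            /* +1 for the leading '/' */
                        return -1;
                memcpy(start, n->name, len);
                *--start = '/';
        }
        memmove(buf, start, strlen(start) + 1); /* shift result to the front */
        return 0;
}

int main(void)
{
        struct node root = { "", NULL };
        struct node a = { "foo", &root };
        struct node b = { "bar", &a };
        char buf[64];

        if (build_path(&b, buf, sizeof(buf)) == 0)
                printf("%s\n", buf);            /* prints /foo/bar */
        return 0;
}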
@@ -1808,104 +1743,55 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1808EXPORT_SYMBOL_GPL(cgroup_path); 1743EXPORT_SYMBOL_GPL(cgroup_path);
1809 1744
1810/* 1745/*
1811 * Control Group taskset
1812 */
1813struct task_and_cgroup {
1814 struct task_struct *task;
1815 struct cgroup *cgrp;
1816 struct css_set *cg;
1817};
1818
1819struct cgroup_taskset {
1820 struct task_and_cgroup single;
1821 struct flex_array *tc_array;
1822 int tc_array_len;
1823 int idx;
1824 struct cgroup *cur_cgrp;
1825};
1826
1827/**
1828 * cgroup_taskset_first - reset taskset and return the first task
1829 * @tset: taskset of interest
1830 *
1831 * @tset iteration is initialized and the first task is returned.
1832 */
1833struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1834{
1835 if (tset->tc_array) {
1836 tset->idx = 0;
1837 return cgroup_taskset_next(tset);
1838 } else {
1839 tset->cur_cgrp = tset->single.cgrp;
1840 return tset->single.task;
1841 }
1842}
1843EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1844
1845/**
1846 * cgroup_taskset_next - iterate to the next task in taskset
1847 * @tset: taskset of interest
1848 *
1849 * Return the next task in @tset. Iteration must have been initialized
1850 * with cgroup_taskset_first().
1851 */
1852struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1853{
1854 struct task_and_cgroup *tc;
1855
1856 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1857 return NULL;
1858
1859 tc = flex_array_get(tset->tc_array, tset->idx++);
1860 tset->cur_cgrp = tc->cgrp;
1861 return tc->task;
1862}
1863EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1864
1865/**
1866 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1867 * @tset: taskset of interest
1868 *
1869 * Return the cgroup for the current (last returned) task of @tset. This
1870 * function must be preceded by either cgroup_taskset_first() or
1871 * cgroup_taskset_next().
1872 */
1873struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1874{
1875 return tset->cur_cgrp;
1876}
1877EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1878
1879/**
1880 * cgroup_taskset_size - return the number of tasks in taskset
1881 * @tset: taskset of interest
1882 */
1883int cgroup_taskset_size(struct cgroup_taskset *tset)
1884{
1885 return tset->tc_array ? tset->tc_array_len : 1;
1886}
1887EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1888
1889
1890/*
1891 * cgroup_task_migrate - move a task from one cgroup to another. 1746 * cgroup_task_migrate - move a task from one cgroup to another.
1892 * 1747 *
1893 * Must be called with cgroup_mutex and threadgroup locked. 1748 * 'guarantee' is set if the caller promises that a new css_set for the task
1749 * will already exist. If not set, this function might sleep, and can fail with
1750 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1894 */ 1751 */
1895static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1752static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1896 struct task_struct *tsk, struct css_set *newcg) 1753 struct task_struct *tsk, bool guarantee)
1897{ 1754{
1898 struct css_set *oldcg; 1755 struct css_set *oldcg;
1756 struct css_set *newcg;
1899 1757
1900 /* 1758 /*
1901 * We are synchronized through threadgroup_lock() against PF_EXITING 1759 * get old css_set. we need to take task_lock and refcount it, because
1902 * setting such that we can't race against cgroup_exit() changing the 1760 * an exiting task can change its css_set to init_css_set and drop its
1903 * css_set to init_css_set and dropping the old one. 1761 * old one without taking cgroup_mutex.
1904 */ 1762 */
1905 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1763 task_lock(tsk);
1906 oldcg = tsk->cgroups; 1764 oldcg = tsk->cgroups;
1765 get_css_set(oldcg);
1766 task_unlock(tsk);
1767
1768 /* locate or allocate a new css_set for this task. */
1769 if (guarantee) {
1770 /* we know the css_set we want already exists. */
1771 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1772 read_lock(&css_set_lock);
1773 newcg = find_existing_css_set(oldcg, cgrp, template);
1774 BUG_ON(!newcg);
1775 get_css_set(newcg);
1776 read_unlock(&css_set_lock);
1777 } else {
1778 might_sleep();
1779 /* find_css_set will give us newcg already referenced. */
1780 newcg = find_css_set(oldcg, cgrp);
1781 if (!newcg) {
1782 put_css_set(oldcg);
1783 return -ENOMEM;
1784 }
1785 }
1786 put_css_set(oldcg);
1907 1787
1788 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1908 task_lock(tsk); 1789 task_lock(tsk);
1790 if (tsk->flags & PF_EXITING) {
1791 task_unlock(tsk);
1792 put_css_set(newcg);
1793 return -ESRCH;
1794 }
1909 rcu_assign_pointer(tsk->cgroups, newcg); 1795 rcu_assign_pointer(tsk->cgroups, newcg);
1910 task_unlock(tsk); 1796 task_unlock(tsk);
1911 1797
@@ -1920,8 +1806,10 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1920 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1806 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1921 * it here; it will be freed under RCU. 1807 * it here; it will be freed under RCU.
1922 */ 1808 */
1923 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1924 put_css_set(oldcg); 1809 put_css_set(oldcg);
1810
1811 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1812 return 0;
1925} 1813}
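
cgroup_task_migrate() in both variants follows the same publish-then-release discipline: obtain the replacement object first (the only step that can fail), switch the task's pointer while holding the task lock, and only afterwards drop the reference to the old object. Below is a deliberately simplified, lock-only analog; struct config, config_switch() and the single global pointer are invented for illustration, and the real code relies on RCU plus refcounted css_sets rather than a reader-side mutex.

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    struct config {
            char name[32];
    };

    static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct config *live_cfg;         /* readers dereference only under cfg_lock */

    int config_switch(const char *name)
    {
            struct config *newc, *oldc;

            newc = calloc(1, sizeof(*newc));        /* the step that can fail ... */
            if (!newc)
                    return -1;                      /* ... analogous to the -ENOMEM path */
            strncpy(newc->name, name, sizeof(newc->name) - 1);

            pthread_mutex_lock(&cfg_lock);
            oldc = live_cfg;
            live_cfg = newc;                        /* commit point: cannot fail */
            pthread_mutex_unlock(&cfg_lock);

            /*
             * Safe only because readers touch live_cfg inside the lock: once the
             * pointer is swapped, no new critical section can see oldc. The kernel
             * instead defers the free with RCU so readers need no lock at all.
             */
            free(oldc);
            return 0;
    }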
1926 1814
1927/** 1815/**
@@ -1929,33 +1817,25 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1929 * @cgrp: the cgroup the task is attaching to 1817 * @cgrp: the cgroup the task is attaching to
1930 * @tsk: the task to be attached 1818 * @tsk: the task to be attached
1931 * 1819 *
1932 * Call with cgroup_mutex and threadgroup locked. May take task_lock of 1820 * Call holding cgroup_mutex. May take task_lock of
1933 * @tsk during call. 1821 * the task 'tsk' during call.
1934 */ 1822 */
1935int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1823int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1936{ 1824{
1937 int retval = 0; 1825 int retval;
1938 struct cgroup_subsys *ss, *failed_ss = NULL; 1826 struct cgroup_subsys *ss, *failed_ss = NULL;
1939 struct cgroup *oldcgrp; 1827 struct cgroup *oldcgrp;
1940 struct cgroupfs_root *root = cgrp->root; 1828 struct cgroupfs_root *root = cgrp->root;
1941 struct cgroup_taskset tset = { }; 1829 struct css_set *cg;
1942 struct css_set *newcg;
1943
1944 /* @tsk either already exited or can't exit until the end */
1945 if (tsk->flags & PF_EXITING)
1946 return -ESRCH;
1947 1830
1948 /* Nothing to do if the task is already in that cgroup */ 1831 /* Nothing to do if the task is already in that cgroup */
1949 oldcgrp = task_cgroup_from_root(tsk, root); 1832 oldcgrp = task_cgroup_from_root(tsk, root);
1950 if (cgrp == oldcgrp) 1833 if (cgrp == oldcgrp)
1951 return 0; 1834 return 0;
1952 1835
1953 tset.single.task = tsk;
1954 tset.single.cgrp = oldcgrp;
1955
1956 for_each_subsys(root, ss) { 1836 for_each_subsys(root, ss) {
1957 if (ss->can_attach) { 1837 if (ss->can_attach) {
1958 retval = ss->can_attach(cgrp, &tset); 1838 retval = ss->can_attach(ss, cgrp, tsk);
1959 if (retval) { 1839 if (retval) {
1960 /* 1840 /*
1961 * Remember on which subsystem the can_attach() 1841 * Remember on which subsystem the can_attach()
@@ -1967,22 +1847,41 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1967 goto out; 1847 goto out;
1968 } 1848 }
1969 } 1849 }
1850 if (ss->can_attach_task) {
1851 retval = ss->can_attach_task(cgrp, tsk);
1852 if (retval) {
1853 failed_ss = ss;
1854 goto out;
1855 }
1856 }
1970 } 1857 }
1971 1858
1972 newcg = find_css_set(tsk->cgroups, cgrp); 1859 task_lock(tsk);
1973 if (!newcg) { 1860 cg = tsk->cgroups;
1974 retval = -ENOMEM; 1861 get_css_set(cg);
1975 goto out; 1862 task_unlock(tsk);
1976 }
1977 1863
1978 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); 1864 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1865 if (retval)
1866 goto out;
1979 1867
1980 for_each_subsys(root, ss) { 1868 for_each_subsys(root, ss) {
1869 if (ss->pre_attach)
1870 ss->pre_attach(cgrp);
1871 if (ss->attach_task)
1872 ss->attach_task(cgrp, tsk);
1981 if (ss->attach) 1873 if (ss->attach)
1982 ss->attach(cgrp, &tset); 1874 ss->attach(ss, cgrp, oldcgrp, tsk);
1983 } 1875 }
1876 set_bit(CGRP_RELEASABLE, &cgrp->flags);
1877 /* put_css_set will not destroy cg until after an RCU grace period */
1878 put_css_set(cg);
1984 1879
1985 synchronize_rcu(); 1880 /*
1881 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1882 * is no longer empty.
1883 */
1884 cgroup_wakeup_rmdir_waiter(cgrp);
1986out: 1885out:
1987 if (retval) { 1886 if (retval) {
1988 for_each_subsys(root, ss) { 1887 for_each_subsys(root, ss) {
@@ -1995,7 +1894,7 @@ out:
1995 */ 1894 */
1996 break; 1895 break;
1997 if (ss->cancel_attach) 1896 if (ss->cancel_attach)
1998 ss->cancel_attach(cgrp, &tset); 1897 ss->cancel_attach(ss, cgrp, tsk);
1999 } 1898 }
2000 } 1899 }
2001 return retval; 1900 return retval;
@@ -2025,36 +1924,111 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2025} 1924}
2026EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1925EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2027 1926
1927/*
1928 * cgroup_attach_proc works in two stages, the first of which prefetches all
1929 * new css_sets needed (to make sure we have enough memory before committing
1930 * to the move) and stores them in a list of entries of the following type.
1931 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1932 */
1933struct cg_list_entry {
1934 struct css_set *cg;
1935 struct list_head links;
1936};
1937
1938static bool css_set_check_fetched(struct cgroup *cgrp,
1939 struct task_struct *tsk, struct css_set *cg,
1940 struct list_head *newcg_list)
1941{
1942 struct css_set *newcg;
1943 struct cg_list_entry *cg_entry;
1944 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1945
1946 read_lock(&css_set_lock);
1947 newcg = find_existing_css_set(cg, cgrp, template);
1948 if (newcg)
1949 get_css_set(newcg);
1950 read_unlock(&css_set_lock);
1951
1952 /* doesn't exist at all? */
1953 if (!newcg)
1954 return false;
1955 /* see if it's already in the list */
1956 list_for_each_entry(cg_entry, newcg_list, links) {
1957 if (cg_entry->cg == newcg) {
1958 put_css_set(newcg);
1959 return true;
1960 }
1961 }
1962
1963 /* not found */
1964 put_css_set(newcg);
1965 return false;
1966}
1967
1968/*
1969 * Find the new css_set and store it in the list in preparation for moving the
1970 * given task to the given cgroup. Returns 0 or -ENOMEM.
1971 */
1972static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1973 struct list_head *newcg_list)
1974{
1975 struct css_set *newcg;
1976 struct cg_list_entry *cg_entry;
1977
1978 /* ensure a new css_set will exist for this thread */
1979 newcg = find_css_set(cg, cgrp);
1980 if (!newcg)
1981 return -ENOMEM;
1982 /* add it to the list */
1983 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1984 if (!cg_entry) {
1985 put_css_set(newcg);
1986 return -ENOMEM;
1987 }
1988 cg_entry->cg = newcg;
1989 list_add(&cg_entry->links, newcg_list);
1990 return 0;
1991}
1992
2028/** 1993/**
2029 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1994 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2030 * @cgrp: the cgroup to attach to 1995 * @cgrp: the cgroup to attach to
2031 * @leader: the threadgroup leader task_struct of the group to be attached 1996 * @leader: the threadgroup leader task_struct of the group to be attached
2032 * 1997 *
2033 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1998 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
2034 * task_lock of each thread in leader's threadgroup individually in turn. 1999 * take task_lock of each thread in leader's threadgroup individually in turn.
2035 */ 2000 */
2036static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2001int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2037{ 2002{
2038 int retval, i, group_size; 2003 int retval, i, group_size;
2039 struct cgroup_subsys *ss, *failed_ss = NULL; 2004 struct cgroup_subsys *ss, *failed_ss = NULL;
2005 bool cancel_failed_ss = false;
2040 /* guaranteed to be initialized later, but the compiler needs this */ 2006 /* guaranteed to be initialized later, but the compiler needs this */
2007 struct cgroup *oldcgrp = NULL;
2008 struct css_set *oldcg;
2041 struct cgroupfs_root *root = cgrp->root; 2009 struct cgroupfs_root *root = cgrp->root;
2042 /* threadgroup list cursor and array */ 2010 /* threadgroup list cursor and array */
2043 struct task_struct *tsk; 2011 struct task_struct *tsk;
2044 struct task_and_cgroup *tc;
2045 struct flex_array *group; 2012 struct flex_array *group;
2046 struct cgroup_taskset tset = { }; 2013 /*
2014 * we need to make sure we have css_sets for all the tasks we're
2015 * going to move -before- we actually start moving them, so that in
2016 * case we get an ENOMEM we can bail out before making any changes.
2017 */
2018 struct list_head newcg_list;
2019 struct cg_list_entry *cg_entry, *temp_nobe;
2047 2020
2048 /* 2021 /*
2049 * step 0: in order to do expensive, possibly blocking operations for 2022 * step 0: in order to do expensive, possibly blocking operations for
2050 * every thread, we cannot iterate the thread group list, since it needs 2023 * every thread, we cannot iterate the thread group list, since it needs
2051 * rcu or tasklist locked. instead, build an array of all threads in the 2024 * rcu or tasklist locked. instead, build an array of all threads in the
2052 * group - group_rwsem prevents new threads from appearing, and if 2025 * group - threadgroup_fork_lock prevents new threads from appearing,
2053 * threads exit, this will just be an over-estimate. 2026 * and if threads exit, this will just be an over-estimate.
2054 */ 2027 */
2055 group_size = get_nr_threads(leader); 2028 group_size = get_nr_threads(leader);
2056 /* flex_array supports very large thread-groups better than kmalloc. */ 2029 /* flex_array supports very large thread-groups better than kmalloc. */
2057 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 2030 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2031 GFP_KERNEL);
2058 if (!group) 2032 if (!group)
2059 return -ENOMEM; 2033 return -ENOMEM;
2060 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2034 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2062,124 +2036,189 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2062 if (retval) 2036 if (retval)
2063 goto out_free_group_list; 2037 goto out_free_group_list;
2064 2038
2039 /* prevent changes to the threadgroup list while we take a snapshot. */
2040 rcu_read_lock();
2041 if (!thread_group_leader(leader)) {
2042 /*
2043 * a race with de_thread from another thread's exec() may strip
2044 * us of our leadership, making while_each_thread unsafe to use
2045 * on this task. if this happens, there is no choice but to
2046 * throw this task away and try again (from cgroup_procs_write);
2047 * this is "double-double-toil-and-trouble-check locking".
2048 */
2049 rcu_read_unlock();
2050 retval = -EAGAIN;
2051 goto out_free_group_list;
2052 }
2053 /* take a reference on each task in the group to go in the array. */
2065 tsk = leader; 2054 tsk = leader;
2066 i = 0; 2055 i = 0;
2067 /*
2068 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2069 * already PF_EXITING could be freed from underneath us unless we
2070 * take an rcu_read_lock.
2071 */
2072 rcu_read_lock();
2073 do { 2056 do {
2074 struct task_and_cgroup ent;
2075
2076 /* @tsk either already exited or can't exit until the end */
2077 if (tsk->flags & PF_EXITING)
2078 continue;
2079
2080 /* as per above, nr_threads may decrease, but not increase. */ 2057 /* as per above, nr_threads may decrease, but not increase. */
2081 BUG_ON(i >= group_size); 2058 BUG_ON(i >= group_size);
2082 ent.task = tsk; 2059 get_task_struct(tsk);
2083 ent.cgrp = task_cgroup_from_root(tsk, root);
2084 /* nothing to do if this task is already in the cgroup */
2085 if (ent.cgrp == cgrp)
2086 continue;
2087 /* 2060 /*
2088 * saying GFP_ATOMIC has no effect here because we did prealloc 2061 * saying GFP_ATOMIC has no effect here because we did prealloc
2089 * earlier, but it's good form to communicate our expectations. 2062 * earlier, but it's good form to communicate our expectations.
2090 */ 2063 */
2091 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2064 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2092 BUG_ON(retval != 0); 2065 BUG_ON(retval != 0);
2093 i++; 2066 i++;
2094 } while_each_thread(leader, tsk); 2067 } while_each_thread(leader, tsk);
2095 rcu_read_unlock();
2096 /* remember the number of threads in the array for later. */ 2068 /* remember the number of threads in the array for later. */
2097 group_size = i; 2069 group_size = i;
2098 tset.tc_array = group; 2070 rcu_read_unlock();
2099 tset.tc_array_len = group_size;
2100
2101 /* methods shouldn't be called if no task is actually migrating */
2102 retval = 0;
2103 if (!group_size)
2104 goto out_free_group_list;
2105 2071
2106 /* 2072 /*
2107 * step 1: check that we can legitimately attach to the cgroup. 2073 * step 1: check that we can legitimately attach to the cgroup.
2108 */ 2074 */
2109 for_each_subsys(root, ss) { 2075 for_each_subsys(root, ss) {
2110 if (ss->can_attach) { 2076 if (ss->can_attach) {
2111 retval = ss->can_attach(cgrp, &tset); 2077 retval = ss->can_attach(ss, cgrp, leader);
2112 if (retval) { 2078 if (retval) {
2113 failed_ss = ss; 2079 failed_ss = ss;
2114 goto out_cancel_attach; 2080 goto out_cancel_attach;
2115 } 2081 }
2116 } 2082 }
2083 /* a callback to be run on every thread in the threadgroup. */
2084 if (ss->can_attach_task) {
2085 /* run on each task in the threadgroup. */
2086 for (i = 0; i < group_size; i++) {
2087 tsk = flex_array_get_ptr(group, i);
2088 retval = ss->can_attach_task(cgrp, tsk);
2089 if (retval) {
2090 failed_ss = ss;
2091 cancel_failed_ss = true;
2092 goto out_cancel_attach;
2093 }
2094 }
2095 }
2117 } 2096 }
2118 2097
2119 /* 2098 /*
2120 * step 2: make sure css_sets exist for all threads to be migrated. 2099 * step 2: make sure css_sets exist for all threads to be migrated.
2121 * we use find_css_set, which allocates a new one if necessary. 2100 * we use find_css_set, which allocates a new one if necessary.
2122 */ 2101 */
2102 INIT_LIST_HEAD(&newcg_list);
2123 for (i = 0; i < group_size; i++) { 2103 for (i = 0; i < group_size; i++) {
2124 tc = flex_array_get(group, i); 2104 tsk = flex_array_get_ptr(group, i);
2125 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2105 /* nothing to do if this task is already in the cgroup */
2126 if (!tc->cg) { 2106 oldcgrp = task_cgroup_from_root(tsk, root);
2127 retval = -ENOMEM; 2107 if (cgrp == oldcgrp)
2128 goto out_put_css_set_refs; 2108 continue;
2109 /* get old css_set pointer */
2110 task_lock(tsk);
2111 oldcg = tsk->cgroups;
2112 get_css_set(oldcg);
2113 task_unlock(tsk);
2114 /* see if the new one for us is already in the list? */
2115 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2116 /* was already there, nothing to do. */
2117 put_css_set(oldcg);
2118 } else {
2119 /* we don't already have it. get new one. */
2120 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2121 put_css_set(oldcg);
2122 if (retval)
2123 goto out_list_teardown;
2129 } 2124 }
2130 } 2125 }
2131 2126
2132 /* 2127 /*
2133 * step 3: now that we're guaranteed success wrt the css_sets, 2128 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2134 * proceed to move all tasks to the new cgroup. There are no 2129 * to move all tasks to the new cgroup, calling ss->attach_task for each
2135 * failure cases after here, so this is the commit point. 2130 * one along the way. there are no failure cases after here, so this is
2131 * the commit point.
2136 */ 2132 */
2133 for_each_subsys(root, ss) {
2134 if (ss->pre_attach)
2135 ss->pre_attach(cgrp);
2136 }
2137 for (i = 0; i < group_size; i++) { 2137 for (i = 0; i < group_size; i++) {
2138 tc = flex_array_get(group, i); 2138 tsk = flex_array_get_ptr(group, i);
2139 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2139 /* leave current thread as it is if it's already there */
2140 oldcgrp = task_cgroup_from_root(tsk, root);
2141 if (cgrp == oldcgrp)
2142 continue;
2143 /* attach each task to each subsystem */
2144 for_each_subsys(root, ss) {
2145 if (ss->attach_task)
2146 ss->attach_task(cgrp, tsk);
2147 }
2148 /* if the thread is PF_EXITING, it can just get skipped. */
2149 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2150 BUG_ON(retval != 0 && retval != -ESRCH);
2140 } 2151 }
2141 /* nothing is sensitive to fork() after this point. */ 2152 /* nothing is sensitive to fork() after this point. */
2142 2153
2143 /* 2154 /*
2144 * step 4: do subsystem attach callbacks. 2155 * step 4: do expensive, non-thread-specific subsystem callbacks.
2156 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2157 * being moved, this call will need to be reworked to communicate that.
2145 */ 2158 */
2146 for_each_subsys(root, ss) { 2159 for_each_subsys(root, ss) {
2147 if (ss->attach) 2160 if (ss->attach)
2148 ss->attach(cgrp, &tset); 2161 ss->attach(ss, cgrp, oldcgrp, leader);
2149 } 2162 }
2150 2163
2151 /* 2164 /*
2152 * step 5: success! and cleanup 2165 * step 5: success! and cleanup
2153 */ 2166 */
2154 synchronize_rcu(); 2167 synchronize_rcu();
2168 cgroup_wakeup_rmdir_waiter(cgrp);
2155 retval = 0; 2169 retval = 0;
2156out_put_css_set_refs: 2170out_list_teardown:
2157 if (retval) { 2171 /* clean up the list of prefetched css_sets. */
2158 for (i = 0; i < group_size; i++) { 2172 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2159 tc = flex_array_get(group, i); 2173 list_del(&cg_entry->links);
2160 if (!tc->cg) 2174 put_css_set(cg_entry->cg);
2161 break; 2175 kfree(cg_entry);
2162 put_css_set(tc->cg);
2163 }
2164 } 2176 }
2165out_cancel_attach: 2177out_cancel_attach:
2178 /* same deal as in cgroup_attach_task */
2166 if (retval) { 2179 if (retval) {
2167 for_each_subsys(root, ss) { 2180 for_each_subsys(root, ss) {
2168 if (ss == failed_ss) 2181 if (ss == failed_ss) {
2182 if (cancel_failed_ss && ss->cancel_attach)
2183 ss->cancel_attach(ss, cgrp, leader);
2169 break; 2184 break;
2185 }
2170 if (ss->cancel_attach) 2186 if (ss->cancel_attach)
2171 ss->cancel_attach(cgrp, &tset); 2187 ss->cancel_attach(ss, cgrp, leader);
2172 } 2188 }
2173 } 2189 }
2190 /* clean up the array of referenced threads in the group. */
2191 for (i = 0; i < group_size; i++) {
2192 tsk = flex_array_get_ptr(group, i);
2193 put_task_struct(tsk);
2194 }
2174out_free_group_list: 2195out_free_group_list:
2175 flex_array_free(group); 2196 flex_array_free(group);
2176 return retval; 2197 return retval;
2177} 2198}
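
cgroup_attach_proc() above is organized as a two-phase commit: steps 1 and 2 reserve everything that can fail (per-thread checks and prefetched css_sets), and from step 3 onward the loop is a pure commit with no error path. The same shape, stripped of the kernel details and with invented names (apply_all, the malloc'd scratch slots standing in for prefetched css_sets), can be sketched as:

    #include <stdlib.h>

    int apply_all(int *items, int n)
    {
            char **scratch;
            int ret = 0;
            int i;

            if (n <= 0)
                    return 0;

            scratch = calloc((size_t)n, sizeof(*scratch));
            if (!scratch)
                    return -1;

            /* phase 1: reserve everything that can fail; bail out cleanly on error */
            for (i = 0; i < n; i++) {
                    scratch[i] = malloc(64);        /* stand-in for find_css_set() */
                    if (!scratch[i]) {
                            ret = -1;
                            goto out_free;
                    }
            }

            /* phase 2: commit point -- nothing below this line can fail */
            for (i = 0; i < n; i++)
                    items[i] += 1;                  /* stand-in for the per-task migrate */

    out_free:
            for (i = 0; i < n; i++)
                    free(scratch[i]);               /* free(NULL) is a no-op for unused slots */
            free(scratch);
            return ret;
    }

Reserving up front is what lets the commit loop tolerate only the benign -ESRCH case (an exiting task) instead of having to unwind a half-migrated thread group.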
2178 2199
2200static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
2201{
2202 struct cgroup_subsys *ss;
2203 int ret;
2204
2205 for_each_subsys(cgrp->root, ss) {
2206 if (ss->allow_attach) {
2207 ret = ss->allow_attach(cgrp, tsk);
2208 if (ret)
2209 return ret;
2210 } else {
2211 return -EACCES;
2212 }
2213 }
2214
2215 return 0;
2216}
2217
2179/* 2218/*
2180 * Find the task_struct of the task to attach by vpid and pass it along to the 2219 * Find the task_struct of the task to attach by vpid and pass it along to the
2181 * function to attach either it or all tasks in its threadgroup. Will lock 2220 * function to attach either it or all tasks in its threadgroup. Will take
2182 * cgroup_mutex and threadgroup; may take task_lock of task. 2221 * cgroup_mutex; may take task_lock of task.
2183 */ 2222 */
2184static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2223static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2185{ 2224{
@@ -2190,68 +2229,66 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2190 if (!cgroup_lock_live_group(cgrp)) 2229 if (!cgroup_lock_live_group(cgrp))
2191 return -ENODEV; 2230 return -ENODEV;
2192 2231
2193retry_find_task:
2194 rcu_read_lock();
2195 if (pid) { 2232 if (pid) {
2233 rcu_read_lock();
2196 tsk = find_task_by_vpid(pid); 2234 tsk = find_task_by_vpid(pid);
2197 if (!tsk) { 2235 if (!tsk) {
2198 rcu_read_unlock(); 2236 rcu_read_unlock();
2199 ret = -ESRCH; 2237 cgroup_unlock();
2200 goto out_unlock_cgroup; 2238 return -ESRCH;
2201 } 2239 }
2240 if (threadgroup) {
2241 /*
2242 * RCU protects this access, since tsk was found in the
2243 * tid map. a race with de_thread may cause group_leader
2244 * to stop being the leader, but cgroup_attach_proc will
2245 * detect it later.
2246 */
2247 tsk = tsk->group_leader;
2248 } else if (tsk->flags & PF_EXITING) {
2249 /* optimization for the single-task-only case */
2250 rcu_read_unlock();
2251 cgroup_unlock();
2252 return -ESRCH;
2253 }
2254
2202 /* 2255 /*
2203 * even if we're attaching all tasks in the thread group, we 2256 * even if we're attaching all tasks in the thread group, we
2204 * only need to check permissions on one of them. 2257 * only need to check permissions on one of them.
2205 */ 2258 */
2206 tcred = __task_cred(tsk); 2259 tcred = __task_cred(tsk);
2207 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2260 if (cred->euid &&
2208 !uid_eq(cred->euid, tcred->uid) && 2261 cred->euid != tcred->uid &&
2209 !uid_eq(cred->euid, tcred->suid)) { 2262 cred->euid != tcred->suid) {
2210 rcu_read_unlock(); 2263 /*
2211 ret = -EACCES; 2264 * if the default permission check fails, give each
2212 goto out_unlock_cgroup; 2265 * cgroup a chance to extend the permission check
2266 */
2267 ret = cgroup_allow_attach(cgrp, tsk);
2268 if (ret) {
2269 rcu_read_unlock();
2270 cgroup_unlock();
2271 return ret;
2272 }
2213 } 2273 }
2214 } else 2274 get_task_struct(tsk);
2215 tsk = current;
2216
2217 if (threadgroup)
2218 tsk = tsk->group_leader;
2219
2220 /*
2221 * Workqueue threads may acquire PF_THREAD_BOUND and become
2222 * trapped in a cpuset, or RT worker may be born in a cgroup
2223 * with no rt_runtime allocated. Just say no.
2224 */
2225 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
2226 ret = -EINVAL;
2227 rcu_read_unlock(); 2275 rcu_read_unlock();
2228 goto out_unlock_cgroup; 2276 } else {
2277 if (threadgroup)
2278 tsk = current->group_leader;
2279 else
2280 tsk = current;
2281 get_task_struct(tsk);
2229 } 2282 }
2230 2283
2231 get_task_struct(tsk);
2232 rcu_read_unlock();
2233
2234 threadgroup_lock(tsk);
2235 if (threadgroup) { 2284 if (threadgroup) {
2236 if (!thread_group_leader(tsk)) { 2285 threadgroup_fork_write_lock(tsk);
2237 /*
2238 * a race with de_thread from another thread's exec()
2239 * may strip us of our leadership, if this happens,
2240 * there is no choice but to throw this task away and
2241 * try again; this is
2242 * "double-double-toil-and-trouble-check locking".
2243 */
2244 threadgroup_unlock(tsk);
2245 put_task_struct(tsk);
2246 goto retry_find_task;
2247 }
2248 ret = cgroup_attach_proc(cgrp, tsk); 2286 ret = cgroup_attach_proc(cgrp, tsk);
2249 } else 2287 threadgroup_fork_write_unlock(tsk);
2288 } else {
2250 ret = cgroup_attach_task(cgrp, tsk); 2289 ret = cgroup_attach_task(cgrp, tsk);
2251 threadgroup_unlock(tsk); 2290 }
2252
2253 put_task_struct(tsk); 2291 put_task_struct(tsk);
2254out_unlock_cgroup:
2255 cgroup_unlock(); 2292 cgroup_unlock();
2256 return ret; 2293 return ret;
2257} 2294}
@@ -2263,7 +2300,16 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2263 2300
2264static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2301static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2265{ 2302{
2266 return attach_task_by_pid(cgrp, tgid, true); 2303 int ret;
2304 do {
2305 /*
2306 * attach_proc fails with -EAGAIN if threadgroup leadership
2307 * changes in the middle of the operation, in which case we need
2308 * to find the task_struct for the new leader and start over.
2309 */
2310 ret = attach_task_by_pid(cgrp, tgid, true);
2311 } while (ret == -EAGAIN);
2312 return ret;
2267} 2313}
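
The retry loop in cgroup_procs_write() treats -EAGAIN as "the thread-group leader changed under us; look it up again and retry." A generic sketch of that wrapper is below; do_attach() is a stub invented for illustration, and the max_tries bound is an addition not present in the kernel loop.

    #include <errno.h>

    /* Stub for illustration; the real callee would re-find the leader and attach. */
    static int do_attach(unsigned long tgid)
    {
            (void)tgid;
            return 0;
    }

    int attach_with_retry(unsigned long tgid, int max_tries)
    {
            int ret;

            do {
                    ret = do_attach(tgid);  /* may report -EAGAIN on a leadership race */
            } while (ret == -EAGAIN && --max_tries > 0);

            return ret;
    }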
2268 2314
2269/** 2315/**
@@ -2292,9 +2338,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2292 return -EINVAL; 2338 return -EINVAL;
2293 if (!cgroup_lock_live_group(cgrp)) 2339 if (!cgroup_lock_live_group(cgrp))
2294 return -ENODEV; 2340 return -ENODEV;
2295 mutex_lock(&cgroup_root_mutex);
2296 strcpy(cgrp->root->release_agent_path, buffer); 2341 strcpy(cgrp->root->release_agent_path, buffer);
2297 mutex_unlock(&cgroup_root_mutex);
2298 cgroup_unlock(); 2342 cgroup_unlock();
2299 return 0; 2343 return 0;
2300} 2344}
@@ -2540,64 +2584,6 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2540 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2584 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2541} 2585}
2542 2586
2543static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2544{
2545 if (S_ISDIR(dentry->d_inode->i_mode))
2546 return &__d_cgrp(dentry)->xattrs;
2547 else
2548 return &__d_cft(dentry)->xattrs;
2549}
2550
2551static inline int xattr_enabled(struct dentry *dentry)
2552{
2553 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2554 return test_bit(ROOT_XATTR, &root->flags);
2555}
2556
2557static bool is_valid_xattr(const char *name)
2558{
2559 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2560 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2561 return true;
2562 return false;
2563}
2564
2565static int cgroup_setxattr(struct dentry *dentry, const char *name,
2566 const void *val, size_t size, int flags)
2567{
2568 if (!xattr_enabled(dentry))
2569 return -EOPNOTSUPP;
2570 if (!is_valid_xattr(name))
2571 return -EINVAL;
2572 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2573}
2574
2575static int cgroup_removexattr(struct dentry *dentry, const char *name)
2576{
2577 if (!xattr_enabled(dentry))
2578 return -EOPNOTSUPP;
2579 if (!is_valid_xattr(name))
2580 return -EINVAL;
2581 return simple_xattr_remove(__d_xattrs(dentry), name);
2582}
2583
2584static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2585 void *buf, size_t size)
2586{
2587 if (!xattr_enabled(dentry))
2588 return -EOPNOTSUPP;
2589 if (!is_valid_xattr(name))
2590 return -EINVAL;
2591 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2592}
2593
2594static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2595{
2596 if (!xattr_enabled(dentry))
2597 return -EOPNOTSUPP;
2598 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2599}
2600
2601static const struct file_operations cgroup_file_operations = { 2587static const struct file_operations cgroup_file_operations = {
2602 .read = cgroup_file_read, 2588 .read = cgroup_file_read,
2603 .write = cgroup_file_write, 2589 .write = cgroup_file_write,
@@ -2606,25 +2592,14 @@ static const struct file_operations cgroup_file_operations = {
2606 .release = cgroup_file_release, 2592 .release = cgroup_file_release,
2607}; 2593};
2608 2594
2609static const struct inode_operations cgroup_file_inode_operations = {
2610 .setxattr = cgroup_setxattr,
2611 .getxattr = cgroup_getxattr,
2612 .listxattr = cgroup_listxattr,
2613 .removexattr = cgroup_removexattr,
2614};
2615
2616static const struct inode_operations cgroup_dir_inode_operations = { 2595static const struct inode_operations cgroup_dir_inode_operations = {
2617 .lookup = cgroup_lookup, 2596 .lookup = cgroup_lookup,
2618 .mkdir = cgroup_mkdir, 2597 .mkdir = cgroup_mkdir,
2619 .rmdir = cgroup_rmdir, 2598 .rmdir = cgroup_rmdir,
2620 .rename = cgroup_rename, 2599 .rename = cgroup_rename,
2621 .setxattr = cgroup_setxattr,
2622 .getxattr = cgroup_getxattr,
2623 .listxattr = cgroup_listxattr,
2624 .removexattr = cgroup_removexattr,
2625}; 2600};
2626 2601
2627static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2602static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2628{ 2603{
2629 if (dentry->d_name.len > NAME_MAX) 2604 if (dentry->d_name.len > NAME_MAX)
2630 return ERR_PTR(-ENAMETOOLONG); 2605 return ERR_PTR(-ENAMETOOLONG);
@@ -2642,7 +2617,7 @@ static inline struct cftype *__file_cft(struct file *file)
2642 return __d_cft(file->f_dentry); 2617 return __d_cft(file->f_dentry);
2643} 2618}
2644 2619
2645static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2620static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2646 struct super_block *sb) 2621 struct super_block *sb)
2647{ 2622{
2648 struct inode *inode; 2623 struct inode *inode;
@@ -2662,27 +2637,45 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2662 2637
2663 /* start off with i_nlink == 2 (for "." entry) */ 2638 /* start off with i_nlink == 2 (for "." entry) */
2664 inc_nlink(inode); 2639 inc_nlink(inode);
2665 inc_nlink(dentry->d_parent->d_inode);
2666 2640
2667 /* 2641 /* start with the directory inode held, so that we can
2668 * Control reaches here with cgroup_mutex held. 2642 * populate it without racing with another mkdir */
2669 * @inode->i_mutex should nest outside cgroup_mutex but we 2643 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2670 * want to populate it immediately without releasing
2671 * cgroup_mutex. As @inode isn't visible to anyone else
2672 * yet, trylock will always succeed without affecting
2673 * lockdep checks.
2674 */
2675 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2676 } else if (S_ISREG(mode)) { 2644 } else if (S_ISREG(mode)) {
2677 inode->i_size = 0; 2645 inode->i_size = 0;
2678 inode->i_fop = &cgroup_file_operations; 2646 inode->i_fop = &cgroup_file_operations;
2679 inode->i_op = &cgroup_file_inode_operations;
2680 } 2647 }
2681 d_instantiate(dentry, inode); 2648 d_instantiate(dentry, inode);
2682 dget(dentry); /* Extra count - pin the dentry in core */ 2649 dget(dentry); /* Extra count - pin the dentry in core */
2683 return 0; 2650 return 0;
2684} 2651}
2685 2652
2653/*
2654 * cgroup_create_dir - create a directory for an object.
2655 * @cgrp: the cgroup we create the directory for. It must have a valid
2656 * ->parent field. And we are going to fill its ->dentry field.
2657 * @dentry: dentry of the new cgroup
2658 * @mode: mode to set on new directory.
2659 */
2660static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2661 mode_t mode)
2662{
2663 struct dentry *parent;
2664 int error = 0;
2665
2666 parent = cgrp->parent->dentry;
2667 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2668 if (!error) {
2669 dentry->d_fsdata = cgrp;
2670 inc_nlink(parent->d_inode);
2671 rcu_assign_pointer(cgrp->dentry, dentry);
2672 dget(dentry);
2673 }
2674 dput(dentry);
2675
2676 return error;
2677}
2678
2686/** 2679/**
2687 * cgroup_file_mode - deduce file mode of a control file 2680 * cgroup_file_mode - deduce file mode of a control file
2688 * @cft: the control file in question 2681 * @cft: the control file in question
@@ -2692,9 +2685,9 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,

2692 * returns S_IRUGO if it has only a read handler 2685 * returns S_IRUGO if it has only a read handler
2693 * returns S_IWUSR if it has only a write handler 2686 * returns S_IWUSR if it has only a write handler
2694 */ 2687 */
2695static umode_t cgroup_file_mode(const struct cftype *cft) 2688static mode_t cgroup_file_mode(const struct cftype *cft)
2696{ 2689{
2697 umode_t mode = 0; 2690 mode_t mode = 0;
2698 2691
2699 if (cft->mode) 2692 if (cft->mode)
2700 return cft->mode; 2693 return cft->mode;
@@ -2710,193 +2703,50 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2710 return mode; 2703 return mode;
2711} 2704}
2712 2705
2713static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2706int cgroup_add_file(struct cgroup *cgrp,
2714 struct cftype *cft) 2707 struct cgroup_subsys *subsys,
2708 const struct cftype *cft)
2715{ 2709{
2716 struct dentry *dir = cgrp->dentry; 2710 struct dentry *dir = cgrp->dentry;
2717 struct cgroup *parent = __d_cgrp(dir);
2718 struct dentry *dentry; 2711 struct dentry *dentry;
2719 struct cfent *cfe;
2720 int error; 2712 int error;
2721 umode_t mode; 2713 mode_t mode;
2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2723
2724 simple_xattrs_init(&cft->xattrs);
2725 2714
2715 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2726 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2716 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2727 strcpy(name, subsys->name); 2717 strcpy(name, subsys->name);
2728 strcat(name, "."); 2718 strcat(name, ".");
2729 } 2719 }
2730 strcat(name, cft->name); 2720 strcat(name, cft->name);
2731
2732 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2721 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2733
2734 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2735 if (!cfe)
2736 return -ENOMEM;
2737
2738 dentry = lookup_one_len(name, dir, strlen(name)); 2722 dentry = lookup_one_len(name, dir, strlen(name));
2739 if (IS_ERR(dentry)) { 2723 if (!IS_ERR(dentry)) {
2724 mode = cgroup_file_mode(cft);
2725 error = cgroup_create_file(dentry, mode | S_IFREG,
2726 cgrp->root->sb);
2727 if (!error)
2728 dentry->d_fsdata = (void *)cft;
2729 dput(dentry);
2730 } else
2740 error = PTR_ERR(dentry); 2731 error = PTR_ERR(dentry);
2741 goto out;
2742 }
2743
2744 mode = cgroup_file_mode(cft);
2745 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2746 if (!error) {
2747 cfe->type = (void *)cft;
2748 cfe->dentry = dentry;
2749 dentry->d_fsdata = cfe;
2750 list_add_tail(&cfe->node, &parent->files);
2751 cfe = NULL;
2752 }
2753 dput(dentry);
2754out:
2755 kfree(cfe);
2756 return error; 2732 return error;
2757} 2733}
2734EXPORT_SYMBOL_GPL(cgroup_add_file);
2758 2735
2759static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2736int cgroup_add_files(struct cgroup *cgrp,
2760 struct cftype cfts[], bool is_add) 2737 struct cgroup_subsys *subsys,
2761{ 2738 const struct cftype cft[],
2762 struct cftype *cft; 2739 int count)
2763 int err, ret = 0;
2764
2765 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2766 /* does cft->flags tell us to skip this file on @cgrp? */
2767 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2768 continue;
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue;
2771
2772 if (is_add)
2773 err = cgroup_add_file(cgrp, subsys, cft);
2774 else
2775 err = cgroup_rm_file(cgrp, cft);
2776 if (err) {
2777 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2778 is_add ? "add" : "remove", cft->name, err);
2779 ret = err;
2780 }
2781 }
2782 return ret;
2783}
2784
2785static DEFINE_MUTEX(cgroup_cft_mutex);
2786
2787static void cgroup_cfts_prepare(void)
2788 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2789{ 2740{
2790 /* 2741 int i, err;
2791 * Thanks to the entanglement with vfs inode locking, we can't walk 2742 for (i = 0; i < count; i++) {
2792 * the existing cgroups under cgroup_mutex and create files. 2743 err = cgroup_add_file(cgrp, subsys, &cft[i]);
2793 * Instead, we increment reference on all cgroups and build list of 2744 if (err)
2794 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2745 return err;
2795 * exclusive access to the field.
2796 */
2797 mutex_lock(&cgroup_cft_mutex);
2798 mutex_lock(&cgroup_mutex);
2799}
2800
2801static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2802 struct cftype *cfts, bool is_add)
2803 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2804{
2805 LIST_HEAD(pending);
2806 struct cgroup *cgrp, *n;
2807
2808 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2809 if (cfts && ss->root != &rootnode) {
2810 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2811 dget(cgrp->dentry);
2812 list_add_tail(&cgrp->cft_q_node, &pending);
2813 }
2814 }
2815
2816 mutex_unlock(&cgroup_mutex);
2817
2818 /*
2819 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2820 * files for all cgroups which were created before.
2821 */
2822 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2823 struct inode *inode = cgrp->dentry->d_inode;
2824
2825 mutex_lock(&inode->i_mutex);
2826 mutex_lock(&cgroup_mutex);
2827 if (!cgroup_is_removed(cgrp))
2828 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2829 mutex_unlock(&cgroup_mutex);
2830 mutex_unlock(&inode->i_mutex);
2831
2832 list_del_init(&cgrp->cft_q_node);
2833 dput(cgrp->dentry);
2834 } 2746 }
2835
2836 mutex_unlock(&cgroup_cft_mutex);
2837}
2838
2839/**
2840 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2841 * @ss: target cgroup subsystem
2842 * @cfts: zero-length name terminated array of cftypes
2843 *
2844 * Register @cfts to @ss. Files described by @cfts are created for all
2845 * existing cgroups to which @ss is attached and all future cgroups will
2846 * have them too. This function can be called anytime whether @ss is
2847 * attached or not.
2848 *
2849 * Returns 0 on successful registration, -errno on failure. Note that this
2850 * function currently returns 0 as long as @cfts registration is successful
2851 * even if some file creation attempts on existing cgroups fail.
2852 */
2853int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2854{
2855 struct cftype_set *set;
2856
2857 set = kzalloc(sizeof(*set), GFP_KERNEL);
2858 if (!set)
2859 return -ENOMEM;
2860
2861 cgroup_cfts_prepare();
2862 set->cfts = cfts;
2863 list_add_tail(&set->node, &ss->cftsets);
2864 cgroup_cfts_commit(ss, cfts, true);
2865
2866 return 0; 2747 return 0;
2867} 2748}
2868EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2749EXPORT_SYMBOL_GPL(cgroup_add_files);
2869
2870/**
2871 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2872 * @ss: target cgroup subsystem
2873 * @cfts: zero-length name terminated array of cftypes
2874 *
2875 * Unregister @cfts from @ss. Files described by @cfts are removed from
2876 * all existing cgroups to which @ss is attached and all future cgroups
2877 * won't have them either. This function can be called anytime whether @ss
2878 * is attached or not.
2879 *
2880 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2881 * registered with @ss.
2882 */
2883int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2884{
2885 struct cftype_set *set;
2886
2887 cgroup_cfts_prepare();
2888
2889 list_for_each_entry(set, &ss->cftsets, node) {
2890 if (set->cfts == cfts) {
2891 list_del_init(&set->node);
2892 cgroup_cfts_commit(ss, cfts, false);
2893 return 0;
2894 }
2895 }
2896
2897 cgroup_cfts_commit(ss, NULL, false);
2898 return -ENOENT;
2899}
2900 2750
2901/** 2751/**
2902 * cgroup_task_count - count the number of tasks in a cgroup. 2752 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2947,20 +2797,15 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
2947 * using their cgroups capability, we don't maintain the lists running 2797 * using their cgroups capability, we don't maintain the lists running
2948 * through each css_set to its tasks until we see the list actually 2798 * through each css_set to its tasks until we see the list actually
2949 * used - in other words after the first call to cgroup_iter_start(). 2799 * used - in other words after the first call to cgroup_iter_start().
2800 *
2801 * The tasklist_lock is not held here, as do_each_thread() and
2802 * while_each_thread() are protected by RCU.
2950 */ 2803 */
2951static void cgroup_enable_task_cg_lists(void) 2804static void cgroup_enable_task_cg_lists(void)
2952{ 2805{
2953 struct task_struct *p, *g; 2806 struct task_struct *p, *g;
2954 write_lock(&css_set_lock); 2807 write_lock(&css_set_lock);
2955 use_task_css_set_links = 1; 2808 use_task_css_set_links = 1;
2956 /*
2957 * We need tasklist_lock because RCU is not safe against
2958 * while_each_thread(). Besides, a forking task that has passed
2959 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2960 * is not guaranteed to have its child immediately visible in the
2961 * tasklist if we walk through it with RCU.
2962 */
2963 read_lock(&tasklist_lock);
2964 do_each_thread(g, p) { 2809 do_each_thread(g, p) {
2965 task_lock(p); 2810 task_lock(p);
2966 /* 2811 /*
@@ -2972,98 +2817,10 @@ static void cgroup_enable_task_cg_lists(void)
2972 list_add(&p->cg_list, &p->cgroups->tasks); 2817 list_add(&p->cg_list, &p->cgroups->tasks);
2973 task_unlock(p); 2818 task_unlock(p);
2974 } while_each_thread(g, p); 2819 } while_each_thread(g, p);
2975 read_unlock(&tasklist_lock);
2976 write_unlock(&css_set_lock); 2820 write_unlock(&css_set_lock);
2977} 2821}
2978 2822
2979/**
2980 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2981 * @pos: the current position (%NULL to initiate traversal)
2982 * @cgroup: cgroup whose descendants to walk
2983 *
2984 * To be used by cgroup_for_each_descendant_pre(). Find the next
2985 * descendant to visit for pre-order traversal of @cgroup's descendants.
2986 */
2987struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2988 struct cgroup *cgroup)
2989{
2990 struct cgroup *next;
2991
2992 WARN_ON_ONCE(!rcu_read_lock_held());
2993
2994 /* if first iteration, pretend we just visited @cgroup */
2995 if (!pos) {
2996 if (list_empty(&cgroup->children))
2997 return NULL;
2998 pos = cgroup;
2999 }
3000
3001 /* visit the first child if exists */
3002 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3003 if (next)
3004 return next;
3005
3006 /* no child, visit my or the closest ancestor's next sibling */
3007 do {
3008 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3009 sibling);
3010 if (&next->sibling != &pos->parent->children)
3011 return next;
3012
3013 pos = pos->parent;
3014 } while (pos != cgroup);
3015
3016 return NULL;
3017}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{
3022 struct cgroup *last;
3023
3024 do {
3025 last = pos;
3026 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3027 sibling);
3028 } while (pos);
3029
3030 return last;
3031}
3032
3033/**
3034 * cgroup_next_descendant_post - find the next descendant for post-order walk
3035 * @pos: the current position (%NULL to initiate traversal)
3036 * @cgroup: cgroup whose descendants to walk
3037 *
3038 * To be used by cgroup_for_each_descendant_post(). Find the next
3039 * descendant to visit for post-order traversal of @cgroup's descendants.
3040 */
3041struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3042 struct cgroup *cgroup)
3043{
3044 struct cgroup *next;
3045
3046 WARN_ON_ONCE(!rcu_read_lock_held());
3047
3048 /* if first iteration, visit the leftmost descendant */
3049 if (!pos) {
3050 next = cgroup_leftmost_descendant(cgroup);
3051 return next != cgroup ? next : NULL;
3052 }
3053
3054 /* if there's an unvisited sibling, visit its leftmost descendant */
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return cgroup_leftmost_descendant(next);
3058
3059 /* no sibling left, visit parent */
3060 next = pos->parent;
3061 return next != cgroup ? next : NULL;
3062}
3063EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
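
The removed cgroup_next_descendant_pre()/_post() helpers implement an iterator-style tree walk: each call returns the next node, so callers can hold rcu_read_lock() around a plain loop instead of recursing. A compact user-space sketch of the pre-order variant, over an invented parent/first-child/next-sibling node layout (no RCU, no kernel list primitives):

    #include <stddef.h>

    struct tnode {
            struct tnode *parent;
            struct tnode *first_child;
            struct tnode *next_sibling;
    };

    /*
     * Return the next node of a pre-order walk of root's descendants, or NULL
     * when the walk is done. Pass pos == NULL to start; root itself is not
     * visited, matching the removed kernel helper's contract.
     */
    struct tnode *next_descendant_pre(struct tnode *pos, struct tnode *root)
    {
            if (!pos)                               /* first call: leftmost child */
                    return root->first_child;

            if (pos->first_child)                   /* go deeper first */
                    return pos->first_child;

            while (pos != root) {                   /* else: nearest ancestor's next sibling */
                    if (pos->next_sibling)
                            return pos->next_sibling;
                    pos = pos->parent;
            }
            return NULL;
    }

A caller then loops for (n = next_descendant_pre(NULL, root); n; n = next_descendant_pre(n, root)), which is the shape the cgroup_for_each_descendant_pre() macro referenced in the removed comment wraps around the kernel version.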
3064
3065void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2823void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3066 __acquires(css_set_lock)
3067{ 2824{
3068 /* 2825 /*
3069 * The first time anyone tries to iterate across a cgroup, 2826 * The first time anyone tries to iterate across a cgroup,
@@ -3103,7 +2860,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3103} 2860}
3104 2861
3105void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2862void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
3106 __releases(css_set_lock)
3107{ 2863{
3108 read_unlock(&css_set_lock); 2864 read_unlock(&css_set_lock);
3109} 2865}
@@ -3278,38 +3034,6 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3278 * 3034 *
3279 */ 3035 */
3280 3036
3281/* which pidlist file are we talking about? */
3282enum cgroup_filetype {
3283 CGROUP_FILE_PROCS,
3284 CGROUP_FILE_TASKS,
3285};
3286
3287/*
3288 * A pidlist is a list of pids that virtually represents the contents of one
3289 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3290 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3291 * to the cgroup.
3292 */
3293struct cgroup_pidlist {
3294 /*
3295 * used to find which pidlist is wanted. doesn't change as long as
3296 * this particular list stays in the list.
3297 */
3298 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3299 /* array of xids */
3300 pid_t *list;
3301 /* how many elements the above list has */
3302 int length;
3303 /* how many files are using the current array */
3304 int use_count;
3305 /* each of these stored in a list by its cgroup */
3306 struct list_head links;
3307 /* pointer to the cgroup we belong to, for list removal purposes */
3308 struct cgroup *owner;
3309 /* protects the other fields */
3310 struct rw_semaphore mutex;
3311};
3312
3313/* 3037/*
3314 * The following two functions "fix" the issue where there are more pids 3038 * The following two functions "fix" the issue where there are more pids
3315 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3039 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3408,7 +3132,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3408{ 3132{
3409 struct cgroup_pidlist *l; 3133 struct cgroup_pidlist *l;
3410 /* don't need task_nsproxy() if we're looking at ourself */ 3134 /* don't need task_nsproxy() if we're looking at ourself */
3411 struct pid_namespace *ns = task_active_pid_ns(current); 3135 struct pid_namespace *ns = current->nsproxy->pid_ns;
3412 3136
3413 /* 3137 /*
3414 * We can't drop the pidlist_mutex before taking the l->mutex in case 3138 * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3775,7 +3499,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3775 if (flags & POLLHUP) { 3499 if (flags & POLLHUP) {
3776 __remove_wait_queue(event->wqh, &event->wait); 3500 __remove_wait_queue(event->wqh, &event->wait);
3777 spin_lock(&cgrp->event_list_lock); 3501 spin_lock(&cgrp->event_list_lock);
3778 list_del_init(&event->list); 3502 list_del(&event->list);
3779 spin_unlock(&cgrp->event_list_lock); 3503 spin_unlock(&cgrp->event_list_lock);
3780 /* 3504 /*
3781 * We are in atomic context, but cgroup_event_remove() may 3505 * We are in atomic context, but cgroup_event_remove() may
@@ -3912,7 +3636,7 @@ fail:
3912static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3636static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3913 struct cftype *cft) 3637 struct cftype *cft)
3914{ 3638{
3915 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3639 return clone_children(cgrp);
3916} 3640}
3917 3641
3918static int cgroup_clone_children_write(struct cgroup *cgrp, 3642static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3920,9 +3644,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3920 u64 val) 3644 u64 val)
3921{ 3645{
3922 if (val) 3646 if (val)
3923 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3647 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3924 else 3648 else
3925 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3649 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3926 return 0; 3650 return 0;
3927} 3651}
3928 3652
@@ -3961,44 +3685,36 @@ static struct cftype files[] = {
3961 .read_u64 = cgroup_clone_children_read, 3685 .read_u64 = cgroup_clone_children_read,
3962 .write_u64 = cgroup_clone_children_write, 3686 .write_u64 = cgroup_clone_children_write,
3963 }, 3687 },
3964 {
3965 .name = "release_agent",
3966 .flags = CFTYPE_ONLY_ON_ROOT,
3967 .read_seq_string = cgroup_release_agent_show,
3968 .write_string = cgroup_release_agent_write,
3969 .max_write_len = PATH_MAX,
3970 },
3971 { } /* terminate */
3972}; 3688};
3973 3689
3974/** 3690static struct cftype cft_release_agent = {
3975 * cgroup_populate_dir - selectively creation of files in a directory 3691 .name = "release_agent",
3976 * @cgrp: target cgroup 3692 .read_seq_string = cgroup_release_agent_show,
3977 * @base_files: true if the base files should be added 3693 .write_string = cgroup_release_agent_write,
3978 * @subsys_mask: mask of the subsystem ids whose files should be added 3694 .max_write_len = PATH_MAX,
3979 */ 3695};
3980static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 3696
3981 unsigned long subsys_mask) 3697static int cgroup_populate_dir(struct cgroup *cgrp)
3982{ 3698{
3983 int err; 3699 int err;
3984 struct cgroup_subsys *ss; 3700 struct cgroup_subsys *ss;
3985 3701
3986 if (base_files) { 3702 /* First clear out any existing files */
3987 err = cgroup_addrm_files(cgrp, NULL, files, true); 3703 cgroup_clear_directory(cgrp->dentry);
3988 if (err < 0) 3704
3705 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3706 if (err < 0)
3707 return err;
3708
3709 if (cgrp == cgrp->top_cgroup) {
3710 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3989 return err; 3711 return err;
3990 } 3712 }
3991 3713
3992 /* process cftsets of each subsystem */
3993 for_each_subsys(cgrp->root, ss) { 3714 for_each_subsys(cgrp->root, ss) {
3994 struct cftype_set *set; 3715 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3995 if (!test_bit(ss->subsys_id, &subsys_mask)) 3716 return err;
3996 continue;
3997
3998 list_for_each_entry(set, &ss->cftsets, node)
3999 cgroup_addrm_files(cgrp, ss, set->cfts, true);
4000 } 3717 }
4001
4002 /* This cgroup is ready now */ 3718 /* This cgroup is ready now */
4003 for_each_subsys(cgrp->root, ss) { 3719 for_each_subsys(cgrp->root, ss) {
4004 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3720 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -4014,18 +3730,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4014 return 0; 3730 return 0;
4015} 3731}
4016 3732
4017static void css_dput_fn(struct work_struct *work)
4018{
4019 struct cgroup_subsys_state *css =
4020 container_of(work, struct cgroup_subsys_state, dput_work);
4021 struct dentry *dentry = css->cgroup->dentry;
4022 struct super_block *sb = dentry->d_sb;
4023
4024 atomic_inc(&sb->s_active);
4025 dput(dentry);
4026 deactivate_super(sb);
4027}
4028
4029static void init_cgroup_css(struct cgroup_subsys_state *css, 3733static void init_cgroup_css(struct cgroup_subsys_state *css,
4030 struct cgroup_subsys *ss, 3734 struct cgroup_subsys *ss,
4031 struct cgroup *cgrp) 3735 struct cgroup *cgrp)
@@ -4035,57 +3739,40 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4035 css->flags = 0; 3739 css->flags = 0;
4036 css->id = NULL; 3740 css->id = NULL;
4037 if (cgrp == dummytop) 3741 if (cgrp == dummytop)
4038 css->flags |= CSS_ROOT; 3742 set_bit(CSS_ROOT, &css->flags);
4039 BUG_ON(cgrp->subsys[ss->subsys_id]); 3743 BUG_ON(cgrp->subsys[ss->subsys_id]);
4040 cgrp->subsys[ss->subsys_id] = css; 3744 cgrp->subsys[ss->subsys_id] = css;
4041
4042 /*
4043 * css holds an extra ref to @cgrp->dentry which is put on the last
4044 * css_put(). dput() requires process context, which css_put() may
4045 * be called without. @css->dput_work will be used to invoke
4046 * dput() asynchronously from css_put().
4047 */
4048 INIT_WORK(&css->dput_work, css_dput_fn);
4049} 3745}
4050 3746
4051/* invoke ->post_create() on a new CSS and mark it online if successful */ 3747static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
4052static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4053{ 3748{
4054 int ret = 0; 3749 /* We need to take each hierarchy_mutex in a consistent order */
4055 3750 int i;
4056 lockdep_assert_held(&cgroup_mutex);
4057 3751
4058 if (ss->css_online) 3752 /*
4059 ret = ss->css_online(cgrp); 3753 * No worry about a race with rebind_subsystems that might mess up the
4060 if (!ret) 3754 * locking order, since both parties are under cgroup_mutex.
4061 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 3755 */
4062 return ret; 3756 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3757 struct cgroup_subsys *ss = subsys[i];
3758 if (ss == NULL)
3759 continue;
3760 if (ss->root == root)
3761 mutex_lock(&ss->hierarchy_mutex);
3762 }
4063} 3763}
4064 3764
4065/* if the CSS is online, invoke ->pre_destroy() on it and mark it offline */ 3765static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
4066static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4067 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4068{ 3766{
4069 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3767 int i;
4070
4071 lockdep_assert_held(&cgroup_mutex);
4072
4073 if (!(css->flags & CSS_ONLINE))
4074 return;
4075 3768
4076 /* 3769 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4077 * css_offline() should be called with cgroup_mutex unlocked. See 3770 struct cgroup_subsys *ss = subsys[i];
4078 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for 3771 if (ss == NULL)
4079 * details. This temporary unlocking should go away once 3772 continue;
4080 * cgroup_mutex is unexported from controllers. 3773 if (ss->root == root)
4081 */ 3774 mutex_unlock(&ss->hierarchy_mutex);
4082 if (ss->css_offline) {
4083 mutex_unlock(&cgroup_mutex);
4084 ss->css_offline(cgrp);
4085 mutex_lock(&cgroup_mutex);
4086 } 3775 }
4087
4088 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4089} 3776}
4090 3777
4091/* 3778/*
@@ -4097,7 +3784,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4097 * Must be called with the mutex on the parent inode held 3784 * Must be called with the mutex on the parent inode held
4098 */ 3785 */
4099static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3786static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4100 umode_t mode) 3787 mode_t mode)
4101{ 3788{
4102 struct cgroup *cgrp; 3789 struct cgroup *cgrp;
4103 struct cgroupfs_root *root = parent->root; 3790 struct cgroupfs_root *root = parent->root;
@@ -4105,27 +3792,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4105 struct cgroup_subsys *ss; 3792 struct cgroup_subsys *ss;
4106 struct super_block *sb = root->sb; 3793 struct super_block *sb = root->sb;
4107 3794
4108 /* allocate the cgroup and its ID, 0 is reserved for the root */
4109 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3795 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4110 if (!cgrp) 3796 if (!cgrp)
4111 return -ENOMEM; 3797 return -ENOMEM;
4112 3798
4113 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4114 if (cgrp->id < 0)
4115 goto err_free_cgrp;
4116
4117 /*
4118 * Only live parents can have children. Note that the liveliness
4119 * check isn't strictly necessary because cgroup_mkdir() and
4120 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4121 * anyway so that locking is contained inside cgroup proper and we
4122 * don't get nasty surprises if we ever grow another caller.
4123 */
4124 if (!cgroup_lock_live_group(parent)) {
4125 err = -ENODEV;
4126 goto err_free_id;
4127 }
4128
4129 /* Grab a reference on the superblock so the hierarchy doesn't 3799 /* Grab a reference on the superblock so the hierarchy doesn't
4130 * get deleted on unmount if there are child cgroups. This 3800 * get deleted on unmount if there are child cgroups. This
4131 * can be done outside cgroup_mutex, since the sb can't 3801 * can be done outside cgroup_mutex, since the sb can't
@@ -4133,6 +3803,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4133 * fs */ 3803 * fs */
4134 atomic_inc(&sb->s_active); 3804 atomic_inc(&sb->s_active);
4135 3805
3806 mutex_lock(&cgroup_mutex);
3807
4136 init_cgroup_housekeeping(cgrp); 3808 init_cgroup_housekeeping(cgrp);
4137 3809
4138 cgrp->parent = parent; 3810 cgrp->parent = parent;
@@ -4142,93 +3814,73 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4142 if (notify_on_release(parent)) 3814 if (notify_on_release(parent))
4143 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3815 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4144 3816
4145 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3817 if (clone_children(parent))
4146 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3818 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
4147 3819
4148 for_each_subsys(root, ss) { 3820 for_each_subsys(root, ss) {
4149 struct cgroup_subsys_state *css; 3821 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
4150 3822
4151 css = ss->css_alloc(cgrp);
4152 if (IS_ERR(css)) { 3823 if (IS_ERR(css)) {
4153 err = PTR_ERR(css); 3824 err = PTR_ERR(css);
4154 goto err_free_all; 3825 goto err_destroy;
4155 } 3826 }
4156 init_cgroup_css(css, ss, cgrp); 3827 init_cgroup_css(css, ss, cgrp);
4157 if (ss->use_id) { 3828 if (ss->use_id) {
4158 err = alloc_css_id(ss, parent, cgrp); 3829 err = alloc_css_id(ss, parent, cgrp);
4159 if (err) 3830 if (err)
4160 goto err_free_all; 3831 goto err_destroy;
4161 } 3832 }
3833 /* At error, ->destroy() callback has to free assigned ID. */
3834 if (clone_children(parent) && ss->post_clone)
3835 ss->post_clone(ss, cgrp);
4162 } 3836 }
4163 3837
4164 /* 3838 cgroup_lock_hierarchy(root);
4165 * Create directory. cgroup_create_file() returns with the new 3839 list_add(&cgrp->sibling, &cgrp->parent->children);
4166 * directory locked on success so that it can be populated without 3840 cgroup_unlock_hierarchy(root);
4167 * dropping cgroup_mutex.
4168 */
4169 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4170 if (err < 0)
4171 goto err_free_all;
4172 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173
4174 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++; 3841 root->number_of_cgroups++;
4180 3842
4181 /* each css holds a ref to the cgroup's dentry */ 3843 err = cgroup_create_dir(cgrp, dentry, mode);
4182 for_each_subsys(root, ss) 3844 if (err < 0)
4183 dget(dentry); 3845 goto err_remove;
4184 3846
4185 /* creation succeeded, notify subsystems */ 3847 set_bit(CGRP_RELEASABLE, &parent->flags);
4186 for_each_subsys(root, ss) {
4187 err = online_css(ss, cgrp);
4188 if (err)
4189 goto err_destroy;
4190 3848
4191 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3849 /* The cgroup directory was pre-locked for us */
4192 parent->parent) { 3850 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4193 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4194 current->comm, current->pid, ss->name);
4195 if (!strcmp(ss->name, "memory"))
4196 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
4197 ss->warned_broken_hierarchy = true;
4198 }
4199 }
4200 3851
4201 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 3852 err = cgroup_populate_dir(cgrp);
4202 if (err) 3853 /* If err < 0, we have a half-filled directory - oh well ;) */
4203 goto err_destroy;
4204 3854
4205 mutex_unlock(&cgroup_mutex); 3855 mutex_unlock(&cgroup_mutex);
4206 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3856 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4207 3857
4208 return 0; 3858 return 0;
4209 3859
4210err_free_all: 3860 err_remove:
3861
3862 cgroup_lock_hierarchy(root);
3863 list_del(&cgrp->sibling);
3864 cgroup_unlock_hierarchy(root);
3865 root->number_of_cgroups--;
3866
3867 err_destroy:
3868
4211 for_each_subsys(root, ss) { 3869 for_each_subsys(root, ss) {
4212 if (cgrp->subsys[ss->subsys_id]) 3870 if (cgrp->subsys[ss->subsys_id])
4213 ss->css_free(cgrp); 3871 ss->destroy(ss, cgrp);
4214 } 3872 }
3873
4215 mutex_unlock(&cgroup_mutex); 3874 mutex_unlock(&cgroup_mutex);
3875
4216 /* Release the reference count that we took on the superblock */ 3876 /* Release the reference count that we took on the superblock */
4217 deactivate_super(sb); 3877 deactivate_super(sb);
4218err_free_id:
4219 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4220err_free_cgrp:
4221 kfree(cgrp);
4222 return err;
4223 3878
4224err_destroy: 3879 kfree(cgrp);
4225 cgroup_destroy_locked(cgrp);
4226 mutex_unlock(&cgroup_mutex);
4227 mutex_unlock(&dentry->d_inode->i_mutex);
4228 return err; 3880 return err;
4229} 3881}
4230 3882
4231static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3883static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4232{ 3884{
4233 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3885 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
4234 3886
@@ -4236,19 +3888,18 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4236 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3888 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4237} 3889}
4238 3890
4239/*
4240 * Check the reference count on each subsystem. Since we already
4241 * established that there are no tasks in the cgroup, if the css refcount
4242 * is also 1, then there should be no outstanding references, so the
4243 * subsystem is safe to destroy. We scan across all subsystems rather than
4244 * using the per-hierarchy linked list of mounted subsystems since we can
4245 * be called via check_for_release() with no synchronization other than
4246 * RCU, and the subsystem linked list isn't RCU-safe.
4247 */
4248static int cgroup_has_css_refs(struct cgroup *cgrp) 3891static int cgroup_has_css_refs(struct cgroup *cgrp)
4249{ 3892{
3893 /* Check the reference count on each subsystem. Since we
3894 * already established that there are no tasks in the
3895 * cgroup, if the css refcount is also 1, then there should
3896 * be no outstanding references, so the subsystem is safe to
3897 * destroy. We scan across all subsystems rather than using
3898 * the per-hierarchy linked list of mounted subsystems since
3899 * we can be called via check_for_release() with no
3900 * synchronization other than RCU, and the subsystem linked
3901 * list isn't RCU-safe */
4250 int i; 3902 int i;
4251
4252 /* 3903 /*
4253 * We won't need to lock the subsys array, because the subsystems 3904 * We won't need to lock the subsys array, because the subsystems
4254 * we're concerned about aren't going anywhere since our cgroup root 3905 * we're concerned about aren't going anywhere since our cgroup root
@@ -4257,130 +3908,193 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4257 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3908 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4258 struct cgroup_subsys *ss = subsys[i]; 3909 struct cgroup_subsys *ss = subsys[i];
4259 struct cgroup_subsys_state *css; 3910 struct cgroup_subsys_state *css;
4260
4261 /* Skip subsystems not present or not in this hierarchy */ 3911 /* Skip subsystems not present or not in this hierarchy */
4262 if (ss == NULL || ss->root != cgrp->root) 3912 if (ss == NULL || ss->root != cgrp->root)
4263 continue; 3913 continue;
4264
4265 css = cgrp->subsys[ss->subsys_id]; 3914 css = cgrp->subsys[ss->subsys_id];
4266 /* 3915 /* When called from check_for_release() it's possible
4267 * When called from check_for_release() it's possible
4268 * that by this point the cgroup has been removed 3916 * that by this point the cgroup has been removed
4269 * and the css deleted. But a false-positive doesn't 3917 * and the css deleted. But a false-positive doesn't
4270 * matter, since it can only happen if the cgroup 3918 * matter, since it can only happen if the cgroup
4271 * has been deleted and hence no longer needs the 3919 * has been deleted and hence no longer needs the
4272 * release agent to be called anyway. 3920 * release agent to be called anyway. */
4273 */ 3921 if (css && (atomic_read(&css->refcnt) > 1))
4274 if (css && css_refcnt(css) > 1)
4275 return 1; 3922 return 1;
4276 } 3923 }
4277 return 0; 3924 return 0;
4278} 3925}
4279 3926
4280static int cgroup_destroy_locked(struct cgroup *cgrp) 3927/*
4281 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3928 * Atomically mark all (or else none) of the cgroup's CSS objects as
3929 * CSS_REMOVED. Return true on success, or false if the cgroup has
3930 * busy subsystems. Call with cgroup_mutex held
3931 */
3932
3933static int cgroup_clear_css_refs(struct cgroup *cgrp)
4282{ 3934{
4283 struct dentry *d = cgrp->dentry;
4284 struct cgroup *parent = cgrp->parent;
4285 DEFINE_WAIT(wait);
4286 struct cgroup_event *event, *tmp;
4287 struct cgroup_subsys *ss; 3935 struct cgroup_subsys *ss;
4288 LIST_HEAD(tmp_list); 3936 unsigned long flags;
3937 bool failed = false;
3938 local_irq_save(flags);
3939 for_each_subsys(cgrp->root, ss) {
3940 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3941 int refcnt;
3942 while (1) {
3943 /* We can only remove a CSS with a refcnt==1 */
3944 refcnt = atomic_read(&css->refcnt);
3945 if (refcnt > 1) {
3946 failed = true;
3947 goto done;
3948 }
3949 BUG_ON(!refcnt);
3950 /*
3951 * Drop the refcnt to 0 while we check other
3952 * subsystems. This will cause any racing
3953 * css_tryget() to spin until we set the
3954 * CSS_REMOVED bits or abort
3955 */
3956 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3957 break;
3958 cpu_relax();
3959 }
3960 }
3961 done:
3962 for_each_subsys(cgrp->root, ss) {
3963 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3964 if (failed) {
3965 /*
3966 * Restore old refcnt if we previously managed
3967 * to clear it from 1 to 0
3968 */
3969 if (!atomic_read(&css->refcnt))
3970 atomic_set(&css->refcnt, 1);
3971 } else {
3972 /* Commit the fact that the CSS is removed */
3973 set_bit(CSS_REMOVED, &css->flags);
3974 }
3975 }
3976 local_irq_restore(flags);
3977 return !failed;
3978}
4289 3979
4290 lockdep_assert_held(&d->d_inode->i_mutex); 3980/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
4291 lockdep_assert_held(&cgroup_mutex); 3981 * Must be called with css_set_lock held */
3982static int cgroup_css_sets_empty(struct cgroup *cgrp)
3983{
3984 struct cg_cgroup_link *link;
4292 3985
4293 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 3986 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
4294 return -EBUSY; 3987 struct css_set *cg = link->cg;
3988 if (atomic_read(&cg->refcount) > 0)
3989 return 0;
3990 }
4295 3991
4296 /* 3992 return 1;
4297 * Block new css_tryget() by deactivating refcnt and mark @cgrp 3993}
4298 * removed. This makes future css_tryget() and child creation 3994
4299 * attempts fail thus maintaining the removal conditions verified 3995static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4300 * above. 3996{
4301 */ 3997 struct cgroup *cgrp = dentry->d_fsdata;
4302 for_each_subsys(cgrp->root, ss) { 3998 struct dentry *d;
4303 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3999 struct cgroup *parent;
4000 DEFINE_WAIT(wait);
4001 struct cgroup_event *event, *tmp;
4002 int ret;
4304 4003
4305 WARN_ON(atomic_read(&css->refcnt) < 0); 4004 /* the vfs holds both inode->i_mutex already */
4306 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4005again:
4006 mutex_lock(&cgroup_mutex);
4007 if (!cgroup_css_sets_empty(cgrp)) {
4008 mutex_unlock(&cgroup_mutex);
4009 return -EBUSY;
4307 } 4010 }
4308 set_bit(CGRP_REMOVED, &cgrp->flags); 4011 if (!list_empty(&cgrp->children)) {
4012 mutex_unlock(&cgroup_mutex);
4013 return -EBUSY;
4014 }
4015 mutex_unlock(&cgroup_mutex);
4309 4016
4310 /* tell subsystems to initiate destruction */ 4017 /*
4311 for_each_subsys(cgrp->root, ss) 4018 * In general, subsystem has no css->refcnt after pre_destroy(). But
4312 offline_css(ss, cgrp); 4019 * in racy cases, subsystem may have to get css->refcnt after
4020 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4021 * makes rmdir return -EBUSY too often. To avoid that, we use a waitqueue
4022 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4023 * and subsystem's reference count handling. Please see css_get/put
4024 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4025 */
4026 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4313 4027
4314 /* 4028 /*
4315 * Put all the base refs. Each css holds an extra reference to the 4029 * Call pre_destroy handlers of subsys. Notify subsystems
4316 * cgroup's dentry and cgroup removal proceeds regardless of css 4030 * that rmdir() request comes.
4317 * refs. On the last put of each css, whenever that may be, the
4318 * extra dentry ref is put so that dentry destruction happens only
4319 * after all css's are released.
4320 */ 4031 */
4321 for_each_subsys(cgrp->root, ss) 4032 ret = cgroup_call_pre_destroy(cgrp);
4322 css_put(cgrp->subsys[ss->subsys_id]); 4033 if (ret) {
4034 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4035 return ret;
4036 }
4037
4038 mutex_lock(&cgroup_mutex);
4039 parent = cgrp->parent;
4040 if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
4041 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4042 mutex_unlock(&cgroup_mutex);
4043 return -EBUSY;
4044 }
4045 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4046 if (!cgroup_clear_css_refs(cgrp)) {
4047 mutex_unlock(&cgroup_mutex);
4048 /*
4049 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4050 * prepare_to_wait(), we need to check this flag.
4051 */
4052 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4053 schedule();
4054 finish_wait(&cgroup_rmdir_waitq, &wait);
4055 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4056 if (signal_pending(current))
4057 return -EINTR;
4058 goto again;
4059 }
4060 /* No css_tryget() can succeed after this point. */ finish_wait(&cgroup_rmdir_waitq, &wait);
4061 finish_wait(&cgroup_rmdir_waitq, &wait);
4062 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4323 4063
4324 raw_spin_lock(&release_list_lock); 4064 spin_lock(&release_list_lock);
4065 set_bit(CGRP_REMOVED, &cgrp->flags);
4325 if (!list_empty(&cgrp->release_list)) 4066 if (!list_empty(&cgrp->release_list))
4326 list_del_init(&cgrp->release_list); 4067 list_del_init(&cgrp->release_list);
4327 raw_spin_unlock(&release_list_lock); 4068 spin_unlock(&release_list_lock);
4328 4069
4070 cgroup_lock_hierarchy(cgrp->root);
4329 /* delete this cgroup from parent->children */ 4071 /* delete this cgroup from parent->children */
4330 list_del_rcu(&cgrp->sibling); 4072 list_del_init(&cgrp->sibling);
4331 list_del_init(&cgrp->allcg_node); 4073 cgroup_unlock_hierarchy(cgrp->root);
4074
4075 d = dget(cgrp->dentry);
4332 4076
4333 dget(d);
4334 cgroup_d_remove_dir(d); 4077 cgroup_d_remove_dir(d);
4335 dput(d); 4078 dput(d);
4336 4079
4337 set_bit(CGRP_RELEASABLE, &parent->flags);
4338 check_for_release(parent); 4080 check_for_release(parent);
4339 4081
4340 /* 4082 /*
4341 * Unregister events and notify userspace. 4083 * Unregister events and notify userspace.
4343 * Notify userspace about cgroup removal only after rmdir of cgroup 4084 * Notify userspace about cgroup removal only after rmdir of cgroup
4343 * directory to avoid race between userspace and kernelspace. Use 4085 * directory to avoid race between userspace and kernelspace
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4347 */ 4086 */
4348 spin_lock(&cgrp->event_list_lock); 4087 spin_lock(&cgrp->event_list_lock);
4349 list_splice_init(&cgrp->event_list, &tmp_list); 4088 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4350 spin_unlock(&cgrp->event_list_lock); 4089 list_del(&event->list);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list);
4353 remove_wait_queue(event->wqh, &event->wait); 4090 remove_wait_queue(event->wqh, &event->wait);
4354 eventfd_signal(event->eventfd, 1); 4091 eventfd_signal(event->eventfd, 1);
4355 schedule_work(&event->remove); 4092 schedule_work(&event->remove);
4356 } 4093 }
4094 spin_unlock(&cgrp->event_list_lock);
4357 4095
4358 return 0;
4359}
4360
4361static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4362{
4363 int ret;
4364
4365 mutex_lock(&cgroup_mutex);
4366 ret = cgroup_destroy_locked(dentry->d_fsdata);
4367 mutex_unlock(&cgroup_mutex); 4096 mutex_unlock(&cgroup_mutex);
4368 4097 return 0;
4369 return ret;
4370}
4371
4372static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4373{
4374 INIT_LIST_HEAD(&ss->cftsets);
4375
4376 /*
4377 * base_cftset is embedded in subsys itself, no need to worry about
4378 * deregistration.
4379 */
4380 if (ss->base_cftypes) {
4381 ss->base_cftset.cfts = ss->base_cftypes;
4382 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4383 }
4384} 4098}
4385 4099
4386static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4100static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
@@ -4389,15 +4103,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4389 4103
4390 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4104 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4391 4105
4392 mutex_lock(&cgroup_mutex);
4393
4394 /* init base cftset */
4395 cgroup_init_cftsets(ss);
4396
4397 /* Create the top cgroup state for this subsystem */ 4106 /* Create the top cgroup state for this subsystem */
4398 list_add(&ss->sibling, &rootnode.subsys_list); 4107 list_add(&ss->sibling, &rootnode.subsys_list);
4399 ss->root = &rootnode; 4108 ss->root = &rootnode;
4400 css = ss->css_alloc(dummytop); 4109 css = ss->create(ss, dummytop);
4401 /* We don't handle early failures gracefully */ 4110 /* We don't handle early failures gracefully */
4402 BUG_ON(IS_ERR(css)); 4111 BUG_ON(IS_ERR(css));
4403 init_cgroup_css(css, ss, dummytop); 4112 init_cgroup_css(css, ss, dummytop);
@@ -4406,7 +4115,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4406 * pointer to this state - since the subsystem is 4115 * pointer to this state - since the subsystem is
4407 * newly registered, all tasks and hence the 4116 * newly registered, all tasks and hence the
4408 * init_css_set is in the subsystem's top cgroup. */ 4117 * init_css_set is in the subsystem's top cgroup. */
4409 init_css_set.subsys[ss->subsys_id] = css; 4118 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
4410 4119
4411 need_forkexit_callback |= ss->fork || ss->exit; 4120 need_forkexit_callback |= ss->fork || ss->exit;
4412 4121
@@ -4415,10 +4124,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4415 * need to invoke fork callbacks here. */ 4124 * need to invoke fork callbacks here. */
4416 BUG_ON(!list_empty(&init_task.tasks)); 4125 BUG_ON(!list_empty(&init_task.tasks));
4417 4126
4127 mutex_init(&ss->hierarchy_mutex);
4128 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4418 ss->active = 1; 4129 ss->active = 1;
4419 BUG_ON(online_css(ss, dummytop));
4420
4421 mutex_unlock(&cgroup_mutex);
4422 4130
4423 /* this function shouldn't be used with modular subsystems, since they 4131 /* this function shouldn't be used with modular subsystems, since they
4424 * need to register a subsys_id, among other things */ 4132 * need to register a subsys_id, among other things */
@@ -4436,12 +4144,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4436 */ 4144 */
4437int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4145int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4438{ 4146{
4147 int i;
4439 struct cgroup_subsys_state *css; 4148 struct cgroup_subsys_state *css;
4440 int i, ret;
4441 4149
4442 /* check name and function validity */ 4150 /* check name and function validity */
4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4151 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4444 ss->css_alloc == NULL || ss->css_free == NULL) 4152 ss->create == NULL || ss->destroy == NULL)
4445 return -EINVAL; 4153 return -EINVAL;
4446 4154
4447 /* 4155 /*
@@ -4458,26 +4166,39 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4458 * since cgroup_init_subsys will have already taken care of it. 4166 * since cgroup_init_subsys will have already taken care of it.
4459 */ 4167 */
4460 if (ss->module == NULL) { 4168 if (ss->module == NULL) {
4461 /* a sanity check */ 4169 /* a few sanity checks */
4170 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
4462 BUG_ON(subsys[ss->subsys_id] != ss); 4171 BUG_ON(subsys[ss->subsys_id] != ss);
4463 return 0; 4172 return 0;
4464 } 4173 }
4465 4174
4466 /* init base cftset */ 4175 /*
4467 cgroup_init_cftsets(ss); 4176 * need to register a subsys id before anything else - for example,
4468 4177 * init_cgroup_css needs it.
4178 */
4469 mutex_lock(&cgroup_mutex); 4179 mutex_lock(&cgroup_mutex);
4470 subsys[ss->subsys_id] = ss; 4180 /* find the first empty slot in the array */
4181 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
4182 if (subsys[i] == NULL)
4183 break;
4184 }
4185 if (i == CGROUP_SUBSYS_COUNT) {
4186 /* maximum number of subsystems already registered! */
4187 mutex_unlock(&cgroup_mutex);
4188 return -EBUSY;
4189 }
4190 /* assign ourselves the subsys_id */
4191 ss->subsys_id = i;
4192 subsys[i] = ss;
4471 4193
4472 /* 4194 /*
4473 * no ss->css_alloc seems to need anything important in the ss 4195 * no ss->create seems to need anything important in the ss struct, so
4474 * struct, so this can happen first (i.e. before the rootnode 4196 * this can happen first (i.e. before the rootnode attachment).
4475 * attachment).
4476 */ 4197 */
4477 css = ss->css_alloc(dummytop); 4198 css = ss->create(ss, dummytop);
4478 if (IS_ERR(css)) { 4199 if (IS_ERR(css)) {
4479 /* failure case - need to deassign the subsys[] slot. */ 4200 /* failure case - need to deassign the subsys[] slot. */
4480 subsys[ss->subsys_id] = NULL; 4201 subsys[i] = NULL;
4481 mutex_unlock(&cgroup_mutex); 4202 mutex_unlock(&cgroup_mutex);
4482 return PTR_ERR(css); 4203 return PTR_ERR(css);
4483 } 4204 }
@@ -4489,9 +4210,14 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4489 init_cgroup_css(css, ss, dummytop); 4210 init_cgroup_css(css, ss, dummytop);
4490 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4211 /* init_idr must be after init_cgroup_css because it sets css->id. */
4491 if (ss->use_id) { 4212 if (ss->use_id) {
4492 ret = cgroup_init_idr(ss, css); 4213 int ret = cgroup_init_idr(ss, css);
4493 if (ret) 4214 if (ret) {
4494 goto err_unload; 4215 dummytop->subsys[ss->subsys_id] = NULL;
4216 ss->destroy(ss, dummytop);
4217 subsys[i] = NULL;
4218 mutex_unlock(&cgroup_mutex);
4219 return ret;
4220 }
4495 } 4221 }
4496 4222
4497 /* 4223 /*
@@ -4523,20 +4249,13 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4523 } 4249 }
4524 write_unlock(&css_set_lock); 4250 write_unlock(&css_set_lock);
4525 4251
4252 mutex_init(&ss->hierarchy_mutex);
4253 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
4526 ss->active = 1; 4254 ss->active = 1;
4527 ret = online_css(ss, dummytop);
4528 if (ret)
4529 goto err_unload;
4530 4255
4531 /* success! */ 4256 /* success! */
4532 mutex_unlock(&cgroup_mutex); 4257 mutex_unlock(&cgroup_mutex);
4533 return 0; 4258 return 0;
4534
4535err_unload:
4536 mutex_unlock(&cgroup_mutex);
4537 /* @ss can't be mounted here as try_module_get() would fail */
4538 cgroup_unload_subsys(ss);
4539 return ret;
4540} 4259}
4541EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4260EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4542 4261
@@ -4563,16 +4282,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4563 BUG_ON(ss->root != &rootnode); 4282 BUG_ON(ss->root != &rootnode);
4564 4283
4565 mutex_lock(&cgroup_mutex); 4284 mutex_lock(&cgroup_mutex);
4566
4567 offline_css(ss, dummytop);
4568 ss->active = 0;
4569
4570 if (ss->use_id) {
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr);
4573 }
4574
4575 /* deassign the subsys_id */ 4285 /* deassign the subsys_id */
4286 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
4576 subsys[ss->subsys_id] = NULL; 4287 subsys[ss->subsys_id] = NULL;
4577 4288
4578 /* remove subsystem from rootnode's list of subsystems */ 4289 /* remove subsystem from rootnode's list of subsystems */
@@ -4587,6 +4298,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4587 struct css_set *cg = link->cg; 4298 struct css_set *cg = link->cg;
4588 4299
4589 hlist_del(&cg->hlist); 4300 hlist_del(&cg->hlist);
4301 BUG_ON(!cg->subsys[ss->subsys_id]);
4590 cg->subsys[ss->subsys_id] = NULL; 4302 cg->subsys[ss->subsys_id] = NULL;
4591 hhead = css_set_hash(cg->subsys); 4303 hhead = css_set_hash(cg->subsys);
4592 hlist_add_head(&cg->hlist, hhead); 4304 hlist_add_head(&cg->hlist, hhead);
@@ -4594,12 +4306,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4594 write_unlock(&css_set_lock); 4306 write_unlock(&css_set_lock);
4595 4307
4596 /* 4308 /*
4597 * remove subsystem's css from the dummytop and free it - need to 4309 * remove subsystem's css from the dummytop and free it - need to free
4598 * free before marking as null because ss->css_free needs the 4310 * before marking as null because ss->destroy needs the cgrp->subsys
4599 * cgrp->subsys pointer to find their state. note that this also 4311 * pointer to find their state. note that this also takes care of
4600 * takes care of freeing the css_id. 4312 * freeing the css_id.
4601 */ 4313 */
4602 ss->css_free(dummytop); 4314 ss->destroy(ss, dummytop);
4603 dummytop->subsys[ss->subsys_id] = NULL; 4315 dummytop->subsys[ss->subsys_id] = NULL;
4604 4316
4605 mutex_unlock(&cgroup_mutex); 4317 mutex_unlock(&cgroup_mutex);
@@ -4634,17 +4346,14 @@ int __init cgroup_init_early(void)
4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 4346 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4635 INIT_HLIST_HEAD(&css_set_table[i]); 4347 INIT_HLIST_HEAD(&css_set_table[i]);
4636 4348
4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4349 /* at bootup time, we don't worry about modular subsystems */
4350 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4638 struct cgroup_subsys *ss = subsys[i]; 4351 struct cgroup_subsys *ss = subsys[i];
4639 4352
4640 /* at bootup time, we don't worry about modular subsystems */
4641 if (!ss || ss->module)
4642 continue;
4643
4644 BUG_ON(!ss->name); 4353 BUG_ON(!ss->name);
4645 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4354 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4646 BUG_ON(!ss->css_alloc); 4355 BUG_ON(!ss->create);
4647 BUG_ON(!ss->css_free); 4356 BUG_ON(!ss->destroy);
4648 if (ss->subsys_id != i) { 4357 if (ss->subsys_id != i) {
4649 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4358 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4650 ss->name, ss->subsys_id); 4359 ss->name, ss->subsys_id);
@@ -4673,12 +4382,9 @@ int __init cgroup_init(void)
4673 if (err) 4382 if (err)
4674 return err; 4383 return err;
4675 4384
4676 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4385 /* at bootup time, we don't worry about modular subsystems */
4386 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4677 struct cgroup_subsys *ss = subsys[i]; 4387 struct cgroup_subsys *ss = subsys[i];
4678
4679 /* at bootup time, we don't worry about modular subsystems */
4680 if (!ss || ss->module)
4681 continue;
4682 if (!ss->early_init) 4388 if (!ss->early_init)
4683 cgroup_init_subsys(ss); 4389 cgroup_init_subsys(ss);
4684 if (ss->use_id) 4390 if (ss->use_id)
@@ -4851,30 +4557,41 @@ void cgroup_fork(struct task_struct *child)
4851} 4557}
4852 4558
4853/** 4559/**
4560 * cgroup_fork_callbacks - run fork callbacks
4561 * @child: the new task
4562 *
4563 * Called on a new task very soon before adding it to the
4564 * tasklist. No need to take any locks since no-one can
4565 * be operating on this task.
4566 */
4567void cgroup_fork_callbacks(struct task_struct *child)
4568{
4569 if (need_forkexit_callback) {
4570 int i;
4571 /*
4572 * forkexit callbacks are only supported for builtin
4573 * subsystems, and the builtin section of the subsys array is
4574 * immutable, so we don't need to lock the subsys array here.
4575 */
4576 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4577 struct cgroup_subsys *ss = subsys[i];
4578 if (ss->fork)
4579 ss->fork(ss, child);
4580 }
4581 }
4582}
4583
4584/**
4854 * cgroup_post_fork - called on a new task after adding it to the task list 4585 * cgroup_post_fork - called on a new task after adding it to the task list
4855 * @child: the task in question 4586 * @child: the task in question
4856 * 4587 *
4857 * Adds the task to the list running through its css_set if necessary and 4588 * Adds the task to the list running through its css_set if necessary.
4858 * call the subsystem fork() callbacks. Has to be after the task is 4589 * Has to be after the task is visible on the task list in case we race
4859 * visible on the task list in case we race with the first call to 4590 * with the first call to cgroup_iter_start() - to guarantee that the
4860 * cgroup_iter_start() - to guarantee that the new task ends up on its 4591 * new task ends up on its list.
4861 * list.
4862 */ 4592 */
4863void cgroup_post_fork(struct task_struct *child) 4593void cgroup_post_fork(struct task_struct *child)
4864{ 4594{
4865 int i;
4866
4867 /*
4868 * use_task_css_set_links is set to 1 before we walk the tasklist
4869 * under the tasklist_lock and we read it here after we added the child
4870 * to the tasklist under the tasklist_lock as well. If the child wasn't
4871 * yet in the tasklist when we walked through it from
4872 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
4873 * should be visible now due to the paired locking and barriers implied
4874 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4875 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
4876 * lock on fork.
4877 */
4878 if (use_task_css_set_links) { 4595 if (use_task_css_set_links) {
4879 write_lock(&css_set_lock); 4596 write_lock(&css_set_lock);
4880 task_lock(child); 4597 task_lock(child);
@@ -4883,30 +4600,7 @@ void cgroup_post_fork(struct task_struct *child)
4883 task_unlock(child); 4600 task_unlock(child);
4884 write_unlock(&css_set_lock); 4601 write_unlock(&css_set_lock);
4885 } 4602 }
4886
4887 /*
4888 * Call ss->fork(). This must happen after @child is linked on
4889 * css_set; otherwise, @child might change state between ->fork()
4890 * and addition to css_set.
4891 */
4892 if (need_forkexit_callback) {
4893 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4894 struct cgroup_subsys *ss = subsys[i];
4895
4896 /*
4897 * fork/exit callbacks are supported only for
4898 * builtin subsystems and we don't need further
4899 * synchronization as they never go away.
4900 */
4901 if (!ss || ss->module)
4902 continue;
4903
4904 if (ss->fork)
4905 ss->fork(child);
4906 }
4907 }
4908} 4603}
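
The use_task_css_set_links comment above (on the removal side of this hunk) argues by plain lock-based publication: a write performed before a lock is released is visible to anyone who later acquires that same lock. The following is a stripped-down userspace illustration of that guarantee using a pthread mutex in place of tasklist_lock; it is a sketch of the general pattern, not the exact fork-path sequence.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static bool published;          /* stands in for use_task_css_set_links */

    /* Writer: set the flag, then drop the lock.  The unlock is a release. */
    static void set_flag(void)
    {
            pthread_mutex_lock(&lk);
            published = true;
            pthread_mutex_unlock(&lk);
    }

    /* Reader: acquiring the same lock afterwards pairs with that release,
     * so the flag written before the unlock is guaranteed to be seen. */
    static bool read_flag(void)
    {
            bool seen;

            pthread_mutex_lock(&lk);
            seen = published;
            pthread_mutex_unlock(&lk);
            return seen;
    }

In cgroup_post_fork() the paired lock is tasklist_lock: the flag is written before tasklist_lock is dropped in cgroup_enable_task_cg_lists() and read only after fork has taken and released the same lock.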
4909
4910/** 4604/**
4911 * cgroup_exit - detach cgroup from exiting task 4605 * cgroup_exit - detach cgroup from exiting task
4912 * @tsk: pointer to task_struct of exiting process 4606 * @tsk: pointer to task_struct of exiting process
@@ -4965,25 +4659,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4965 tsk->cgroups = &init_css_set; 4659 tsk->cgroups = &init_css_set;
4966 4660
4967 if (run_callbacks && need_forkexit_callback) { 4661 if (run_callbacks && need_forkexit_callback) {
4968 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4662 /*
4663 * modular subsystems can't use callbacks, so no need to lock
4664 * the subsys array
4665 */
4666 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4969 struct cgroup_subsys *ss = subsys[i]; 4667 struct cgroup_subsys *ss = subsys[i];
4970
4971 /* modular subsystems can't use callbacks */
4972 if (!ss || ss->module)
4973 continue;
4974
4975 if (ss->exit) { 4668 if (ss->exit) {
4976 struct cgroup *old_cgrp = 4669 struct cgroup *old_cgrp =
4977 rcu_dereference_raw(cg->subsys[i])->cgroup; 4670 rcu_dereference_raw(cg->subsys[i])->cgroup;
4978 struct cgroup *cgrp = task_cgroup(tsk, i); 4671 struct cgroup *cgrp = task_cgroup(tsk, i);
4979 ss->exit(cgrp, old_cgrp, tsk); 4672 ss->exit(ss, cgrp, old_cgrp, tsk);
4980 } 4673 }
4981 } 4674 }
4982 } 4675 }
4983 task_unlock(tsk); 4676 task_unlock(tsk);
4984 4677
4985 if (cg) 4678 if (cg)
4986 put_css_set_taskexit(cg); 4679 put_css_set(cg);
4987} 4680}
4988 4681
4989/** 4682/**
@@ -5024,56 +4717,39 @@ static void check_for_release(struct cgroup *cgrp)
5024 * already queued for a userspace notification, queue 4717 * already queued for a userspace notification, queue
5025 * it now */ 4718 * it now */
5026 int need_schedule_work = 0; 4719 int need_schedule_work = 0;
5027 raw_spin_lock(&release_list_lock); 4720 spin_lock(&release_list_lock);
5028 if (!cgroup_is_removed(cgrp) && 4721 if (!cgroup_is_removed(cgrp) &&
5029 list_empty(&cgrp->release_list)) { 4722 list_empty(&cgrp->release_list)) {
5030 list_add(&cgrp->release_list, &release_list); 4723 list_add(&cgrp->release_list, &release_list);
5031 need_schedule_work = 1; 4724 need_schedule_work = 1;
5032 } 4725 }
5033 raw_spin_unlock(&release_list_lock); 4726 spin_unlock(&release_list_lock);
5034 if (need_schedule_work) 4727 if (need_schedule_work)
5035 schedule_work(&release_agent_work); 4728 schedule_work(&release_agent_work);
5036 } 4729 }
5037} 4730}
5038 4731
5039/* Caller must verify that the css is not for root cgroup */ 4732/* Caller must verify that the css is not for root cgroup */
5040bool __css_tryget(struct cgroup_subsys_state *css) 4733void __css_get(struct cgroup_subsys_state *css, int count)
5041{ 4734{
5042 while (true) { 4735 atomic_add(count, &css->refcnt);
5043 int t, v; 4736 set_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5044
5045 v = css_refcnt(css);
5046 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5047 if (likely(t == v))
5048 return true;
5049 else if (t < 0)
5050 return false;
5051 cpu_relax();
5052 }
5053} 4737}
5054EXPORT_SYMBOL_GPL(__css_tryget); 4738EXPORT_SYMBOL_GPL(__css_get);
5055 4739
5056/* Caller must verify that the css is not for root cgroup */ 4740/* Caller must verify that the css is not for root cgroup */
5057void __css_put(struct cgroup_subsys_state *css) 4741void __css_put(struct cgroup_subsys_state *css, int count)
5058{ 4742{
5059 struct cgroup *cgrp = css->cgroup; 4743 struct cgroup *cgrp = css->cgroup;
5060 int v; 4744 int val;
5061
5062 rcu_read_lock(); 4745 rcu_read_lock();
5063 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4746 val = atomic_sub_return(count, &css->refcnt);
5064 4747 if (val == 1) {
5065 switch (v) { 4748 check_for_release(cgrp);
5066 case 1: 4749 cgroup_wakeup_rmdir_waiter(cgrp);
5067 if (notify_on_release(cgrp)) {
5068 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5069 check_for_release(cgrp);
5070 }
5071 break;
5072 case 0:
5073 schedule_work(&css->dput_work);
5074 break;
5075 } 4750 }
5076 rcu_read_unlock(); 4751 rcu_read_unlock();
4752 WARN_ON_ONCE(val < 1);
5077} 4753}
5078EXPORT_SYMBOL_GPL(__css_put); 4754EXPORT_SYMBOL_GPL(__css_put);
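
The __css_tryget() loop shown above is a standard compare-and-swap retry: read the counter, give up once it has gone negative (the removal path adds CSS_DEACT_BIAS to deactivate it), otherwise try to increment and retry on contention. Below is a minimal userspace analogue of that pattern in C11 atomics; the struct, helper names and bias value are illustrative stand-ins, not the kernel's.

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    /* Illustrative deactivation bias; the kernel defines its own CSS_DEACT_BIAS. */
    #define DEACT_BIAS (INT_MIN / 2)

    struct ref {
            atomic_int refcnt;      /* starts at 1, like a css base reference */
    };

    /* Try to take a reference; fails once the counter has been deactivated. */
    static bool ref_tryget(struct ref *r)
    {
            int v = atomic_load(&r->refcnt);

            while (v >= 0) {        /* negative means "being destroyed" */
                    /* On CAS failure the current value is reloaded into v. */
                    if (atomic_compare_exchange_weak(&r->refcnt, &v, v + 1))
                            return true;
            }
            return false;
    }

    /* Deactivate: bias the counter negative so every later tryget fails. */
    static void ref_deactivate(struct ref *r)
    {
            atomic_fetch_add(&r->refcnt, DEACT_BIAS);
    }

Deactivation corresponds to the atomic_add(CSS_DEACT_BIAS, &css->refcnt) done in cgroup_destroy_locked() earlier in this diff: once the count is biased negative, every subsequent tryget fails without taking any lock.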
5079 4755
@@ -5104,7 +4780,7 @@ static void cgroup_release_agent(struct work_struct *work)
5104{ 4780{
5105 BUG_ON(work != &release_agent_work); 4781 BUG_ON(work != &release_agent_work);
5106 mutex_lock(&cgroup_mutex); 4782 mutex_lock(&cgroup_mutex);
5107 raw_spin_lock(&release_list_lock); 4783 spin_lock(&release_list_lock);
5108 while (!list_empty(&release_list)) { 4784 while (!list_empty(&release_list)) {
5109 char *argv[3], *envp[3]; 4785 char *argv[3], *envp[3];
5110 int i; 4786 int i;
@@ -5113,7 +4789,7 @@ static void cgroup_release_agent(struct work_struct *work)
5113 struct cgroup, 4789 struct cgroup,
5114 release_list); 4790 release_list);
5115 list_del_init(&cgrp->release_list); 4791 list_del_init(&cgrp->release_list);
5116 raw_spin_unlock(&release_list_lock); 4792 spin_unlock(&release_list_lock);
5117 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4793 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
5118 if (!pathbuf) 4794 if (!pathbuf)
5119 goto continue_free; 4795 goto continue_free;
@@ -5143,9 +4819,9 @@ static void cgroup_release_agent(struct work_struct *work)
5143 continue_free: 4819 continue_free:
5144 kfree(pathbuf); 4820 kfree(pathbuf);
5145 kfree(agentbuf); 4821 kfree(agentbuf);
5146 raw_spin_lock(&release_list_lock); 4822 spin_lock(&release_list_lock);
5147 } 4823 }
5148 raw_spin_unlock(&release_list_lock); 4824 spin_unlock(&release_list_lock);
5149 mutex_unlock(&cgroup_mutex); 4825 mutex_unlock(&cgroup_mutex);
5150} 4826}
5151 4827
@@ -5157,17 +4833,13 @@ static int __init cgroup_disable(char *str)
5157 while ((token = strsep(&str, ",")) != NULL) { 4833 while ((token = strsep(&str, ",")) != NULL) {
5158 if (!*token) 4834 if (!*token)
5159 continue; 4835 continue;
5160 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4836 /*
4837 * cgroup_disable, being at boot time, can't know about module
4838 * subsystems, so we don't worry about them.
4839 */
4840 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5161 struct cgroup_subsys *ss = subsys[i]; 4841 struct cgroup_subsys *ss = subsys[i];
5162 4842
5163 /*
5164 * cgroup_disable, being at boot time, can't
5165 * know about module subsystems, so we don't
5166 * worry about them.
5167 */
5168 if (!ss || ss->module)
5169 continue;
5170
5171 if (!strcmp(token, ss->name)) { 4843 if (!strcmp(token, ss->name)) {
5172 ss->disabled = 1; 4844 ss->disabled = 1;
5173 printk(KERN_INFO "Disabling %s control group" 4845 printk(KERN_INFO "Disabling %s control group"
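
For reference, the loop above consumes a comma-separated controller list passed on the kernel command line; a typical boot-time use (the controller names here are only examples) looks like:

    cgroup_disable=memory,cpuset

Each matching built-in subsystem simply gets ss->disabled set before any hierarchy is mounted.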
@@ -5196,7 +4868,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5196 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4868 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5197 * it's unchanged until freed. 4869 * it's unchanged until freed.
5198 */ 4870 */
5199 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 4871 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
5200 4872
5201 if (cssid) 4873 if (cssid)
5202 return cssid->id; 4874 return cssid->id;
@@ -5208,7 +4880,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
5208{ 4880{
5209 struct css_id *cssid; 4881 struct css_id *cssid;
5210 4882
5211 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 4883 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
5212 4884
5213 if (cssid) 4885 if (cssid)
5214 return cssid->depth; 4886 return cssid->depth;
@@ -5222,7 +4894,7 @@ EXPORT_SYMBOL_GPL(css_depth);
5222 * @root: the css supporsed to be an ancestor of the child. 4894 * @root: the css supporsed to be an ancestor of the child.
5223 * 4895 *
5224 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 4896 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
5225 * this function reads css->id, the caller must hold rcu_read_lock(). 4897 * this function reads css->id, this uses rcu_dereference() and rcu_read_lock().
5226 * But, considering usual usage, the csses should be valid objects after test. 4898 * But, considering usual usage, the csses should be valid objects after test.
5227 * Assuming that the caller will do some action to the child if this returns 4899 * Assuming that the caller will do some action to the child if this returns
5228 * true, the caller must take "child"'s reference count. 4900
@@ -5234,18 +4906,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5234{ 4906{
5235 struct css_id *child_id; 4907 struct css_id *child_id;
5236 struct css_id *root_id; 4908 struct css_id *root_id;
4909 bool ret = true;
5237 4910
4911 rcu_read_lock();
5238 child_id = rcu_dereference(child->id); 4912 child_id = rcu_dereference(child->id);
5239 if (!child_id)
5240 return false;
5241 root_id = rcu_dereference(root->id); 4913 root_id = rcu_dereference(root->id);
5242 if (!root_id) 4914 if (!child_id
5243 return false; 4915 || !root_id
5244 if (child_id->depth < root_id->depth) 4916 || (child_id->depth < root_id->depth)
5245 return false; 4917 || (child_id->stack[root_id->depth] != root_id->id))
5246 if (child_id->stack[root_id->depth] != root_id->id) 4918 ret = false;
5247 return false; 4919 rcu_read_unlock();
5248 return true; 4920 return ret;
5249} 4921}
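
The check above works because every css_id records the ids of all of its ancestors in stack[], indexed by depth, so the ancestry test reduces to two comparisons. A toy stand-in for that invariant follows; the struct is a simplified sketch, not the kernel's struct css_id.

    #include <stdbool.h>

    #define TOY_MAX_DEPTH 8

    struct toy_id {
            int id;
            int depth;                      /* root sits at depth 0 */
            int stack[TOY_MAX_DEPTH + 1];   /* stack[0..depth]: ancestor ids, self last */
    };

    /* root is an ancestor of child iff child is at least as deep and records
     * root's id at root's depth, mirroring the two tests above. */
    static bool toy_is_ancestor(const struct toy_id *child, const struct toy_id *root)
    {
            return child->depth >= root->depth &&
                   child->stack[root->depth] == root->id;
    }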
5250 4922
5251void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4923void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
@@ -5407,8 +5079,6 @@ css_get_next(struct cgroup_subsys *ss, int id,
5407 return NULL; 5079 return NULL;
5408 5080
5409 BUG_ON(!ss->use_id); 5081 BUG_ON(!ss->use_id);
5410 WARN_ON_ONCE(!rcu_read_lock_held());
5411
5412 /* fill start point for scan */ 5082 /* fill start point for scan */
5413 tmpid = id; 5083 tmpid = id;
5414 while (1) { 5084 while (1) {
@@ -5416,7 +5086,10 @@ css_get_next(struct cgroup_subsys *ss, int id,
5416 * scan next entry from bitmap(tree), tmpid is updated after 5086 * scan next entry from bitmap(tree), tmpid is updated after
5417 * idr_get_next(). 5087 * idr_get_next().
5418 */ 5088 */
5089 spin_lock(&ss->id_lock);
5419 tmp = idr_get_next(&ss->idr, &tmpid); 5090 tmp = idr_get_next(&ss->idr, &tmpid);
5091 spin_unlock(&ss->id_lock);
5092
5420 if (!tmp) 5093 if (!tmp)
5421 break; 5094 break;
5422 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5095 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
@@ -5456,7 +5129,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5456} 5129}
5457 5130
5458#ifdef CONFIG_CGROUP_DEBUG 5131#ifdef CONFIG_CGROUP_DEBUG
5459static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5132static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5133 struct cgroup *cont)
5460{ 5134{
5461 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5135 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5462 5136
@@ -5466,7 +5140,7 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5466 return css; 5140 return css;
5467} 5141}
5468 5142
5469static void debug_css_free(struct cgroup *cont) 5143static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
5470{ 5144{
5471 kfree(cont->subsys[debug_subsys_id]); 5145 kfree(cont->subsys[debug_subsys_id]);
5472} 5146}
@@ -5589,15 +5263,19 @@ static struct cftype debug_files[] = {
5589 .name = "releasable", 5263 .name = "releasable",
5590 .read_u64 = releasable_read, 5264 .read_u64 = releasable_read,
5591 }, 5265 },
5592
5593 { } /* terminate */
5594}; 5266};
5595 5267
5268static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
5269{
5270 return cgroup_add_files(cont, ss, debug_files,
5271 ARRAY_SIZE(debug_files));
5272}
5273
5596struct cgroup_subsys debug_subsys = { 5274struct cgroup_subsys debug_subsys = {
5597 .name = "debug", 5275 .name = "debug",
5598 .css_alloc = debug_css_alloc, 5276 .create = debug_create,
5599 .css_free = debug_css_free, 5277 .destroy = debug_destroy,
5278 .populate = debug_populate,
5600 .subsys_id = debug_subsys_id, 5279 .subsys_id = debug_subsys_id,
5601 .base_cftypes = debug_files,
5602}; 5280};
5603#endif /* CONFIG_CGROUP_DEBUG */ 5281#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea502..a3f638ac3de 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */ 15 */
16 16
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/cgroup.h> 19#include <linux/cgroup.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
@@ -22,33 +22,24 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25/* 25enum freezer_state {
26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is 26 CGROUP_THAWED = 0,
27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared 27 CGROUP_FREEZING,
28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING 28 CGROUP_FROZEN,
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
40}; 29};
41 30
42struct freezer { 31struct freezer {
43 struct cgroup_subsys_state css; 32 struct cgroup_subsys_state css;
44 unsigned int state; 33 enum freezer_state state;
45 spinlock_t lock; 34 spinlock_t lock; /* protects _writes_ to state */
46}; 35};
47 36
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 37static inline struct freezer *cgroup_freezer(
38 struct cgroup *cgroup)
49{ 39{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 40 return container_of(
51 struct freezer, css); 41 cgroup_subsys_state(cgroup, freezer_subsys_id),
42 struct freezer, css);
52} 43}
53 44
54static inline struct freezer *task_freezer(struct task_struct *task) 45static inline struct freezer *task_freezer(struct task_struct *task)
@@ -57,42 +48,93 @@ static inline struct freezer *task_freezer(struct task_struct *task)
57 struct freezer, css); 48 struct freezer, css);
58} 49}
59 50
60static struct freezer *parent_freezer(struct freezer *freezer) 51static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
61{ 52{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 53 enum freezer_state state = task_freezer(task)->state;
63 54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 55}
68 56
69bool cgroup_freezing(struct task_struct *task) 57int cgroup_freezing_or_frozen(struct task_struct *task)
70{ 58{
71 bool ret; 59 int result;
72 60 task_lock(task);
73 rcu_read_lock(); 61 result = __cgroup_freezing_or_frozen(task);
74 ret = task_freezer(task)->state & CGROUP_FREEZING; 62 task_unlock(task);
75 rcu_read_unlock(); 63 return result;
76
77 return ret;
78} 64}
79 65
80/* 66/*
81 * cgroups_write_string() limits the size of freezer state strings to 67 * cgroups_write_string() limits the size of freezer state strings to
82 * CGROUP_LOCAL_BUFFER_SIZE 68 * CGROUP_LOCAL_BUFFER_SIZE
83 */ 69 */
84static const char *freezer_state_strs(unsigned int state) 70static const char *freezer_state_strs[] = {
85{ 71 "THAWED",
86 if (state & CGROUP_FROZEN) 72 "FREEZING",
87 return "FROZEN"; 73 "FROZEN",
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
91}; 74};
92 75
76/*
77 * State diagram
78 * Transitions are caused by userspace writes to the freezer.state file.
79 * The values in parenthesis are state labels. The rest are edge labels.
80 *
81 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
 82 *    ^ ^                    |                     |
 83 *    | \_______THAWED_______/                     |
 84 *    \__________________________THAWED____________/
85 */
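
Since every transition in the diagram is driven by a userspace write to freezer.state, a controlling process can walk the whole diagram with two writes. Here is a minimal sketch; the mount point and group name are assumptions, as the real path depends on where the freezer hierarchy is mounted.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_state(const char *path, const char *state)
    {
            int fd = open(path, O_WRONLY);
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, state, strlen(state));
            close(fd);
            return n < 0 ? -1 : 0;
    }

    int main(void)
    {
            /* Hypothetical mount point and cgroup name. */
            const char *p = "/sys/fs/cgroup/freezer/demo/freezer.state";

            if (write_state(p, "FROZEN"))   /* THAWED -> FREEZING -> FROZEN */
                    perror("freeze");
            sleep(2);
            if (write_state(p, "THAWED"))   /* back to THAWED from either state */
                    perror("thaw");
            return 0;
    }

Reading freezer.state back returns one of the strings from freezer_state_strs, so userspace can poll for the FREEZING to FROZEN transition.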
86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89/* Locks taken and their ordering
90 * ------------------------------
91 * cgroup_mutex (AKA cgroup_lock)
92 * freezer->lock
93 * css_set_lock
94 * task->alloc_lock (AKA task_lock)
95 * task->sighand->siglock
96 *
97 * cgroup code forces css_set_lock to be taken before task->alloc_lock
98 *
99 * freezer_create(), freezer_destroy():
100 * cgroup_mutex [ by cgroup core ]
101 *
102 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach)
104 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing)
111 *
112 * freezer_read():
113 * cgroup_mutex
114 * freezer->lock
115 * write_lock css_set_lock (cgroup iterator start)
116 * task->alloc_lock
117 * read_lock css_set_lock (cgroup iterator start)
118 *
119 * freezer_write() (freeze):
120 * cgroup_mutex
121 * freezer->lock
122 * write_lock css_set_lock (cgroup iterator start)
123 * task->alloc_lock
124 * read_lock css_set_lock (cgroup iterator start)
125 * sighand->siglock (fake signal delivery inside freeze_task())
126 *
127 * freezer_write() (unfreeze):
128 * cgroup_mutex
129 * freezer->lock
130 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
134 * sighand->siglock
135 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
137 struct cgroup *cgroup)
96{ 138{
97 struct freezer *freezer; 139 struct freezer *freezer;
98 140
@@ -101,388 +143,255 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
101 return ERR_PTR(-ENOMEM); 143 return ERR_PTR(-ENOMEM);
102 144
103 spin_lock_init(&freezer->lock); 145 spin_lock_init(&freezer->lock);
146 freezer->state = CGROUP_THAWED;
104 return &freezer->css; 147 return &freezer->css;
105} 148}
106 149
107/** 150static void freezer_destroy(struct cgroup_subsys *ss,
108 * freezer_css_online - commit creation of a freezer cgroup 151 struct cgroup *cgroup)
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{ 152{
117 struct freezer *freezer = cgroup_freezer(cgroup); 153 kfree(cgroup_freezer(cgroup));
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
151{
152 struct freezer *freezer = cgroup_freezer(cgroup);
153
154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
157 atomic_dec(&system_freezing_cnt);
158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
162} 154}
163 155
164static void freezer_css_free(struct cgroup *cgroup) 156/* task is frozen or will freeze immediately when next it gets woken */
157static bool is_task_frozen_enough(struct task_struct *task)
165{ 158{
166 kfree(cgroup_freezer(cgroup)); 159 return frozen(task) ||
160 (task_is_stopped_or_traced(task) && freezing(task));
167} 161}
168 162
169/* 163/*
170 * Tasks can be migrated into a different freezer anytime regardless of its 164 * The call to cgroup_lock() in the freezer.state write method prevents
171 * current state. freezer_attach() is responsible for making new tasks 165 * a write to that file racing against an attach, and hence the
172 * conform to the current state. 166 * can_attach() result will remain valid until the attach completes.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
177 */ 167 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 168static int freezer_can_attach(struct cgroup_subsys *ss,
169 struct cgroup *new_cgroup,
170 struct task_struct *task)
179{ 171{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 172 struct freezer *freezer;
181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
185 173
186 /* 174 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 175 * Anything frozen can't move or be moved to/from.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 176 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
205 177
206 spin_unlock_irq(&freezer->lock); 178 freezer = cgroup_freezer(new_cgroup);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
207 181
208 /* 182 return 0;
209 * Propagate FROZEN clearing upwards. We may race with 183}
210 * update_if_frozen(), but as long as both work bottom-up, either 184
211 * update_if_frozen() sees child's FROZEN cleared or we clear the 185static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
212 * parent's FROZEN later. No parent w/ !FROZEN children can be 186{
213 * left FROZEN. 187 rcu_read_lock();
214 */ 188 if (__cgroup_freezing_or_frozen(tsk)) {
215 while (clear_frozen && (freezer = parent_freezer(freezer))) { 189 rcu_read_unlock();
216 spin_lock_irq(&freezer->lock); 190 return -EBUSY;
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 } 191 }
192 rcu_read_unlock();
193 return 0;
221} 194}
222 195
223static void freezer_fork(struct task_struct *task) 196static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
224{ 197{
225 struct freezer *freezer; 198 struct freezer *freezer;
226 199
200 /*
201 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this
204 * function call. Nevertheless, apply RCU read-side critical
205 * section to suppress RCU lockdep false positives.
206 */
227 rcu_read_lock(); 207 rcu_read_lock();
228 freezer = task_freezer(task); 208 freezer = task_freezer(task);
209 rcu_read_unlock();
229 210
230 /* 211 /*
231 * The root cgroup is non-freezable, so we can skip the 212 * The root cgroup is non-freezable, so we can skip the
232 * following check. 213 * following check.
233 */ 214 */
234 if (!freezer->css.cgroup->parent) 215 if (!freezer->css.cgroup->parent)
235 goto out; 216 return;
236 217
237 spin_lock_irq(&freezer->lock); 218 spin_lock_irq(&freezer->lock);
238 if (freezer->state & CGROUP_FREEZING) 219 BUG_ON(freezer->state == CGROUP_FROZEN);
239 freeze_task(task); 220
221 /* Locking avoids race with FREEZING -> THAWED transitions. */
222 if (freezer->state == CGROUP_FREEZING)
223 freeze_task(task, true);
240 spin_unlock_irq(&freezer->lock); 224 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
243} 225}
244 226
245/** 227/*
246 * update_if_frozen - update whether a cgroup finished freezing 228 * caller must hold freezer->lock
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
260 */ 229 */
261static void update_if_frozen(struct cgroup *cgroup) 230static void update_if_frozen(struct cgroup *cgroup,
231 struct freezer *freezer)
262{ 232{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
265 struct cgroup_iter it; 233 struct cgroup_iter it;
266 struct task_struct *task; 234 struct task_struct *task;
235 unsigned int nfrozen = 0, ntotal = 0;
236 enum freezer_state old_state = freezer->state;
267 237
268 WARN_ON_ONCE(!rcu_read_lock_held());
269
270 spin_lock_irq(&freezer->lock);
271
272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
283 }
284
285 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 238 cgroup_iter_start(cgroup, &it);
287
288 while ((task = cgroup_iter_next(cgroup, &it))) { 239 while ((task = cgroup_iter_next(cgroup, &it))) {
289 if (freezing(task)) { 240 ntotal++;
290 /* 241 if (is_task_frozen_enough(task))
291 * freezer_should_skip() indicates that the task 242 nfrozen++;
292 * should be skipped when determining freezing 243 }
293 * completion. Consider it frozen in addition to 244
294 * the usual frozen condition. 245 if (old_state == CGROUP_THAWED) {
295 */ 246 BUG_ON(nfrozen > 0);
296 if (!frozen(task) && !freezer_should_skip(task)) 247 } else if (old_state == CGROUP_FREEZING) {
297 goto out_iter_end; 248 if (nfrozen == ntotal)
298 } 249 freezer->state = CGROUP_FROZEN;
250 } else { /* old_state == CGROUP_FROZEN */
251 BUG_ON(nfrozen != ntotal);
299 } 252 }
300 253
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
303 cgroup_iter_end(cgroup, &it); 254 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
306} 255}
307 256
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 257static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
309 struct seq_file *m) 258 struct seq_file *m)
310{ 259{
311 struct cgroup *pos; 260 struct freezer *freezer;
312 261 enum freezer_state state;
313 rcu_read_lock();
314 262
315 /* update states bottom-up */ 263 if (!cgroup_lock_live_group(cgroup))
316 cgroup_for_each_descendant_post(pos, cgroup) 264 return -ENODEV;
317 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 265
320 rcu_read_unlock(); 266 freezer = cgroup_freezer(cgroup);
267 spin_lock_irq(&freezer->lock);
268 state = freezer->state;
269 if (state == CGROUP_FREEZING) {
270 /* We change from FREEZING to FROZEN lazily if the cgroup was
 271 * only partially frozen when we exited the write. */
272 update_if_frozen(cgroup, freezer);
273 state = freezer->state;
274 }
275 spin_unlock_irq(&freezer->lock);
276 cgroup_unlock();
321 277
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 278 seq_puts(m, freezer_state_strs[state]);
323 seq_putc(m, '\n'); 279 seq_putc(m, '\n');
324 return 0; 280 return 0;
325} 281}
326 282
327static void freeze_cgroup(struct freezer *freezer) 283static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
328{ 284{
329 struct cgroup *cgroup = freezer->css.cgroup;
330 struct cgroup_iter it; 285 struct cgroup_iter it;
331 struct task_struct *task; 286 struct task_struct *task;
287 unsigned int num_cant_freeze_now = 0;
332 288
289 freezer->state = CGROUP_FREEZING;
333 cgroup_iter_start(cgroup, &it); 290 cgroup_iter_start(cgroup, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 291 while ((task = cgroup_iter_next(cgroup, &it))) {
335 freeze_task(task); 292 if (!freeze_task(task, true))
293 continue;
294 if (is_task_frozen_enough(task))
295 continue;
296 if (!freezing(task) && !freezer_should_skip(task))
297 num_cant_freeze_now++;
298 }
336 cgroup_iter_end(cgroup, &it); 299 cgroup_iter_end(cgroup, &it);
300
301 return num_cant_freeze_now ? -EBUSY : 0;
337} 302}
338 303
339static void unfreeze_cgroup(struct freezer *freezer) 304static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
340{ 305{
341 struct cgroup *cgroup = freezer->css.cgroup;
342 struct cgroup_iter it; 306 struct cgroup_iter it;
343 struct task_struct *task; 307 struct task_struct *task;
344 308
345 cgroup_iter_start(cgroup, &it); 309 cgroup_iter_start(cgroup, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 310 while ((task = cgroup_iter_next(cgroup, &it))) {
347 __thaw_task(task); 311 thaw_process(task);
312 }
348 cgroup_iter_end(cgroup, &it); 313 cgroup_iter_end(cgroup, &it);
314
315 freezer->state = CGROUP_THAWED;
349} 316}
350 317
351/** 318static int freezer_change_state(struct cgroup *cgroup,
352 * freezer_apply_state - apply state change to a single cgroup_freezer 319 enum freezer_state goal_state)
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
362{ 320{
363 /* also synchronizes against task migration, see freezer_attach() */ 321 struct freezer *freezer;
364 lockdep_assert_held(&freezer->lock); 322 int retval = 0;
365 323
366 if (!(freezer->state & CGROUP_FREEZER_ONLINE)) 324 freezer = cgroup_freezer(cgroup);
367 return;
368 325
369 if (freeze) { 326 spin_lock_irq(&freezer->lock);
370 if (!(freezer->state & CGROUP_FREEZING))
371 atomic_inc(&system_freezing_cnt);
372 freezer->state |= state;
373 freeze_cgroup(freezer);
374 } else {
375 bool was_freezing = freezer->state & CGROUP_FREEZING;
376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
385 }
386}
387 327
388/** 328 update_if_frozen(cgroup, freezer);
389 * freezer_change_state - change the freezing state of a cgroup_freezer 329 if (goal_state == freezer->state)
390 * @freezer: freezer of interest 330 goto out;
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399 331
400 /* update @freezer */ 332 switch (goal_state) {
401 spin_lock_irq(&freezer->lock); 333 case CGROUP_THAWED:
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); 334 unfreeze_cgroup(cgroup, freezer);
335 break;
336 case CGROUP_FROZEN:
337 retval = try_to_freeze_cgroup(cgroup, freezer);
338 break;
339 default:
340 BUG();
341 }
342out:
403 spin_unlock_irq(&freezer->lock); 343 spin_unlock_irq(&freezer->lock);
404 344
405 /* 345 return retval;
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
426} 346}
427 347
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 348static int freezer_write(struct cgroup *cgroup,
349 struct cftype *cft,
429 const char *buffer) 350 const char *buffer)
430{ 351{
431 bool freeze; 352 int retval;
353 enum freezer_state goal_state;
432 354
433 if (strcmp(buffer, freezer_state_strs(0)) == 0) 355 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
434 freeze = false; 356 goal_state = CGROUP_THAWED;
435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 357 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
436 freeze = true; 358 goal_state = CGROUP_FROZEN;
437 else 359 else
438 return -EINVAL; 360 return -EINVAL;
439 361
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 362 if (!cgroup_lock_live_group(cgroup))
441 return 0; 363 return -ENODEV;
442} 364 retval = freezer_change_state(cgroup, goal_state);
443 365 cgroup_unlock();
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 366 return retval;
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 367}
457 368
458static struct cftype files[] = { 369static struct cftype files[] = {
459 { 370 {
460 .name = "state", 371 .name = "state",
461 .flags = CFTYPE_NOT_ON_ROOT,
462 .read_seq_string = freezer_read, 372 .read_seq_string = freezer_read,
463 .write_string = freezer_write, 373 .write_string = freezer_write,
464 }, 374 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
475 { } /* terminate */
476}; 375};
477 376
377static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
378{
379 if (!cgroup->parent)
380 return 0;
381 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
382}
383
478struct cgroup_subsys freezer_subsys = { 384struct cgroup_subsys freezer_subsys = {
479 .name = "freezer", 385 .name = "freezer",
480 .css_alloc = freezer_css_alloc, 386 .create = freezer_create,
481 .css_online = freezer_css_online, 387 .destroy = freezer_destroy,
482 .css_offline = freezer_css_offline, 388 .populate = freezer_populate,
483 .css_free = freezer_css_free,
484 .subsys_id = freezer_subsys_id, 389 .subsys_id = freezer_subsys_id,
485 .attach = freezer_attach, 390 .can_attach = freezer_can_attach,
391 .can_attach_task = freezer_can_attach_task,
392 .pre_attach = NULL,
393 .attach_task = NULL,
394 .attach = NULL,
486 .fork = freezer_fork, 395 .fork = freezer_fork,
487 .base_cftypes = files, 396 .exit = NULL,
488}; 397};
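
For context on the freezer interface reworked above: both the old and the new code expose a per-cgroup freezer.state file that accepts the strings FROZEN and THAWED. A minimal user-space sketch of driving that file follows; the cgroupfs mount point and the "demo" cgroup name are assumptions for illustration, not part of this patch.

/*
 * Hedged sketch: freeze and thaw a freezer cgroup from user space.
 * The mount point and cgroup name below are assumed for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int write_state(const char *path, const char *state)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, state, strlen(state));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* assumed cgroupfs layout */
	const char *path = "/sys/fs/cgroup/freezer/demo/freezer.state";

	if (write_state(path, "FROZEN"))	/* ask the kernel to freeze every task in "demo" */
		perror("freeze");
	sleep(1);
	if (write_state(path, "THAWED"))	/* thaw them again */
		perror("thaw");
	return 0;
}

Note that reading freezer.state back may report FREEZING for a while: in both versions shown above the FREEZING to FROZEN transition happens lazily in update_if_frozen(), once every task has been observed frozen.
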
diff --git a/kernel/compat.c b/kernel/compat.c
index f6150e92dfc..e2435ee9993 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,7 +21,6 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/export.h>
25#include <linux/migrate.h> 24#include <linux/migrate.h>
26#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
27#include <linux/times.h> 26#include <linux/times.h>
@@ -31,10 +30,11 @@
31#include <asm/uaccess.h> 30#include <asm/uaccess.h>
32 31
33/* 32/*
34 * Get/set struct timeval with struct timespec on the native side 33 * Note that the native side is already converted to a timespec, because
34 * that's what we want anyway.
35 */ 35 */
36static int compat_get_timeval_convert(struct timespec *o, 36static int compat_get_timeval(struct timespec *o,
37 struct compat_timeval __user *i) 37 struct compat_timeval __user *i)
38{ 38{
39 long usec; 39 long usec;
40 40
@@ -45,8 +45,8 @@ static int compat_get_timeval_convert(struct timespec *o,
45 return 0; 45 return 0;
46} 46}
47 47
48static int compat_put_timeval_convert(struct compat_timeval __user *o, 48static int compat_put_timeval(struct compat_timeval __user *o,
49 struct timeval *i) 49 struct timeval *i)
50{ 50{
51 return (put_user(i->tv_sec, &o->tv_sec) || 51 return (put_user(i->tv_sec, &o->tv_sec) ||
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
@@ -116,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
116 if (tv) { 116 if (tv) {
117 struct timeval ktv; 117 struct timeval ktv;
118 do_gettimeofday(&ktv); 118 do_gettimeofday(&ktv);
119 if (compat_put_timeval_convert(tv, &ktv)) 119 if (compat_put_timeval(tv, &ktv))
120 return -EFAULT; 120 return -EFAULT;
121 } 121 }
122 if (tz) { 122 if (tz) {
@@ -134,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
134 struct timezone ktz; 134 struct timezone ktz;
135 135
136 if (tv) { 136 if (tv) {
137 if (compat_get_timeval_convert(&kts, tv)) 137 if (compat_get_timeval(&kts, tv))
138 return -EFAULT; 138 return -EFAULT;
139 } 139 }
140 if (tz) { 140 if (tz) {
@@ -145,29 +145,12 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); 145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
146} 146}
147 147
148int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
149{
150 return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
151 __get_user(tv->tv_sec, &ctv->tv_sec) ||
152 __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
153}
154EXPORT_SYMBOL_GPL(get_compat_timeval);
155
156int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
157{
158 return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
159 __put_user(tv->tv_sec, &ctv->tv_sec) ||
160 __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
161}
162EXPORT_SYMBOL_GPL(put_compat_timeval);
163
164int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 148int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
165{ 149{
166 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 150 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
167 __get_user(ts->tv_sec, &cts->tv_sec) || 151 __get_user(ts->tv_sec, &cts->tv_sec) ||
168 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 152 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
169} 153}
170EXPORT_SYMBOL_GPL(get_compat_timespec);
171 154
172int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) 155int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
173{ 156{
@@ -177,42 +160,6 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
177} 160}
178EXPORT_SYMBOL_GPL(put_compat_timespec); 161EXPORT_SYMBOL_GPL(put_compat_timespec);
179 162
180int compat_get_timeval(struct timeval *tv, const void __user *utv)
181{
182 if (COMPAT_USE_64BIT_TIME)
183 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
184 else
185 return get_compat_timeval(tv, utv);
186}
187EXPORT_SYMBOL_GPL(compat_get_timeval);
188
189int compat_put_timeval(const struct timeval *tv, void __user *utv)
190{
191 if (COMPAT_USE_64BIT_TIME)
192 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
193 else
194 return put_compat_timeval(tv, utv);
195}
196EXPORT_SYMBOL_GPL(compat_put_timeval);
197
198int compat_get_timespec(struct timespec *ts, const void __user *uts)
199{
200 if (COMPAT_USE_64BIT_TIME)
201 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
202 else
203 return get_compat_timespec(ts, uts);
204}
205EXPORT_SYMBOL_GPL(compat_get_timespec);
206
207int compat_put_timespec(const struct timespec *ts, void __user *uts)
208{
209 if (COMPAT_USE_64BIT_TIME)
210 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
211 else
212 return put_compat_timespec(ts, uts);
213}
214EXPORT_SYMBOL_GPL(compat_put_timespec);
215
216static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
217{ 164{
218 struct compat_timespec __user *rmtp; 165 struct compat_timespec __user *rmtp;
@@ -372,54 +319,25 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
372 319
373#ifdef __ARCH_WANT_SYS_SIGPROCMASK 320#ifdef __ARCH_WANT_SYS_SIGPROCMASK
374 321
375/* 322asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
376 * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the 323 compat_old_sigset_t __user *oset)
377 * blocked set of signals to the supplied signal set
378 */
379static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
380{ 324{
381 memcpy(blocked->sig, &set, sizeof(set)); 325 old_sigset_t s;
382} 326 long ret;
383 327 mm_segment_t old_fs;
384asmlinkage long compat_sys_sigprocmask(int how,
385 compat_old_sigset_t __user *nset,
386 compat_old_sigset_t __user *oset)
387{
388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked;
390
391 old_set = current->blocked.sig[0];
392
393 if (nset) {
394 if (get_user(new_set, nset))
395 return -EFAULT;
396 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
397
398 new_blocked = current->blocked;
399
400 switch (how) {
401 case SIG_BLOCK:
402 sigaddsetmask(&new_blocked, new_set);
403 break;
404 case SIG_UNBLOCK:
405 sigdelsetmask(&new_blocked, new_set);
406 break;
407 case SIG_SETMASK:
408 compat_sig_setmask(&new_blocked, new_set);
409 break;
410 default:
411 return -EINVAL;
412 }
413
414 set_current_blocked(&new_blocked);
415 }
416
417 if (oset) {
418 if (put_user(old_set, oset))
419 return -EFAULT;
420 }
421 328
422 return 0; 329 if (set && get_user(s, set))
330 return -EFAULT;
331 old_fs = get_fs();
332 set_fs(KERNEL_DS);
333 ret = sys_sigprocmask(how,
334 set ? (old_sigset_t __user *) &s : NULL,
335 oset ? (old_sigset_t __user *) &s : NULL);
336 set_fs(old_fs);
337 if (ret == 0)
338 if (oset)
339 ret = put_user(s, oset);
340 return ret;
423} 341}
424 342
425#endif 343#endif
@@ -1073,7 +991,15 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) 991 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1074 return -EFAULT; 992 return -EFAULT;
1075 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
1076 return sigsuspend(&newset); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
995
996 current->saved_sigmask = current->blocked;
997 set_current_blocked(&newset);
998
999 current->state = TASK_INTERRUPTIBLE;
1000 schedule();
1001 set_restore_sigmask();
1002 return -ERESTARTNOHAND;
1077} 1003}
1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 1004#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1079 1005
@@ -1215,23 +1141,6 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1141 return 0;
1216} 1142}
1217 1143
1218#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
1219asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
1220 struct compat_timespec __user *interval)
1221{
1222 struct timespec t;
1223 int ret;
1224 mm_segment_t old_fs = get_fs();
1225
1226 set_fs(KERNEL_DS);
1227 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1228 set_fs(old_fs);
1229 if (put_compat_timespec(&t, interval))
1230 return -EFAULT;
1231 return ret;
1232}
1233#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
1234
1235/* 1144/*
1236 * Allocate user-space memory for the duration of a single system call, 1145 * Allocate user-space memory for the duration of a single system call,
1237 * in order to marshall parameters inside a compat thunk. 1146 * in order to marshall parameters inside a compat thunk.
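
The compat helpers above copy struct compat_timeval field by field rather than as one block, because the 32-bit user-space layout does not match the native one (and, in the *_convert variants, the native side additionally wants a timespec). A simplified sketch of just the width mismatch is below; the field types are illustrative, and the authoritative definitions live in <linux/compat.h>.

/*
 * Simplified sketch of the layout mismatch bridged by the helpers above.
 * Field widths are illustrative only; this shows the widening copy, not
 * the usec -> nsec conversion the kernel's *_convert helpers also do.
 */
#include <stdint.h>

struct compat_timeval_sketch {		/* what a 32-bit task hands in */
	int32_t tv_sec;
	int32_t tv_usec;
};

struct native_timeval_sketch {		/* native 64-bit representation */
	int64_t tv_sec;
	int64_t tv_usec;
};

/* Field-by-field widening copy, in the spirit of compat_get_timeval(). */
static void widen_timeval(struct native_timeval_sketch *dst,
			  const struct compat_timeval_sketch *src)
{
	dst->tv_sec = src->tv_sec;	/* each field is sign-extended 32 -> 64 bits */
	dst->tv_usec = src->tv_usec;
}
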
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
deleted file mode 100644
index e0e07fd5550..00000000000
--- a/kernel/context_tracking.c
+++ /dev/null
@@ -1,83 +0,0 @@
1#include <linux/context_tracking.h>
2#include <linux/rcupdate.h>
3#include <linux/sched.h>
4#include <linux/percpu.h>
5#include <linux/hardirq.h>
6
7struct context_tracking {
8 /*
9 * When active is false, hooks are not set to
10 * minimize overhead: TIF flags are cleared
11 * and calls to user_enter/exit are ignored. This
12 * may be further optimized using static keys.
13 */
14 bool active;
15 enum {
16 IN_KERNEL = 0,
17 IN_USER,
18 } state;
19};
20
21static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
22#ifdef CONFIG_CONTEXT_TRACKING_FORCE
23 .active = true,
24#endif
25};
26
27void user_enter(void)
28{
29 unsigned long flags;
30
31 /*
 32 * Some contexts may involve an exception occurring in an irq,
33 * leading to that nesting:
34 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
35 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
36 * helpers are enough to protect RCU uses inside the exception. So
37 * just return immediately if we detect we are in an IRQ.
38 */
39 if (in_interrupt())
40 return;
41
42 WARN_ON_ONCE(!current->mm);
43
44 local_irq_save(flags);
45 if (__this_cpu_read(context_tracking.active) &&
46 __this_cpu_read(context_tracking.state) != IN_USER) {
47 __this_cpu_write(context_tracking.state, IN_USER);
48 rcu_user_enter();
49 }
50 local_irq_restore(flags);
51}
52
53void user_exit(void)
54{
55 unsigned long flags;
56
57 /*
 58 * Some contexts may involve an exception occurring in an irq,
59 * leading to that nesting:
60 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
61 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
62 * helpers are enough to protect RCU uses inside the exception. So
63 * just return immediately if we detect we are in an IRQ.
64 */
65 if (in_interrupt())
66 return;
67
68 local_irq_save(flags);
69 if (__this_cpu_read(context_tracking.state) == IN_USER) {
70 __this_cpu_write(context_tracking.state, IN_KERNEL);
71 rcu_user_exit();
72 }
73 local_irq_restore(flags);
74}
75
76void context_tracking_task_switch(struct task_struct *prev,
77 struct task_struct *next)
78{
79 if (__this_cpu_read(context_tracking.active)) {
80 clear_tsk_thread_flag(prev, TIF_NOHZ);
81 set_tsk_thread_flag(next, TIF_NOHZ);
82 }
83}
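
The deleted context_tracking.c above is essentially a per-CPU two-state machine toggled on kernel/user boundary crossings. A stripped-down sketch of that state machine, with the per-CPU storage, irq masking and RCU hooks replaced by plain variables for clarity:

/*
 * Minimal sketch of the IN_KERNEL/IN_USER toggle from the removed file.
 * The real code keeps this state per CPU and calls rcu_user_enter()/
 * rcu_user_exit() at the transitions; both are elided here.
 */
enum ct_state { IN_KERNEL = 0, IN_USER };

struct ct_sketch {
	int active;		/* tracking enabled on this CPU? */
	enum ct_state state;
};

static void sketch_user_enter(struct ct_sketch *ct)
{
	if (ct->active && ct->state != IN_USER) {
		ct->state = IN_USER;
		/* real code calls rcu_user_enter() here */
	}
}

static void sketch_user_exit(struct ct_sketch *ct)
{
	if (ct->state == IN_USER) {
		ct->state = IN_KERNEL;
		/* real code calls rcu_user_exit() here */
	}
}
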
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242..eae3d9b3957 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,18 +10,13 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/oom.h> 13#include <linux/module.h>
14#include <linux/rcupdate.h>
15#include <linux/export.h>
16#include <linux/bug.h>
17#include <linux/kthread.h> 14#include <linux/kthread.h>
18#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
19#include <linux/mutex.h> 16#include <linux/mutex.h>
20#include <linux/gfp.h> 17#include <linux/gfp.h>
21#include <linux/suspend.h> 18#include <linux/suspend.h>
22 19
23#include "smpboot.h"
24
25#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
26/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
27static DEFINE_MUTEX(cpu_add_remove_lock); 22static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -80,10 +75,6 @@ void put_online_cpus(void)
80 if (cpu_hotplug.active_writer == current) 75 if (cpu_hotplug.active_writer == current)
81 return; 76 return;
82 mutex_lock(&cpu_hotplug.lock); 77 mutex_lock(&cpu_hotplug.lock);
83
84 if (WARN_ON(!cpu_hotplug.refcount))
85 cpu_hotplug.refcount++; /* try to fix things up */
86
87 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) 78 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
88 wake_up_process(cpu_hotplug.active_writer); 79 wake_up_process(cpu_hotplug.active_writer);
89 mutex_unlock(&cpu_hotplug.lock); 80 mutex_unlock(&cpu_hotplug.lock);
@@ -180,47 +171,6 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
180} 171}
181EXPORT_SYMBOL(unregister_cpu_notifier); 172EXPORT_SYMBOL(unregister_cpu_notifier);
182 173
183/**
184 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
185 * @cpu: a CPU id
186 *
187 * This function walks all processes, finds a valid mm struct for each one and
188 * then clears a corresponding bit in mm's cpumask. While this all sounds
189 * trivial, there are various non-obvious corner cases, which this function
190 * tries to solve in a safe manner.
191 *
192 * Also note that the function uses a somewhat relaxed locking scheme, so it may
193 * be called only for an already offlined CPU.
194 */
195void clear_tasks_mm_cpumask(int cpu)
196{
197 struct task_struct *p;
198
199 /*
200 * This function is called after the cpu is taken down and marked
 201 * offline, so it's not like new tasks will ever get this cpu set in
202 * their mm mask. -- Peter Zijlstra
203 * Thus, we may use rcu_read_lock() here, instead of grabbing
204 * full-fledged tasklist_lock.
205 */
206 WARN_ON(cpu_online(cpu));
207 rcu_read_lock();
208 for_each_process(p) {
209 struct task_struct *t;
210
211 /*
212 * Main thread might exit, but other threads may still have
213 * a valid mm. Find one.
214 */
215 t = find_lock_task_mm(p);
216 if (!t)
217 continue;
218 cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
219 task_unlock(t);
220 }
221 rcu_read_unlock();
222}
223
224static inline void check_for_tasks(int cpu) 174static inline void check_for_tasks(int cpu)
225{ 175{
226 struct task_struct *p; 176 struct task_struct *p;
@@ -228,7 +178,8 @@ static inline void check_for_tasks(int cpu)
228 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 179 for_each_process(p) {
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 181 (!cputime_eq(p->utime, cputime_zero) ||
182 !cputime_eq(p->stime, cputime_zero)))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 184 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 185 p->comm, task_pid_nr(p), cpu,
@@ -284,13 +235,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
284 __func__, cpu); 235 __func__, cpu);
285 goto out_release; 236 goto out_release;
286 } 237 }
287 smpboot_park_threads(cpu);
288 238
289 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 239 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
290 if (err) { 240 if (err) {
291 /* CPU didn't die: tell everyone. Can't complain. */ 241 /* CPU didn't die: tell everyone. Can't complain. */
292 smpboot_unpark_threads(cpu);
293 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 242 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
243
294 goto out_release; 244 goto out_release;
295 } 245 }
296 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
@@ -346,25 +296,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
346 int ret, nr_calls = 0; 296 int ret, nr_calls = 0;
347 void *hcpu = (void *)(long)cpu; 297 void *hcpu = (void *)(long)cpu;
348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 298 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
349 struct task_struct *idle;
350
351 cpu_hotplug_begin();
352
353 if (cpu_online(cpu) || !cpu_present(cpu)) {
354 ret = -EINVAL;
355 goto out;
356 }
357
358 idle = idle_thread_get(cpu);
359 if (IS_ERR(idle)) {
360 ret = PTR_ERR(idle);
361 goto out;
362 }
363 299
364 ret = smpboot_create_threads(cpu); 300 if (cpu_online(cpu) || !cpu_present(cpu))
365 if (ret) 301 return -EINVAL;
366 goto out;
367 302
303 cpu_hotplug_begin();
368 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 304 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
369 if (ret) { 305 if (ret) {
370 nr_calls--; 306 nr_calls--;
@@ -374,21 +310,17 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
374 } 310 }
375 311
376 /* Arch-specific enabling code. */ 312 /* Arch-specific enabling code. */
377 ret = __cpu_up(cpu, idle); 313 ret = __cpu_up(cpu);
378 if (ret != 0) 314 if (ret != 0)
379 goto out_notify; 315 goto out_notify;
380 BUG_ON(!cpu_online(cpu)); 316 BUG_ON(!cpu_online(cpu));
381 317
382 /* Wake the per cpu threads */
383 smpboot_unpark_threads(cpu);
384
385 /* Now call notifier in preparation. */ 318 /* Now call notifier in preparation. */
386 cpu_notify(CPU_ONLINE | mod, hcpu); 319 cpu_notify(CPU_ONLINE | mod, hcpu);
387 320
388out_notify: 321out_notify:
389 if (ret != 0) 322 if (ret != 0)
390 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 323 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
391out:
392 cpu_hotplug_done(); 324 cpu_hotplug_done();
393 325
394 return ret; 326 return ret;
@@ -430,7 +362,7 @@ int __cpuinit cpu_up(unsigned int cpu)
430 362
431 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 363 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
432 mutex_lock(&zonelists_mutex); 364 mutex_lock(&zonelists_mutex);
433 build_all_zonelists(NULL, NULL); 365 build_all_zonelists(NULL);
434 mutex_unlock(&zonelists_mutex); 366 mutex_unlock(&zonelists_mutex);
435 } 367 }
436#endif 368#endif
@@ -448,11 +380,18 @@ out:
448 cpu_maps_update_done(); 380 cpu_maps_update_done();
449 return err; 381 return err;
450} 382}
451EXPORT_SYMBOL_GPL(cpu_up);
452 383
453#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
454static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
455 386
387void __weak arch_disable_nonboot_cpus_begin(void)
388{
389}
390
391void __weak arch_disable_nonboot_cpus_end(void)
392{
393}
394
456int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
457{ 396{
458 int cpu, first_cpu, error = 0; 397 int cpu, first_cpu, error = 0;
@@ -464,6 +403,7 @@ int disable_nonboot_cpus(void)
464 * with the userspace trying to use the CPU hotplug at the same time 403 * with the userspace trying to use the CPU hotplug at the same time
465 */ 404 */
466 cpumask_clear(frozen_cpus); 405 cpumask_clear(frozen_cpus);
406 arch_disable_nonboot_cpus_begin();
467 407
468 printk("Disabling non-boot CPUs ...\n"); 408 printk("Disabling non-boot CPUs ...\n");
469 for_each_online_cpu(cpu) { 409 for_each_online_cpu(cpu) {
@@ -479,6 +419,8 @@ int disable_nonboot_cpus(void)
479 } 419 }
480 } 420 }
481 421
422 arch_disable_nonboot_cpus_end();
423
482 if (!error) { 424 if (!error) {
483 BUG_ON(num_online_cpus() > 1); 425 BUG_ON(num_online_cpus() > 1);
484 /* Make sure the CPUs won't be enabled by someone else */ 426 /* Make sure the CPUs won't be enabled by someone else */
@@ -528,7 +470,7 @@ out:
528 cpu_maps_update_done(); 470 cpu_maps_update_done();
529} 471}
530 472
531static int __init alloc_frozen_cpus(void) 473static int alloc_frozen_cpus(void)
532{ 474{
533 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
534 return -ENOMEM; 476 return -ENOMEM;
@@ -601,13 +543,8 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
601} 543}
602 544
603 545
604static int __init cpu_hotplug_pm_sync_init(void) 546int cpu_hotplug_pm_sync_init(void)
605{ 547{
606 /*
607 * cpu_hotplug_pm_callback has higher priority than x86
608 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
609 * to disable cpu hotplug to avoid cpu hotplug race.
610 */
611 pm_notifier(cpu_hotplug_pm_callback, 0); 548 pm_notifier(cpu_hotplug_pm_callback, 0);
612 return 0; 549 return 0;
613} 550}
@@ -731,3 +668,23 @@ void init_cpu_online(const struct cpumask *src)
731{ 668{
732 cpumask_copy(to_cpumask(cpu_online_bits), src); 669 cpumask_copy(to_cpumask(cpu_online_bits), src);
733} 670}
671
672static ATOMIC_NOTIFIER_HEAD(idle_notifier);
673
674void idle_notifier_register(struct notifier_block *n)
675{
676 atomic_notifier_chain_register(&idle_notifier, n);
677}
678EXPORT_SYMBOL_GPL(idle_notifier_register);
679
680void idle_notifier_unregister(struct notifier_block *n)
681{
682 atomic_notifier_chain_unregister(&idle_notifier, n);
683}
684EXPORT_SYMBOL_GPL(idle_notifier_unregister);
685
686void idle_notifier_call_chain(unsigned long val)
687{
688 atomic_notifier_call_chain(&idle_notifier, val, NULL);
689}
690EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
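
The cpu.c hunk above adds an atomic idle notifier chain (idle_notifier_register() and friends) without defining any events. A hedged sketch of a client is below; the DEMO_IDLE_START/DEMO_IDLE_END values are assumptions, since this patch leaves the event encoding to whatever arch code eventually calls idle_notifier_call_chain(), and the prototype location for the registration helpers is likewise assumed.

/*
 * Hedged sketch of a client of the idle notifier chain added above.
 * The event values are assumptions; only the chain itself comes from
 * this patch.
 */
#include <linux/init.h>
#include <linux/notifier.h>

#define DEMO_IDLE_START	1	/* assumed event value */
#define DEMO_IDLE_END	2	/* assumed event value */

static int demo_idle_notify(struct notifier_block *nb, unsigned long event,
			    void *data)
{
	switch (event) {
	case DEMO_IDLE_START:
		/* the CPU is about to go idle: quiesce polling work, etc. */
		break;
	case DEMO_IDLE_END:
		/* the CPU has left idle */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_idle_nb = {
	.notifier_call = demo_idle_notify,
};

static int __init demo_idle_init(void)
{
	idle_notifier_register(&demo_idle_nb);	/* exported by the hunk above */
	return 0;
}
core_initcall(demo_idle_init);
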
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
deleted file mode 100644
index 9656a3c3650..00000000000
--- a/kernel/cpu_pm.c
+++ /dev/null
@@ -1,233 +0,0 @@
1/*
2 * Copyright (C) 2011 Google, Inc.
3 *
4 * Author:
5 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/cpu_pm.h>
20#include <linux/module.h>
21#include <linux/notifier.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24
25static DEFINE_RWLOCK(cpu_pm_notifier_lock);
26static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
27
28static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
29{
30 int ret;
31
32 ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
33 nr_to_call, nr_calls);
34
35 return notifier_to_errno(ret);
36}
37
38/**
39 * cpu_pm_register_notifier - register a driver with cpu_pm
40 * @nb: notifier block to register
41 *
42 * Add a driver to a list of drivers that are notified about
43 * CPU and CPU cluster low power entry and exit.
44 *
45 * This function may sleep, and has the same return conditions as
46 * raw_notifier_chain_register.
47 */
48int cpu_pm_register_notifier(struct notifier_block *nb)
49{
50 unsigned long flags;
51 int ret;
52
53 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
54 ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
55 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
60
61/**
62 * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
63 * @nb: notifier block to be unregistered
64 *
65 * Remove a driver from the CPU PM notifier list.
66 *
67 * This function may sleep, and has the same return conditions as
68 * raw_notifier_chain_unregister.
69 */
70int cpu_pm_unregister_notifier(struct notifier_block *nb)
71{
72 unsigned long flags;
73 int ret;
74
75 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
76 ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
77 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
78
79 return ret;
80}
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82
83/**
84 * cpu_pm_enter - CPU low power entry notifier
85 *
86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset.
88 *
89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled.
95 *
96 * Return conditions are same as __raw_notifier_call_chain.
97 */
98int cpu_pm_enter(void)
99{
100 int nr_calls;
101 int ret = 0;
102
103 read_lock(&cpu_pm_notifier_lock);
104 ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
105 if (ret)
106 /*
 107 * Inform listeners (nr_calls - 1) about failure of CPU PM
 108 * entry, who were notified earlier to prepare for it.
109 */
110 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
111 read_unlock(&cpu_pm_notifier_lock);
112
113 return ret;
114}
115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116
117/**
118 * cpu_pm_exit - CPU low power exit notifier
119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset.
122 *
123 * Notified drivers can include VFP co-processor, interrupt controller
124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 *
127 * Return conditions are same as __raw_notifier_call_chain.
128 */
129int cpu_pm_exit(void)
130{
131 int ret;
132
133 read_lock(&cpu_pm_notifier_lock);
134 ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
135 read_unlock(&cpu_pm_notifier_lock);
136
137 return ret;
138}
139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140
141/**
142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset.
146 *
147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller
150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 *
153 * Must be called with interrupts disabled.
154 *
155 * Return conditions are same as __raw_notifier_call_chain.
156 */
157int cpu_cluster_pm_enter(void)
158{
159 int nr_calls;
160 int ret = 0;
161
162 read_lock(&cpu_pm_notifier_lock);
163 ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
164 if (ret)
165 /*
166 * Inform listeners (nr_calls - 1) about failure of CPU cluster
167 * PM entry who are notified earlier to prepare for it.
168 */
169 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
170 read_unlock(&cpu_pm_notifier_lock);
171
172 return ret;
173}
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175
176/**
177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 *
 179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain
181 * to reset.
182 *
 183 * Must be called after cpu_cluster_pm_enter has been called for the power
 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 *
189 * Return conditions are same as __raw_notifier_call_chain.
190 */
191int cpu_cluster_pm_exit(void)
192{
193 int ret;
194
195 read_lock(&cpu_pm_notifier_lock);
196 ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
197 read_unlock(&cpu_pm_notifier_lock);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
202
203#ifdef CONFIG_PM
204static int cpu_pm_suspend(void)
205{
206 int ret;
207
208 ret = cpu_pm_enter();
209 if (ret)
210 return ret;
211
212 ret = cpu_cluster_pm_enter();
213 return ret;
214}
215
216static void cpu_pm_resume(void)
217{
218 cpu_cluster_pm_exit();
219 cpu_pm_exit();
220}
221
222static struct syscore_ops cpu_pm_syscore_ops = {
223 .suspend = cpu_pm_suspend,
224 .resume = cpu_pm_resume,
225};
226
227static int cpu_pm_init(void)
228{
229 register_syscore_ops(&cpu_pm_syscore_ops);
230 return 0;
231}
232core_initcall(cpu_pm_init);
233#endif
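
The removed cpu_pm.c documents the notifier API that platform drivers hook to save and restore hardware state across CPU low-power transitions. A sketch of such a client, using the CPU_PM_ENTER/CPU_PM_EXIT/CPU_PM_ENTER_FAILED events named in the comments above; the demo_* names are placeholders, not real kernel functions.

/*
 * Hedged sketch of a cpu_pm client of the kind the removed file describes.
 * The save/restore calls are placeholders; interrupts are disabled while
 * the notifier runs, per the documentation above.
 */
#include <linux/cpu_pm.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int demo_cpu_pm_notify(struct notifier_block *nb, unsigned long action,
			      void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		/* save context the CPU's power domain may lose */
		/* demo_save_state(); */
		break;
	case CPU_PM_EXIT:
	case CPU_PM_ENTER_FAILED:
		/* restore context after an (attempted) low-power exit */
		/* demo_restore_state(); */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_pm_nb = {
	.notifier_call = demo_cpu_pm_notify,
};

static int __init demo_cpu_pm_init(void)
{
	return cpu_pm_register_notifier(&demo_cpu_pm_nb);
}
core_initcall(demo_cpu_pm_init);
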
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb..10131fdaff7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h> 39#include <linux/memory.h>
40#include <linux/export.h> 40#include <linux/module.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/namei.h> 42#include <linux/namei.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
@@ -123,19 +123,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 123 struct cpuset, css);
124} 124}
125 125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
139/* bits in struct cpuset flags field */ 126/* bits in struct cpuset flags field */
140typedef enum { 127typedef enum {
141 CS_CPU_EXCLUSIVE, 128 CS_CPU_EXCLUSIVE,
@@ -147,12 +134,6 @@ typedef enum {
147 CS_SPREAD_SLAB, 134 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 135} cpuset_flagbits_t;
149 136
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 137/* convenient tests for these bits */
157static inline int is_cpu_exclusive(const struct cpuset *cs) 138static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 139{
@@ -276,11 +257,11 @@ static struct file_system_type cpuset_fs_type = {
276 * are online. If none are online, walk up the cpuset hierarchy 257 * are online. If none are online, walk up the cpuset hierarchy
277 * until we find one that does have some online cpus. If we get 258 * until we find one that does have some online cpus. If we get
278 * all the way to the top and still haven't found any online cpus, 259 * all the way to the top and still haven't found any online cpus,
279 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing 260 * return cpu_online_map. Or if passed a NULL cs from an exit'ing
280 * task, return cpu_online_mask. 261 * task, return cpu_online_map.
281 * 262 *
282 * One way or another, we guarantee to return some non-empty subset 263 * One way or another, we guarantee to return some non-empty subset
283 * of cpu_online_mask. 264 * of cpu_online_map.
284 * 265 *
285 * Call with callback_mutex held. 266 * Call with callback_mutex held.
286 */ 267 */
@@ -302,10 +283,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 283 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 284 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 285 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_MEMORY]. 286 * found any online mems, return node_states[N_HIGH_MEMORY].
306 * 287 *
307 * One way or another, we guarantee to return some non-empty subset 288 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_MEMORY]. 289 * of node_states[N_HIGH_MEMORY].
309 * 290 *
310 * Call with callback_mutex held. 291 * Call with callback_mutex held.
311 */ 292 */
@@ -313,14 +294,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 294static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 295{
315 while (cs && !nodes_intersects(cs->mems_allowed, 296 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_MEMORY])) 297 node_states[N_HIGH_MEMORY]))
317 cs = cs->parent; 298 cs = cs->parent;
318 if (cs) 299 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 300 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_MEMORY]); 301 node_states[N_HIGH_MEMORY]);
321 else 302 else
322 *pmask = node_states[N_MEMORY]; 303 *pmask = node_states[N_HIGH_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); 304 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
324} 305}
325 306
326/* 307/*
@@ -873,7 +854,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
873 int retval; 854 int retval;
874 int is_load_balanced; 855 int is_load_balanced;
875 856
876 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 857 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
877 if (cs == &top_cpuset) 858 if (cs == &top_cpuset)
878 return -EACCES; 859 return -EACCES;
879 860
@@ -968,8 +949,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
968static void cpuset_change_task_nodemask(struct task_struct *tsk, 949static void cpuset_change_task_nodemask(struct task_struct *tsk,
969 nodemask_t *newmems) 950 nodemask_t *newmems)
970{ 951{
971 bool need_loop; 952repeat:
972
973 /* 953 /*
974 * Allow tasks that have access to memory reserves because they have 954 * Allow tasks that have access to memory reserves because they have
975 * been OOM killed to get memory anywhere. 955 * been OOM killed to get memory anywhere.
@@ -980,27 +960,46 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
980 return; 960 return;
981 961
982 task_lock(tsk); 962 task_lock(tsk);
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965
966
983 /* 967 /*
984 * Determine if a loop is necessary if another thread is doing 968 * ensure checking ->mems_allowed_change_disable after setting all new
985 * get_mems_allowed(). If at least one node remains unchanged and 969 * allowed nodes.
986 * tsk does not have a mempolicy, then an empty nodemask will not be 970 *
987 * possible when mems_allowed is larger than a word. 971 * the read-side task can see an nodemask with new allowed nodes and
972 * old allowed nodes. and if it allocates page when cpuset clears newly
973 * disallowed ones continuous, it can see the new allowed bits.
974 *
975 * And if setting all new allowed nodes is after the checking, setting
976 * all new allowed nodes and clearing newly disallowed ones will be done
977 * continuous, and the read-side task may find no node to alloc page.
988 */ 978 */
989 need_loop = task_has_mempolicy(tsk) || 979 smp_mb();
990 !nodes_intersects(*newmems, tsk->mems_allowed);
991 980
992 if (need_loop) 981 /*
993 write_seqcount_begin(&tsk->mems_allowed_seq); 982 * Allocation of memory is very fast, we needn't sleep when waiting
983 * for the read-side.
984 */
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk);
987 if (!task_curr(tsk))
988 yield();
989 goto repeat;
990 }
994 991
995 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 992 /*
996 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 993 * ensure checking ->mems_allowed_change_disable before clearing all new
994 * disallowed nodes.
995 *
996 * if clearing newly disallowed bits before the checking, the read-side
997 * task may find no node to alloc page.
998 */
999 smp_mb();
997 1000
998 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 1001 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
999 tsk->mems_allowed = *newmems; 1002 tsk->mems_allowed = *newmems;
1000
1001 if (need_loop)
1002 write_seqcount_end(&tsk->mems_allowed_seq);
1003
1004 task_unlock(tsk); 1003 task_unlock(tsk);
1005} 1004}
1006 1005
@@ -1100,7 +1099,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1099 return -ENOMEM;
1101 1100
1102 /* 1101 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1102 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1104 * it's read-only 1103 * it's read-only
1105 */ 1104 */
1106 if (cs == &top_cpuset) { 1105 if (cs == &top_cpuset) {
@@ -1122,7 +1121,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1121 goto done;
1123 1122
1124 if (!nodes_subset(trialcs->mems_allowed, 1123 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_MEMORY])) { 1124 node_states[N_HIGH_MEMORY])) {
1126 retval = -EINVAL; 1125 retval = -EINVAL;
1127 goto done; 1126 goto done;
1128 } 1127 }
@@ -1368,71 +1367,79 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1367 return val;
1369} 1368}
1370 1369
1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1372 struct task_struct *tsk)
1373{
1374 struct cpuset *cs = cgroup_cs(cont);
1375
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1377 return -ENOSPC;
1378
1379 /*
1380 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1381 * cannot change their cpu affinity and isolating such threads by their
1382 * set of allowed nodes is unnecessary. Thus, cpusets are not
1383 * applicable for such threads. This prevents checking for success of
1384 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1385 * be changed.
1386 */
1387 if (tsk->flags & PF_THREAD_BOUND)
1388 return -EINVAL;
1389
1390 return 0;
1391}
1392
1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1394{
1395 return security_task_setscheduler(task);
1396}
1397
1371/* 1398/*
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because 1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must 1400 * dynamically allocating them is not allowed in pre_attach, and they must
1374 * persist until attach. 1401 * persist among pre_attach, attach_task, and attach.
1375 */ 1402 */
1376static cpumask_var_t cpus_attach; 1403static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from; 1404static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to; 1405static nodemask_t cpuset_attach_nodemask_to;
1379 1406
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1407/* Set-up work for before attaching each task. */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1408static void cpuset_pre_attach(struct cgroup *cont)
1382{ 1409{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1410 struct cpuset *cs = cgroup_cs(cont);
1384 struct task_struct *task;
1385 int ret;
1386
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC;
1389
1390 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /*
1392 * Kthreads bound to specific cpus cannot be moved to a new
1393 * cpuset; we cannot change their cpu affinity and
1394 * isolating such threads by their set of allowed nodes is
1395 * unnecessary. Thus, cpusets are not applicable for such
1396 * threads. This prevents checking for success of
1397 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed.
1399 */
1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL;
1402 if ((ret = security_task_setscheduler(task)))
1403 return ret;
1404 }
1405 1411
1406 /* prepare for attach */
1407 if (cs == &top_cpuset) 1412 if (cs == &top_cpuset)
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1409 else 1414 else
1410 guarantee_online_cpus(cs, cpus_attach); 1415 guarantee_online_cpus(cs, cpus_attach);
1411 1416
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1413
1414 return 0;
1415} 1418}
1416 1419
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1418{ 1422{
1419 struct mm_struct *mm; 1423 int err;
1420 struct task_struct *task; 1424 struct cpuset *cs = cgroup_cs(cont);
1421 struct task_struct *leader = cgroup_taskset_first(tset);
1422 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1423 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1425
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1426 /*
1427 /* 1427 * can_attach beforehand should guarantee that this doesn't fail.
1428 * can_attach beforehand should guarantee that this doesn't 1428 * TODO: have a better way to handle failure here
1429 * fail. TODO: have a better way to handle failure here 1429 */
1430 */ 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1431 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); 1431 WARN_ON_ONCE(err);
1432 1432
1433 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1434 cpuset_update_task_spread_flag(cs, task); 1434 cpuset_update_task_spread_flag(cs, tsk);
1435 } 1435}
1436
1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1438 struct cgroup *oldcont, struct task_struct *tsk)
1439{
1440 struct mm_struct *mm;
1441 struct cpuset *cs = cgroup_cs(cont);
1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1436 1443
1437 /* 1444 /*
1438 * Change mm, possibly for multiple threads in a threadgroup. This is 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1440,7 +1447,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1440 */ 1447 */
1441 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1442 cpuset_attach_nodemask_to = cs->mems_allowed; 1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1443 mm = get_task_mm(leader); 1450 mm = get_task_mm(tsk);
1444 if (mm) { 1451 if (mm) {
1445 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1446 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
@@ -1771,33 +1778,84 @@ static struct cftype files[] = {
1771 .write_u64 = cpuset_write_u64, 1778 .write_u64 = cpuset_write_u64,
1772 .private = FILE_SPREAD_SLAB, 1779 .private = FILE_SPREAD_SLAB,
1773 }, 1780 },
1781};
1774 1782
1775 { 1783static struct cftype cft_memory_pressure_enabled = {
1776 .name = "memory_pressure_enabled", 1784 .name = "memory_pressure_enabled",
1777 .flags = CFTYPE_ONLY_ON_ROOT, 1785 .read_u64 = cpuset_read_u64,
1778 .read_u64 = cpuset_read_u64, 1786 .write_u64 = cpuset_write_u64,
1779 .write_u64 = cpuset_write_u64, 1787 .private = FILE_MEMORY_PRESSURE_ENABLED,
1780 .private = FILE_MEMORY_PRESSURE_ENABLED,
1781 },
1782
1783 { } /* terminate */
1784}; 1788};
1785 1789
1790static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1791{
1792 int err;
1793
1794 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1795 if (err)
1796 return err;
1797 /* memory_pressure_enabled is in root cpuset only */
1798 if (!cont->parent)
1799 err = cgroup_add_file(cont, ss,
1800 &cft_memory_pressure_enabled);
1801 return err;
1802}
1803
1786/* 1804/*
1787 * cpuset_css_alloc - allocate a cpuset css 1805 * post_clone() is called during cgroup_create() when the
1806 * clone_children mount argument was specified. The cgroup
1807 * can not yet have any tasks.
1808 *
1809 * Currently we refuse to set up the cgroup - thereby
1810 * refusing the task to be entered, and as a result refusing
1811 * the sys_unshare() or clone() which initiated it - if any
1812 * sibling cpusets have exclusive cpus or mem.
1813 *
1814 * If this becomes a problem for some users who wish to
1815 * allow that scenario, then cpuset_post_clone() could be
1816 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1817 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1818 * held.
1819 */
1820static void cpuset_post_clone(struct cgroup_subsys *ss,
1821 struct cgroup *cgroup)
1822{
1823 struct cgroup *parent, *child;
1824 struct cpuset *cs, *parent_cs;
1825
1826 parent = cgroup->parent;
1827 list_for_each_entry(child, &parent->children, sibling) {
1828 cs = cgroup_cs(child);
1829 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1830 return;
1831 }
1832 cs = cgroup_cs(cgroup);
1833 parent_cs = cgroup_cs(parent);
1834
1835 mutex_lock(&callback_mutex);
1836 cs->mems_allowed = parent_cs->mems_allowed;
1837 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1838 mutex_unlock(&callback_mutex);
1839 return;
1840}
1841
1842/*
1843 * cpuset_create - create a cpuset
1844 * ss: cpuset cgroup subsystem
1788 * cont: control group that the new cpuset will be part of 1845 * cont: control group that the new cpuset will be part of
1789 */ 1846 */
1790 1847
1791static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1848static struct cgroup_subsys_state *cpuset_create(
1849 struct cgroup_subsys *ss,
1850 struct cgroup *cont)
1792{ 1851{
1793 struct cgroup *parent_cg = cont->parent; 1852 struct cpuset *cs;
1794 struct cgroup *tmp_cg; 1853 struct cpuset *parent;
1795 struct cpuset *parent, *cs;
1796 1854
1797 if (!parent_cg) 1855 if (!cont->parent) {
1798 return &top_cpuset.css; 1856 return &top_cpuset.css;
1799 parent = cgroup_cs(parent_cg); 1857 }
1800 1858 parent = cgroup_cs(cont->parent);
1801 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1859 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1802 if (!cs) 1860 if (!cs)
1803 return ERR_PTR(-ENOMEM); 1861 return ERR_PTR(-ENOMEM);
@@ -1819,36 +1877,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1819 1877
1820 cs->parent = parent; 1878 cs->parent = parent;
1821 number_of_cpusets++; 1879 number_of_cpusets++;
1822 1880 return &cs->css ;
1823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
1824 goto skip_clone;
1825
1826 /*
1827 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1828 * set. This flag handling is implemented in cgroup core for
1829	 * historical reasons - the flag may be specified during mount.
1830 *
1831 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1832 * refuse to clone the configuration - thereby refusing the task to
1833 * be entered, and as a result refusing the sys_unshare() or
1834 * clone() which initiated it. If this becomes a problem for some
1835 * users who wish to allow that scenario, then this could be
1836 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1837 * (and likewise for mems) to the new cgroup.
1838 */
1839 list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
1840 struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
1841
1842 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
1843 goto skip_clone;
1844 }
1845
1846 mutex_lock(&callback_mutex);
1847 cs->mems_allowed = parent->mems_allowed;
1848 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1849 mutex_unlock(&callback_mutex);
1850skip_clone:
1851 return &cs->css;
1852} 1881}
1853 1882
1854/* 1883/*
@@ -1857,7 +1886,7 @@ skip_clone:
1857 * will call async_rebuild_sched_domains(). 1886 * will call async_rebuild_sched_domains().
1858 */ 1887 */
1859 1888
1860static void cpuset_css_free(struct cgroup *cont) 1889static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1861{ 1890{
1862 struct cpuset *cs = cgroup_cs(cont); 1891 struct cpuset *cs = cgroup_cs(cont);
1863 1892
@@ -1871,12 +1900,16 @@ static void cpuset_css_free(struct cgroup *cont)
1871 1900
1872struct cgroup_subsys cpuset_subsys = { 1901struct cgroup_subsys cpuset_subsys = {
1873 .name = "cpuset", 1902 .name = "cpuset",
1874 .css_alloc = cpuset_css_alloc, 1903 .create = cpuset_create,
1875 .css_free = cpuset_css_free, 1904 .destroy = cpuset_destroy,
1876 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1877 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1910 .populate = cpuset_populate,
1911 .post_clone = cpuset_post_clone,
1878 .subsys_id = cpuset_subsys_id, 1912 .subsys_id = cpuset_subsys_id,
1879 .base_cftypes = files,
1880 .early_init = 1, 1913 .early_init = 1,
1881}; 1914};
1882 1915
@@ -1988,36 +2021,8 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1988} 2021}
1989 2022
1990/* 2023/*
1991 * Helper function to traverse cpusets. 2024 * Walk the specified cpuset subtree and look for empty cpusets.
1992 * It can be used to walk the cpuset tree from top to bottom, completing 2025 * The tasks of such cpuset must be moved to a parent cpuset.
1993 * one layer before dropping down to the next (thus always processing a
1994 * node before any of its children).
1995 */
1996static struct cpuset *cpuset_next(struct list_head *queue)
1997{
1998 struct cpuset *cp;
1999 struct cpuset *child; /* scans child cpusets of cp */
2000 struct cgroup *cont;
2001
2002 if (list_empty(queue))
2003 return NULL;
2004
2005 cp = list_first_entry(queue, struct cpuset, stack_list);
2006 list_del(queue->next);
2007 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2008 child = cgroup_cs(cont);
2009 list_add_tail(&child->stack_list, queue);
2010 }
2011
2012 return cp;
2013}
2014
2015
2016/*
2017 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
2018 * online/offline) and update the cpusets accordingly.
2019 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2020 * cpuset must be moved to a parent cpuset.
2021 * 2026 *
2022 * Called with cgroup_mutex held. We take callback_mutex to modify 2027 * Called with cgroup_mutex held. We take callback_mutex to modify
2023 * cpus_allowed and mems_allowed. 2028 * cpus_allowed and mems_allowed.
@@ -2026,61 +2031,50 @@ static struct cpuset *cpuset_next(struct list_head *queue)
2026 * before dropping down to the next. It always processes a node before 2031 * before dropping down to the next. It always processes a node before
2027 * any of its children. 2032 * any of its children.
2028 * 2033 *
2029 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY 2034 * For now, since we lack memory hot unplug, we'll never see a cpuset
2030 * if all present pages from a node are offlined. 2035 * that has tasks along with an empty 'mems'. But if we did see such
2036 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
2031 */ 2037 */
2032static void 2038static void scan_for_empty_cpusets(struct cpuset *root)
2033scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2034{ 2039{
2035 LIST_HEAD(queue); 2040 LIST_HEAD(queue);
2036 struct cpuset *cp; /* scans cpusets being updated */ 2041 struct cpuset *cp; /* scans cpusets being updated */
2042 struct cpuset *child; /* scans child cpusets of cp */
2043 struct cgroup *cont;
2037 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2044 static nodemask_t oldmems; /* protected by cgroup_mutex */
2038 2045
2039 list_add_tail((struct list_head *)&root->stack_list, &queue); 2046 list_add_tail((struct list_head *)&root->stack_list, &queue);
2040 2047
2041 switch (event) { 2048 while (!list_empty(&queue)) {
2042 case CPUSET_CPU_OFFLINE: 2049 cp = list_first_entry(&queue, struct cpuset, stack_list);
2043 while ((cp = cpuset_next(&queue)) != NULL) { 2050 list_del(queue.next);
2044 2051 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2045 /* Continue past cpusets with all cpus online */ 2052 child = cgroup_cs(cont);
2046 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2053 list_add_tail(&child->stack_list, &queue);
2047 continue;
2048
2049 /* Remove offline cpus from this cpuset. */
2050 mutex_lock(&callback_mutex);
2051 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2052 cpu_active_mask);
2053 mutex_unlock(&callback_mutex);
2054
2055 /* Move tasks from the empty cpuset to a parent */
2056 if (cpumask_empty(cp->cpus_allowed))
2057 remove_tasks_in_empty_cpuset(cp);
2058 else
2059 update_tasks_cpumask(cp, NULL);
2060 } 2054 }
2061 break;
2062 2055
2063 case CPUSET_MEM_OFFLINE: 2056 /* Continue past cpusets with all cpus, mems online */
2064 while ((cp = cpuset_next(&queue)) != NULL) { 2057 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2065 2058 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2066 /* Continue past cpusets with all mems online */ 2059 continue;
2067 if (nodes_subset(cp->mems_allowed,
2068 node_states[N_MEMORY]))
2069 continue;
2070 2060
2071 oldmems = cp->mems_allowed; 2061 oldmems = cp->mems_allowed;
2072 2062
2073 /* Remove offline mems from this cpuset. */ 2063 /* Remove offline cpus and mems from this cpuset. */
2074 mutex_lock(&callback_mutex); 2064 mutex_lock(&callback_mutex);
2075 nodes_and(cp->mems_allowed, cp->mems_allowed, 2065 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2076 node_states[N_MEMORY]); 2066 cpu_active_mask);
2077 mutex_unlock(&callback_mutex); 2067 nodes_and(cp->mems_allowed, cp->mems_allowed,
2068 node_states[N_HIGH_MEMORY]);
2069 mutex_unlock(&callback_mutex);
2078 2070
2079 /* Move tasks from the empty cpuset to a parent */ 2071 /* Move tasks from the empty cpuset to a parent */
2080 if (nodes_empty(cp->mems_allowed)) 2072 if (cpumask_empty(cp->cpus_allowed) ||
2081 remove_tasks_in_empty_cpuset(cp); 2073 nodes_empty(cp->mems_allowed))
2082 else 2074 remove_tasks_in_empty_cpuset(cp);
2083 update_tasks_nodemask(cp, &oldmems, NULL); 2075 else {
2076 update_tasks_cpumask(cp, NULL);
2077 update_tasks_nodemask(cp, &oldmems, NULL);
2084 } 2078 }
2085 } 2079 }
2086} 2080}
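
Both the removed cpuset_next() helper and the restored scan_for_empty_cpusets() walk the cpuset hierarchy top-down with a simple queue, so a parent is always processed before any of its children. A minimal userspace model of that traversal, with an invented node type and a small fixed-size queue that is only big enough for the example:

#include <stdio.h>

struct node {
        const char *name;
        struct node *child[4];
        int nr_child;
};

static void walk_top_down(struct node *root)
{
        struct node *queue[32];         /* fixed queue, plenty for this example */
        int head = 0, tail = 0;

        queue[tail++] = root;
        while (head < tail) {
                struct node *n = queue[head++];

                printf("visiting %s\n", n->name);       /* parent handled first */
                for (int i = 0; i < n->nr_child; i++)
                        queue[tail++] = n->child[i];    /* children go to the back */
        }
}

int main(void)
{
        struct node b = { "child_b", { 0 }, 0 };
        struct node a = { "child_a", { 0 }, 0 };
        struct node top = { "top_cpuset", { &a, &b }, 2 };

        walk_top_down(&top);
        return 0;
}
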
@@ -2091,19 +2085,13 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2091 * (of no effect) on systems that are actively using CPU hotplug	 2085 * (of no effect) on systems that are actively using CPU hotplug
2092 * but making no active use of cpusets. 2086 * but making no active use of cpusets.
2093 * 2087 *
2094 * The only exception to this is suspend/resume, where we don't
2095 * modify cpusets at all.
2096 *
2097 * This routine ensures that top_cpuset.cpus_allowed tracks 2088 * This routine ensures that top_cpuset.cpus_allowed tracks
2098 * cpu_active_mask on each CPU hotplug (cpuhp) event. 2089 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2099 * 2090 *
2100 * Called within get_online_cpus(). Needs to call cgroup_lock() 2091 * Called within get_online_cpus(). Needs to call cgroup_lock()
2101 * before calling generate_sched_domains(). 2092 * before calling generate_sched_domains().
2102 *
2103 * @cpu_online: Indicates whether this is a CPU online event (true) or
2104 * a CPU offline event (false).
2105 */ 2093 */
2106void cpuset_update_active_cpus(bool cpu_online) 2094void cpuset_update_active_cpus(void)
2107{ 2095{
2108 struct sched_domain_attr *attr; 2096 struct sched_domain_attr *attr;
2109 cpumask_var_t *doms; 2097 cpumask_var_t *doms;
@@ -2113,10 +2101,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2113 mutex_lock(&callback_mutex); 2101 mutex_lock(&callback_mutex);
2114 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2102 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2115 mutex_unlock(&callback_mutex); 2103 mutex_unlock(&callback_mutex);
2116 2104 scan_for_empty_cpusets(&top_cpuset);
2117 if (!cpu_online)
2118 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
2119
2120 ndoms = generate_sched_domains(&doms, &attr); 2105 ndoms = generate_sched_domains(&doms, &attr);
2121 cgroup_unlock(); 2106 cgroup_unlock();
2122 2107
@@ -2126,9 +2111,9 @@ void cpuset_update_active_cpus(bool cpu_online)
2126 2111
2127#ifdef CONFIG_MEMORY_HOTPLUG 2112#ifdef CONFIG_MEMORY_HOTPLUG
2128/* 2113/*
2129 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2114 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
2130 * Call this routine anytime after node_states[N_MEMORY] changes. 2115 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2131 * See cpuset_update_active_cpus() for CPU hotplug handling. 2116 * See also the previous routine cpuset_track_online_cpus().
2132 */ 2117 */
2133static int cpuset_track_online_nodes(struct notifier_block *self, 2118static int cpuset_track_online_nodes(struct notifier_block *self,
2134 unsigned long action, void *arg) 2119 unsigned long action, void *arg)
@@ -2140,16 +2125,16 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2140 case MEM_ONLINE: 2125 case MEM_ONLINE:
2141 oldmems = top_cpuset.mems_allowed; 2126 oldmems = top_cpuset.mems_allowed;
2142 mutex_lock(&callback_mutex); 2127 mutex_lock(&callback_mutex);
2143 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2128 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2144 mutex_unlock(&callback_mutex); 2129 mutex_unlock(&callback_mutex);
2145 update_tasks_nodemask(&top_cpuset, &oldmems, NULL); 2130 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2146 break; 2131 break;
2147 case MEM_OFFLINE: 2132 case MEM_OFFLINE:
2148 /* 2133 /*
2149 * needn't update top_cpuset.mems_allowed explicitly because 2134 * needn't update top_cpuset.mems_allowed explicitly because
2150 * scan_cpusets_upon_hotplug() will update it. 2135 * scan_for_empty_cpusets() will update it.
2151 */ 2136 */
2152 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); 2137 scan_for_empty_cpusets(&top_cpuset);
2153 break; 2138 break;
2154 default: 2139 default:
2155 break; 2140 break;
@@ -2169,7 +2154,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2169void __init cpuset_init_smp(void) 2154void __init cpuset_init_smp(void)
2170{ 2155{
2171 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2156 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2172 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2157 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2173 2158
2174 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2159 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2175 2160
@@ -2184,7 +2169,7 @@ void __init cpuset_init_smp(void)
2184 * 2169 *
2185 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset 2170 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2186 * attached to the specified @tsk. Guaranteed to return some non-empty 2171 * attached to the specified @tsk. Guaranteed to return some non-empty
2187 * subset of cpu_online_mask, even if this means going outside the 2172 * subset of cpu_online_map, even if this means going outside the
2188 * tasks cpuset. 2173 * tasks cpuset.
2189 **/ 2174 **/
2190 2175
@@ -2197,9 +2182,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2197 mutex_unlock(&callback_mutex); 2182 mutex_unlock(&callback_mutex);
2198} 2183}
2199 2184
2200void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2185int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2201{ 2186{
2202 const struct cpuset *cs; 2187 const struct cpuset *cs;
2188 int cpu;
2203 2189
2204 rcu_read_lock(); 2190 rcu_read_lock();
2205 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
@@ -2220,10 +2206,22 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2220 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2206 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2221 * set any mask even if it is not right from task_cs() pov, 2207 * set any mask even if it is not right from task_cs() pov,
2222 * the pending set_cpus_allowed_ptr() will fix things. 2208 * the pending set_cpus_allowed_ptr() will fix things.
2223 *
2224 * select_fallback_rq() will fix things ups and set cpu_possible_mask
2225 * if required.
2226 */ 2209 */
2210
2211 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2212 if (cpu >= nr_cpu_ids) {
2213 /*
2214 * Either tsk->cpus_allowed is wrong (see above) or it
2215 * is actually empty. The latter case is only possible
2216 * if we are racing with remove_tasks_in_empty_cpuset().
2217 * Like above we can temporary set any mask and rely on
2218 * set_cpus_allowed_ptr() as synchronization point.
2219 */
2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2221 cpu = cpumask_any(cpu_active_mask);
2222 }
2223
2224 return cpu;
2227} 2225}
2228 2226
2229void cpuset_init_current_mems_allowed(void) 2227void cpuset_init_current_mems_allowed(void)
@@ -2237,7 +2235,7 @@ void cpuset_init_current_mems_allowed(void)
2237 * 2235 *
2238 * Description: Returns the nodemask_t mems_allowed of the cpuset 2236 * Description: Returns the nodemask_t mems_allowed of the cpuset
2239 * attached to the specified @tsk. Guaranteed to return some non-empty 2237 * attached to the specified @tsk. Guaranteed to return some non-empty
2240 * subset of node_states[N_MEMORY], even if this means going outside the 2238 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
2241 * tasks cpuset. 2239 * tasks cpuset.
2242 **/ 2240 **/
2243 2241
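
The last cpuset.c hunk above turns cpuset_cpus_allowed_fallback() into a function that returns a CPU: intersect the task's mask with the active CPUs, and only if that intersection is empty widen the mask and pick any active CPU. A hedged userspace sketch of just that selection logic, with plain bitmasks and ffs() standing in for the cpumask helpers:

#include <stdio.h>
#include <strings.h>

static int pick_fallback_cpu(unsigned int *task_mask,
                             unsigned int active_mask,
                             unsigned int possible_mask)
{
        int cpu = ffs(*task_mask & active_mask);        /* 1-based, 0 if empty */

        if (!cpu) {
                /* Stale or empty mask: widen it, then pick any active CPU. */
                *task_mask = possible_mask;
                cpu = ffs(active_mask);
        }
        return cpu - 1;                                 /* back to 0-based */
}

int main(void)
{
        /* CPUs 2-3 allowed but only CPUs 0-1 online: expect fallback to CPU 0. */
        unsigned int mask = 0xc;

        printf("fallback cpu = %d\n", pick_fallback_cpu(&mask, 0x3, 0xf));
        printf("task mask widened to 0x%x\n", mask);
        return 0;
}
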
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b..5f85690285d 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
2#include <linux/crash_dump.h> 2#include <linux/crash_dump.h>
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/errno.h> 4#include <linux/errno.h>
5#include <linux/export.h> 5#include <linux/module.h>
6 6
7/* 7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need 8 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -20,15 +20,8 @@ unsigned long saved_max_pfn;
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21 21
22/* 22/*
23 * stores the size of elf header of crash image
24 */
25unsigned long long elfcorehdr_size;
26
27/*
28 * elfcorehdr= specifies the location of elf core header stored by the crashed 23 * elfcorehdr= specifies the location of elf core header stored by the crashed
29 * kernel. This option will be passed by kexec loader to the capture kernel. 24 * kernel. This option will be passed by kexec loader to the capture kernel.
30 *
31 * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
32 */ 25 */
33static int __init setup_elfcorehdr(char *arg) 26static int __init setup_elfcorehdr(char *arg)
34{ 27{
@@ -36,10 +29,6 @@ static int __init setup_elfcorehdr(char *arg)
36 if (!arg) 29 if (!arg)
37 return -EINVAL; 30 return -EINVAL;
38 elfcorehdr_addr = memparse(arg, &end); 31 elfcorehdr_addr = memparse(arg, &end);
39 if (*end == '@') {
40 elfcorehdr_size = elfcorehdr_addr;
41 elfcorehdr_addr = memparse(end + 1, &end);
42 }
43 return end > arg ? 0 : -EINVAL; 32 return end > arg ? 0 : -EINVAL;
44} 33}
45early_param("elfcorehdr", setup_elfcorehdr); 34early_param("elfcorehdr", setup_elfcorehdr);
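
The removed crash_dump.c lines implemented the size@offset form of the boot parameter, i.e. elfcorehdr=[size[KMG]@]offset[KMG], on top of memparse(). The sketch below is a standalone approximation of that parsing; parse_size() is an invented stand-in for memparse(), not the kernel function:

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {                /* honour K/M/G suffixes, like memparse() */
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10; (*end)++; break;
        }
        return v;
}

int main(void)
{
        const char *arg = "64K@0x2000000";      /* elfcorehdr=[size[KMG]@]offset[KMG] */
        char *end;
        unsigned long long size = 0, addr;

        addr = parse_size(arg, &end);
        if (*end == '@') {                      /* first number was really the size */
                size = addr;
                addr = parse_size(end + 1, &end);
        }
        printf("elf core header: addr=0x%llx size=%llu\n", addr, size);
        return 0;
}

With the example argument this prints addr=0x2000000 size=65536, which is the split the removed branch performed before handing elfcorehdr_addr and elfcorehdr_size to the capture kernel.
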
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7d..8ef31f53c44 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11#include <linux/export.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
@@ -16,7 +16,6 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/binfmts.h>
20#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
21 20
22#if 0 21#if 0
@@ -30,6 +29,17 @@
30static struct kmem_cache *cred_jar; 29static struct kmem_cache *cred_jar;
31 30
32/* 31/*
32 * The common credentials for the initial task's thread group
33 */
34#ifdef CONFIG_KEYS
35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2),
37 .tgid = 0,
38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39};
40#endif
41
42/*
33 * The initial credentials for the initial task 43 * The initial credentials for the initial task
34 */ 44 */
35struct cred init_cred = { 45struct cred init_cred = {
@@ -38,14 +48,6 @@ struct cred init_cred = {
38 .subscribers = ATOMIC_INIT(2), 48 .subscribers = ATOMIC_INIT(2),
39 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
40#endif 50#endif
41 .uid = GLOBAL_ROOT_UID,
42 .gid = GLOBAL_ROOT_GID,
43 .suid = GLOBAL_ROOT_UID,
44 .sgid = GLOBAL_ROOT_GID,
45 .euid = GLOBAL_ROOT_UID,
46 .egid = GLOBAL_ROOT_GID,
47 .fsuid = GLOBAL_ROOT_UID,
48 .fsgid = GLOBAL_ROOT_GID,
49 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
50 .cap_inheritable = CAP_EMPTY_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
51 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
@@ -54,6 +56,9 @@ struct cred init_cred = {
54 .user = INIT_USER, 56 .user = INIT_USER,
55 .user_ns = &init_user_ns, 57 .user_ns = &init_user_ns,
56 .group_info = &init_groups, 58 .group_info = &init_groups,
59#ifdef CONFIG_KEYS
60 .tgcred = &init_tgcred,
61#endif
57}; 62};
58 63
59static inline void set_cred_subscribers(struct cred *cred, int n) 64static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -82,6 +87,36 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
82} 87}
83 88
84/* 89/*
90 * Dispose of the shared task group credentials
91 */
92#ifdef CONFIG_KEYS
93static void release_tgcred_rcu(struct rcu_head *rcu)
94{
95 struct thread_group_cred *tgcred =
96 container_of(rcu, struct thread_group_cred, rcu);
97
98 BUG_ON(atomic_read(&tgcred->usage) != 0);
99
100 key_put(tgcred->session_keyring);
101 key_put(tgcred->process_keyring);
102 kfree(tgcred);
103}
104#endif
105
106/*
107 * Release a set of thread group credentials.
108 */
109static void release_tgcred(struct cred *cred)
110{
111#ifdef CONFIG_KEYS
112 struct thread_group_cred *tgcred = cred->tgcred;
113
114 if (atomic_dec_and_test(&tgcred->usage))
115 call_rcu(&tgcred->rcu, release_tgcred_rcu);
116#endif
117}
118
119/*
85 * The RCU callback to actually dispose of a set of credentials 120 * The RCU callback to actually dispose of a set of credentials
86 */ 121 */
87static void put_cred_rcu(struct rcu_head *rcu) 122static void put_cred_rcu(struct rcu_head *rcu)
@@ -106,14 +141,12 @@ static void put_cred_rcu(struct rcu_head *rcu)
106#endif 141#endif
107 142
108 security_cred_free(cred); 143 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
111 key_put(cred->thread_keyring); 144 key_put(cred->thread_keyring);
112 key_put(cred->request_key_auth); 145 key_put(cred->request_key_auth);
146 release_tgcred(cred);
113 if (cred->group_info) 147 if (cred->group_info)
114 put_group_info(cred->group_info); 148 put_group_info(cred->group_info);
115 free_uid(cred->user); 149 free_uid(cred->user);
116 put_user_ns(cred->user_ns);
117 kmem_cache_free(cred_jar, cred); 150 kmem_cache_free(cred_jar, cred);
118} 151}
119 152
@@ -164,6 +197,13 @@ void exit_creds(struct task_struct *tsk)
164 validate_creds(cred); 197 validate_creds(cred);
165 alter_cred_subscribers(cred, -1); 198 alter_cred_subscribers(cred, -1);
166 put_cred(cred); 199 put_cred(cred);
200
201 cred = (struct cred *) tsk->replacement_session_keyring;
202 if (cred) {
203 tsk->replacement_session_keyring = NULL;
204 validate_creds(cred);
205 put_cred(cred);
206 }
167} 207}
168 208
169/** 209/**
@@ -203,6 +243,15 @@ struct cred *cred_alloc_blank(void)
203 if (!new) 243 if (!new)
204 return NULL; 244 return NULL;
205 245
246#ifdef CONFIG_KEYS
247 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
248 if (!new->tgcred) {
249 kmem_cache_free(cred_jar, new);
250 return NULL;
251 }
252 atomic_set(&new->tgcred->usage, 1);
253#endif
254
206 atomic_set(&new->usage, 1); 255 atomic_set(&new->usage, 1);
207#ifdef CONFIG_DEBUG_CREDENTIALS 256#ifdef CONFIG_DEBUG_CREDENTIALS
208 new->magic = CRED_MAGIC; 257 new->magic = CRED_MAGIC;
@@ -253,13 +302,11 @@ struct cred *prepare_creds(void)
253 set_cred_subscribers(new, 0); 302 set_cred_subscribers(new, 0);
254 get_group_info(new->group_info); 303 get_group_info(new->group_info);
255 get_uid(new->user); 304 get_uid(new->user);
256 get_user_ns(new->user_ns);
257 305
258#ifdef CONFIG_KEYS 306#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
261 key_get(new->thread_keyring); 307 key_get(new->thread_keyring);
262 key_get(new->request_key_auth); 308 key_get(new->request_key_auth);
309 atomic_inc(&new->tgcred->usage);
263#endif 310#endif
264 311
265#ifdef CONFIG_SECURITY 312#ifdef CONFIG_SECURITY
@@ -283,20 +330,39 @@ EXPORT_SYMBOL(prepare_creds);
283 */ 330 */
284struct cred *prepare_exec_creds(void) 331struct cred *prepare_exec_creds(void)
285{ 332{
333 struct thread_group_cred *tgcred = NULL;
286 struct cred *new; 334 struct cred *new;
287 335
336#ifdef CONFIG_KEYS
337 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
338 if (!tgcred)
339 return NULL;
340#endif
341
288 new = prepare_creds(); 342 new = prepare_creds();
289 if (!new) 343 if (!new) {
344 kfree(tgcred);
290 return new; 345 return new;
346 }
291 347
292#ifdef CONFIG_KEYS 348#ifdef CONFIG_KEYS
293 /* newly exec'd tasks don't get a thread keyring */ 349 /* newly exec'd tasks don't get a thread keyring */
294 key_put(new->thread_keyring); 350 key_put(new->thread_keyring);
295 new->thread_keyring = NULL; 351 new->thread_keyring = NULL;
296 352
353 /* create a new per-thread-group creds for all this set of threads to
354 * share */
355 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
356
357 atomic_set(&tgcred->usage, 1);
358 spin_lock_init(&tgcred->lock);
359
297 /* inherit the session keyring; new process keyring */ 360 /* inherit the session keyring; new process keyring */
298 key_put(new->process_keyring); 361 key_get(tgcred->session_keyring);
299 new->process_keyring = NULL; 362 tgcred->process_keyring = NULL;
363
364 release_tgcred(new);
365 new->tgcred = tgcred;
300#endif 366#endif
301 367
302 return new; 368 return new;
@@ -313,6 +379,9 @@ struct cred *prepare_exec_creds(void)
313 */ 379 */
314int copy_creds(struct task_struct *p, unsigned long clone_flags) 380int copy_creds(struct task_struct *p, unsigned long clone_flags)
315{ 381{
382#ifdef CONFIG_KEYS
383 struct thread_group_cred *tgcred;
384#endif
316 struct cred *new; 385 struct cred *new;
317 int ret; 386 int ret;
318 387
@@ -342,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
342 goto error_put; 411 goto error_put;
343 } 412 }
344 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
345#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
346 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
347 * had one */ 421 * had one */
@@ -352,12 +426,22 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
352 install_thread_keyring_to_cred(new); 426 install_thread_keyring_to_cred(new);
353 } 427 }
354 428
355 /* The process keyring is only shared between the threads in a process; 429 /* we share the process and session keyrings between all the threads in
356 * anything outside of those threads doesn't inherit. 430 * a process - this is slightly icky as we violate COW credentials a
357 */ 431 * bit */
358 if (!(clone_flags & CLONE_THREAD)) { 432 if (!(clone_flags & CLONE_THREAD)) {
359 key_put(new->process_keyring); 433 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
360 new->process_keyring = NULL; 434 if (!tgcred) {
435 ret = -ENOMEM;
436 goto error_put;
437 }
438 atomic_set(&tgcred->usage, 1);
439 spin_lock_init(&tgcred->lock);
440 tgcred->process_keyring = NULL;
441 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
442
443 release_tgcred(new);
444 new->tgcred = tgcred;
361 } 445 }
362#endif 446#endif
363 447
@@ -372,31 +456,6 @@ error_put:
372 return ret; 456 return ret;
373} 457}
374 458
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
386 /* The credentials are in a different user namespaces
387 * therefore one is a subset of the other only if a set is an
388 * ancestor of subset and set->euid is owner of subset or one
389 * of subsets ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
399
400/** 459/**
401 * commit_creds - Install new credentials upon the current task 460 * commit_creds - Install new credentials upon the current task
402 * @new: The credentials to be assigned 461 * @new: The credentials to be assigned
@@ -431,11 +490,11 @@ int commit_creds(struct cred *new)
431 get_cred(new); /* we will require a ref for the subj creds too */ 490 get_cred(new); /* we will require a ref for the subj creds too */
432 491
433 /* dumpability changes */ 492 /* dumpability changes */
434 if (!uid_eq(old->euid, new->euid) || 493 if (old->euid != new->euid ||
435 !gid_eq(old->egid, new->egid) || 494 old->egid != new->egid ||
436 !uid_eq(old->fsuid, new->fsuid) || 495 old->fsuid != new->fsuid ||
437 !gid_eq(old->fsgid, new->fsgid) || 496 old->fsgid != new->fsgid ||
438 !cred_cap_issubset(old, new)) { 497 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
439 if (task->mm) 498 if (task->mm)
440 set_dumpable(task->mm, suid_dumpable); 499 set_dumpable(task->mm, suid_dumpable);
441 task->pdeath_signal = 0; 500 task->pdeath_signal = 0;
@@ -443,9 +502,9 @@ int commit_creds(struct cred *new)
443 } 502 }
444 503
445 /* alter the thread keyring */ 504 /* alter the thread keyring */
446 if (!uid_eq(new->fsuid, old->fsuid)) 505 if (new->fsuid != old->fsuid)
447 key_fsuid_changed(task); 506 key_fsuid_changed(task);
448 if (!gid_eq(new->fsgid, old->fsgid)) 507 if (new->fsgid != old->fsgid)
449 key_fsgid_changed(task); 508 key_fsgid_changed(task);
450 509
451 /* do it 510 /* do it
@@ -462,16 +521,16 @@ int commit_creds(struct cred *new)
462 alter_cred_subscribers(old, -2); 521 alter_cred_subscribers(old, -2);
463 522
464 /* send notifications */ 523 /* send notifications */
465 if (!uid_eq(new->uid, old->uid) || 524 if (new->uid != old->uid ||
466 !uid_eq(new->euid, old->euid) || 525 new->euid != old->euid ||
467 !uid_eq(new->suid, old->suid) || 526 new->suid != old->suid ||
468 !uid_eq(new->fsuid, old->fsuid)) 527 new->fsuid != old->fsuid)
469 proc_id_connector(task, PROC_EVENT_UID); 528 proc_id_connector(task, PROC_EVENT_UID);
470 529
471 if (!gid_eq(new->gid, old->gid) || 530 if (new->gid != old->gid ||
472 !gid_eq(new->egid, old->egid) || 531 new->egid != old->egid ||
473 !gid_eq(new->sgid, old->sgid) || 532 new->sgid != old->sgid ||
474 !gid_eq(new->fsgid, old->fsgid)) 533 new->fsgid != old->fsgid)
475 proc_id_connector(task, PROC_EVENT_GID); 534 proc_id_connector(task, PROC_EVENT_GID);
476 535
477 /* release the old obj and subj refs both */ 536 /* release the old obj and subj refs both */
@@ -605,14 +664,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
605 atomic_set(&new->usage, 1); 664 atomic_set(&new->usage, 1);
606 set_cred_subscribers(new, 0); 665 set_cred_subscribers(new, 0);
607 get_uid(new->user); 666 get_uid(new->user);
608 get_user_ns(new->user_ns);
609 get_group_info(new->group_info); 667 get_group_info(new->group_info);
610 668
611#ifdef CONFIG_KEYS 669#ifdef CONFIG_KEYS
612 new->session_keyring = NULL; 670 atomic_inc(&init_tgcred.usage);
613 new->process_keyring = NULL; 671 new->tgcred = &init_tgcred;
614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL; 672 new->request_key_auth = NULL;
673 new->thread_keyring = NULL;
616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 674 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
617#endif 675#endif
618 676
@@ -727,15 +785,9 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
727 atomic_read(&cred->usage), 785 atomic_read(&cred->usage),
728 read_cred_subscribers(cred)); 786 read_cred_subscribers(cred));
729 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", 787 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
730 from_kuid_munged(&init_user_ns, cred->uid), 788 cred->uid, cred->euid, cred->suid, cred->fsuid);
731 from_kuid_munged(&init_user_ns, cred->euid),
732 from_kuid_munged(&init_user_ns, cred->suid),
733 from_kuid_munged(&init_user_ns, cred->fsuid));
734 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", 789 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
735 from_kgid_munged(&init_user_ns, cred->gid), 790 cred->gid, cred->egid, cred->sgid, cred->fsgid);
736 from_kgid_munged(&init_user_ns, cred->egid),
737 from_kgid_munged(&init_user_ns, cred->sgid),
738 from_kgid_munged(&init_user_ns, cred->fsgid));
739#ifdef CONFIG_SECURITY 791#ifdef CONFIG_SECURITY
740 printk(KERN_ERR "CRED: ->security is %p\n", cred->security); 792 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
741 if ((unsigned long) cred->security >= PAGE_SIZE && 793 if ((unsigned long) cred->security >= PAGE_SIZE &&
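
The cred.c hunks reintroduce struct thread_group_cred, which is shared by all threads of a process and lives or dies purely by its usage count (the kernel defers the actual free through RCU). A simplified userspace model of that get/put pattern, using C11 atomics and an immediate free in place of call_rcu(); the type and field names below are illustrative only:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct tg_cred {
        atomic_int usage;
        int session_key;        /* placeholder for the shared keyrings */
};

static struct tg_cred *get_tgcred(struct tg_cred *tg)
{
        atomic_fetch_add(&tg->usage, 1);
        return tg;
}

static void put_tgcred(struct tg_cred *tg)
{
        /* Last reference gone: release the shared state. */
        if (atomic_fetch_sub(&tg->usage, 1) == 1) {
                printf("freeing shared creds (session_key=%d)\n", tg->session_key);
                free(tg);
        }
}

int main(void)
{
        struct tg_cred *tg = calloc(1, sizeof(*tg));

        atomic_init(&tg->usage, 1);
        tg->session_key = 42;

        get_tgcred(tg);         /* a second thread in the group shares it */
        put_tgcred(tg);         /* one thread exits */
        put_tgcred(tg);         /* last thread exits -> freed here */
        return 0;
}
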
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc..0d7c08784ef 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,7 +41,6 @@
41#include <linux/delay.h> 41#include <linux/delay.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/sysrq.h> 43#include <linux/sysrq.h>
44#include <linux/reboot.h>
45#include <linux/init.h> 44#include <linux/init.h>
46#include <linux/kgdb.h> 45#include <linux/kgdb.h>
47#include <linux/kdb.h> 46#include <linux/kdb.h>
@@ -53,6 +52,7 @@
53#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
54#include <asm/byteorder.h> 53#include <asm/byteorder.h>
55#include <linux/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
58 58
@@ -75,8 +75,6 @@ static int exception_level;
75struct kgdb_io *dbg_io_ops; 75struct kgdb_io *dbg_io_ops;
76static DEFINE_SPINLOCK(kgdb_registration_lock); 76static DEFINE_SPINLOCK(kgdb_registration_lock);
77 77
78/* Action for the reboot notifiter, a global allow kdb to change it */
79static int kgdbreboot;
80/* kgdb console driver is loaded */ 78/* kgdb console driver is loaded */
81static int kgdb_con_registered; 79static int kgdb_con_registered;
82/* determine if kgdb console output should be used */ 80/* determine if kgdb console output should be used */
@@ -98,7 +96,6 @@ static int __init opt_kgdb_con(char *str)
98early_param("kgdbcon", opt_kgdb_con); 96early_param("kgdbcon", opt_kgdb_con);
99 97
100module_param(kgdb_use_con, int, 0644); 98module_param(kgdb_use_con, int, 0644);
101module_param(kgdbreboot, int, 0644);
102 99
103/* 100/*
104 * Holds information about breakpoints in a kernel. These breakpoints are 101 * Holds information about breakpoints in a kernel. These breakpoints are
@@ -160,39 +157,37 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
160 * Weak aliases for breakpoint management, 157 * Weak aliases for breakpoint management,
161 * can be overriden by architectures when needed: 158 * can be overriden by architectures when needed:
162 */ 159 */
163int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) 160int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
164{ 161{
165 int err; 162 int err;
166 163
167 err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, 164 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
168 BREAK_INSTR_SIZE);
169 if (err) 165 if (err)
170 return err; 166 return err;
171 err = probe_kernel_write((char *)bpt->bpt_addr, 167
172 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); 168 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
173 return err; 169 BREAK_INSTR_SIZE);
174} 170}
175 171
176int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) 172int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
177{ 173{
178 return probe_kernel_write((char *)bpt->bpt_addr, 174 return probe_kernel_write((char *)addr,
179 (char *)bpt->saved_instr, BREAK_INSTR_SIZE); 175 (char *)bundle, BREAK_INSTR_SIZE);
180} 176}
181 177
182int __weak kgdb_validate_break_address(unsigned long addr) 178int __weak kgdb_validate_break_address(unsigned long addr)
183{ 179{
184 struct kgdb_bkpt tmp; 180 char tmp_variable[BREAK_INSTR_SIZE];
185 int err; 181 int err;
186 /* Validate setting the breakpoint and then removing it. If the 182 /* Validate setting the breakpoint and then removing it. In the
187 * remove fails, the kernel needs to emit a bad message because we 183 * remove fails, the kernel needs to emit a bad message because we
188 * are deep trouble not being able to put things back the way we 184 * are deep trouble not being able to put things back the way we
189 * found them. 185 * found them.
190 */ 186 */
191 tmp.bpt_addr = addr; 187 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
192 err = kgdb_arch_set_breakpoint(&tmp);
193 if (err) 188 if (err)
194 return err; 189 return err;
195 err = kgdb_arch_remove_breakpoint(&tmp); 190 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
196 if (err) 191 if (err)
197 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " 192 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
198 "memory destroyed at: %lx", addr); 193 "memory destroyed at: %lx", addr);
@@ -236,6 +231,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
236 */ 231 */
237int dbg_activate_sw_breakpoints(void) 232int dbg_activate_sw_breakpoints(void)
238{ 233{
234 unsigned long addr;
239 int error; 235 int error;
240 int ret = 0; 236 int ret = 0;
241 int i; 237 int i;
@@ -244,15 +240,16 @@ int dbg_activate_sw_breakpoints(void)
244 if (kgdb_break[i].state != BP_SET) 240 if (kgdb_break[i].state != BP_SET)
245 continue; 241 continue;
246 242
247 error = kgdb_arch_set_breakpoint(&kgdb_break[i]); 243 addr = kgdb_break[i].bpt_addr;
244 error = kgdb_arch_set_breakpoint(addr,
245 kgdb_break[i].saved_instr);
248 if (error) { 246 if (error) {
249 ret = error; 247 ret = error;
250 printk(KERN_INFO "KGDB: BP install failed: %lx", 248 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
251 kgdb_break[i].bpt_addr);
252 continue; 249 continue;
253 } 250 }
254 251
255 kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); 252 kgdb_flush_swbreak_addr(addr);
256 kgdb_break[i].state = BP_ACTIVE; 253 kgdb_break[i].state = BP_ACTIVE;
257 } 254 }
258 return ret; 255 return ret;
@@ -301,6 +298,7 @@ int dbg_set_sw_break(unsigned long addr)
301 298
302int dbg_deactivate_sw_breakpoints(void) 299int dbg_deactivate_sw_breakpoints(void)
303{ 300{
301 unsigned long addr;
304 int error; 302 int error;
305 int ret = 0; 303 int ret = 0;
306 int i; 304 int i;
@@ -308,14 +306,15 @@ int dbg_deactivate_sw_breakpoints(void)
308 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 306 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
309 if (kgdb_break[i].state != BP_ACTIVE) 307 if (kgdb_break[i].state != BP_ACTIVE)
310 continue; 308 continue;
311 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 309 addr = kgdb_break[i].bpt_addr;
310 error = kgdb_arch_remove_breakpoint(addr,
311 kgdb_break[i].saved_instr);
312 if (error) { 312 if (error) {
313 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", 313 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
314 kgdb_break[i].bpt_addr);
315 ret = error; 314 ret = error;
316 } 315 }
317 316
318 kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); 317 kgdb_flush_swbreak_addr(addr);
319 kgdb_break[i].state = BP_SET; 318 kgdb_break[i].state = BP_SET;
320 } 319 }
321 return ret; 320 return ret;
@@ -349,6 +348,7 @@ int kgdb_isremovedbreak(unsigned long addr)
349 348
350int dbg_remove_all_break(void) 349int dbg_remove_all_break(void)
351{ 350{
351 unsigned long addr;
352 int error; 352 int error;
353 int i; 353 int i;
354 354
@@ -356,10 +356,12 @@ int dbg_remove_all_break(void)
356 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 356 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
357 if (kgdb_break[i].state != BP_ACTIVE) 357 if (kgdb_break[i].state != BP_ACTIVE)
358 goto setundefined; 358 goto setundefined;
359 error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); 359 addr = kgdb_break[i].bpt_addr;
360 error = kgdb_arch_remove_breakpoint(addr,
361 kgdb_break[i].saved_instr);
360 if (error) 362 if (error)
361 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", 363 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
362 kgdb_break[i].bpt_addr); 364 addr);
363setundefined: 365setundefined:
364 kgdb_break[i].state = BP_UNDEFINED; 366 kgdb_break[i].state = BP_UNDEFINED;
365 } 367 }
@@ -672,10 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
672{ 674{
673 struct kgdb_state kgdb_var; 675 struct kgdb_state kgdb_var;
674 struct kgdb_state *ks = &kgdb_var; 676 struct kgdb_state *ks = &kgdb_var;
675 int ret = 0;
676
677 if (arch_kgdb_ops.enable_nmi)
678 arch_kgdb_ops.enable_nmi(0);
679 677
680 ks->cpu = raw_smp_processor_id(); 678 ks->cpu = raw_smp_processor_id();
681 ks->ex_vector = evector; 679 ks->ex_vector = evector;
@@ -685,33 +683,13 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
685 ks->linux_regs = regs; 683 ks->linux_regs = regs;
686 684
687 if (kgdb_reenter_check(ks)) 685 if (kgdb_reenter_check(ks))
688 goto out; /* Ouch, double exception ! */ 686 return 0; /* Ouch, double exception ! */
689 if (kgdb_info[ks->cpu].enter_kgdb != 0) 687 if (kgdb_info[ks->cpu].enter_kgdb != 0)
690 goto out; 688 return 0;
691
692 ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
693out:
694 if (arch_kgdb_ops.enable_nmi)
695 arch_kgdb_ops.enable_nmi(1);
696 return ret;
697}
698
699/*
700 * GDB places a breakpoint at this function to know dynamically
701 * loaded objects. It's not defined static so that only one instance with this
702 * name exists in the kernel.
703 */
704 689
705static int module_event(struct notifier_block *self, unsigned long val, 690 return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
706 void *data)
707{
708 return 0;
709} 691}
710 692
711static struct notifier_block dbg_module_load_nb = {
712 .notifier_call = module_event,
713};
714
715int kgdb_nmicallback(int cpu, void *regs) 693int kgdb_nmicallback(int cpu, void *regs)
716{ 694{
717#ifdef CONFIG_SMP 695#ifdef CONFIG_SMP
@@ -806,33 +784,6 @@ void __init dbg_late_init(void)
806 kdb_init(KDB_INIT_FULL); 784 kdb_init(KDB_INIT_FULL);
807} 785}
808 786
809static int
810dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
811{
812 /*
813 * Take the following action on reboot notify depending on value:
814 * 1 == Enter debugger
815	 * 0 == [the default] detach debug client
816 * -1 == Do nothing... and use this until the board resets
817 */
818 switch (kgdbreboot) {
819 case 1:
820 kgdb_breakpoint();
821 case -1:
822 goto done;
823 }
824 if (!dbg_kdb_mode)
825 gdbstub_exit(code);
826done:
827 return NOTIFY_DONE;
828}
829
830static struct notifier_block dbg_reboot_notifier = {
831 .notifier_call = dbg_notify_reboot,
832 .next = NULL,
833 .priority = INT_MAX,
834};
835
836static void kgdb_register_callbacks(void) 787static void kgdb_register_callbacks(void)
837{ 788{
838 if (!kgdb_io_module_registered) { 789 if (!kgdb_io_module_registered) {
@@ -840,8 +791,6 @@ static void kgdb_register_callbacks(void)
840 kgdb_arch_init(); 791 kgdb_arch_init();
841 if (!dbg_is_early) 792 if (!dbg_is_early)
842 kgdb_arch_late(); 793 kgdb_arch_late();
843 register_module_notifier(&dbg_module_load_nb);
844 register_reboot_notifier(&dbg_reboot_notifier);
845 atomic_notifier_chain_register(&panic_notifier_list, 794 atomic_notifier_chain_register(&panic_notifier_list,
846 &kgdb_panic_event_nb); 795 &kgdb_panic_event_nb);
847#ifdef CONFIG_MAGIC_SYSRQ 796#ifdef CONFIG_MAGIC_SYSRQ
@@ -863,8 +812,6 @@ static void kgdb_unregister_callbacks(void)
863 */ 812 */
864 if (kgdb_io_module_registered) { 813 if (kgdb_io_module_registered) {
865 kgdb_io_module_registered = 0; 814 kgdb_io_module_registered = 0;
866 unregister_reboot_notifier(&dbg_reboot_notifier);
867 unregister_module_notifier(&dbg_module_load_nb);
868 atomic_notifier_chain_unregister(&panic_notifier_list, 815 atomic_notifier_chain_unregister(&panic_notifier_list,
869 &kgdb_panic_event_nb); 816 &kgdb_panic_event_nb);
870 kgdb_arch_exit(); 817 kgdb_arch_exit();
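
The debug_core.c changes reshuffle the weak kgdb_arch_set_breakpoint()/kgdb_arch_remove_breakpoint() helpers, but the underlying idea is unchanged: save the original instruction bytes, overwrite them with the architecture's breakpoint instruction, and restore the saved bytes on removal. A userspace toy version of that save/patch/restore cycle, where a byte array stands in for kernel text and memcpy() for probe_kernel_read()/probe_kernel_write():

#include <stdio.h>
#include <string.h>

#define BREAK_INSTR_SIZE 1
static const unsigned char bpt_instr[BREAK_INSTR_SIZE] = { 0xcc }; /* x86 int3 */

static unsigned char text[4] = { 0x55, 0x89, 0xe5, 0xc3 };  /* fake code bytes */

static void set_breakpoint(unsigned long off, unsigned char *saved)
{
        memcpy(saved, &text[off], BREAK_INSTR_SIZE);     /* remember the original */
        memcpy(&text[off], bpt_instr, BREAK_INSTR_SIZE); /* patch in the trap */
}

static void remove_breakpoint(unsigned long off, const unsigned char *saved)
{
        memcpy(&text[off], saved, BREAK_INSTR_SIZE);     /* put it back */
}

int main(void)
{
        unsigned char saved[BREAK_INSTR_SIZE];

        set_breakpoint(0, saved);
        printf("patched byte:  0x%02x\n", text[0]);      /* 0xcc */
        remove_breakpoint(0, saved);
        printf("restored byte: 0x%02x\n", text[0]);      /* 0x55 */
        return 0;
}
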
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e06448..34872482315 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
217 217
218 /* Pack in hex chars */ 218 /* Pack in hex chars */
219 for (i = 0; i < wcount; i++) 219 for (i = 0; i < wcount; i++)
220 bufptr = hex_byte_pack(bufptr, s[i]); 220 bufptr = pack_hex_byte(bufptr, s[i]);
221 *bufptr = '\0'; 221 *bufptr = '\0';
222 222
223 /* Move up */ 223 /* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
249 if (err) 249 if (err)
250 return NULL; 250 return NULL;
251 while (count > 0) { 251 while (count > 0) {
252 buf = hex_byte_pack(buf, *tmp); 252 buf = pack_hex_byte(buf, *tmp);
253 tmp++; 253 tmp++;
254 count--; 254 count--;
255 } 255 }
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
411 limit = id + (BUF_THREAD_ID_SIZE / 2); 411 limit = id + (BUF_THREAD_ID_SIZE / 2);
412 while (id < limit) { 412 while (id < limit) {
413 if (!lzero || *id != 0) { 413 if (!lzero || *id != 0) {
414 pkt = hex_byte_pack(pkt, *id); 414 pkt = pack_hex_byte(pkt, *id);
415 lzero = 0; 415 lzero = 0;
416 } 416 }
417 id++; 417 id++;
418 } 418 }
419 419
420 if (lzero) 420 if (lzero)
421 pkt = hex_byte_pack(pkt, 0); 421 pkt = pack_hex_byte(pkt, 0);
422 422
423 return pkt; 423 return pkt;
424} 424}
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
486 dbg_remove_all_break(); 486 dbg_remove_all_break();
487 487
488 remcom_out_buffer[0] = 'S'; 488 remcom_out_buffer[0] = 'S';
489 hex_byte_pack(&remcom_out_buffer[1], ks->signo); 489 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
490} 490}
491 491
492static void gdb_get_regs_helper(struct kgdb_state *ks) 492static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
954 /* Reply to host that an exception has occurred */ 954 /* Reply to host that an exception has occurred */
955 ptr = remcom_out_buffer; 955 ptr = remcom_out_buffer;
956 *ptr++ = 'T'; 956 *ptr++ = 'T';
957 ptr = hex_byte_pack(ptr, ks->signo); 957 ptr = pack_hex_byte(ptr, ks->signo);
958 ptr += strlen(strcpy(ptr, "thread:")); 958 ptr += strlen(strcpy(ptr, "thread:"));
959 int_to_threadref(thref, shadow_pid(current->pid)); 959 int_to_threadref(thref, shadow_pid(current->pid));
960 ptr = pack_threadid(ptr, thref); 960 ptr = pack_threadid(ptr, thref);
@@ -1111,13 +1111,6 @@ void gdbstub_exit(int status)
1111 unsigned char checksum, ch, buffer[3]; 1111 unsigned char checksum, ch, buffer[3];
1112 int loop; 1112 int loop;
1113 1113
1114 if (!kgdb_connected)
1115 return;
1116 kgdb_connected = 0;
1117
1118 if (!dbg_io_ops || dbg_kdb_mode)
1119 return;
1120
1121 buffer[0] = 'W'; 1114 buffer[0] = 'W';
1122 buffer[1] = hex_asc_hi(status); 1115 buffer[1] = hex_asc_hi(status);
1123 buffer[2] = hex_asc_lo(status); 1116 buffer[2] = hex_asc_lo(status);
@@ -1136,6 +1129,5 @@ void gdbstub_exit(int status)
1136 dbg_io_ops->write_char(hex_asc_lo(checksum)); 1129 dbg_io_ops->write_char(hex_asc_lo(checksum));
1137 1130
1138 /* make sure the output is flushed, lest the bootloader clobber it */ 1131 /* make sure the output is flushed, lest the bootloader clobber it */
1139 if (dbg_io_ops->flush) 1132 dbg_io_ops->flush();
1140 dbg_io_ops->flush();
1141} 1133}
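
gdbstub_exit() above emits a GDB remote-serial-protocol exit packet: the payload is plain ASCII with binary bytes encoded as two hex digits, and the frame is $payload#checksum, where the checksum is simply the payload bytes summed modulo 256. A small self-contained example of that framing:

#include <stdio.h>
#include <string.h>

static const char hex_asc[] = "0123456789abcdef";

static void send_packet(const char *payload)
{
        unsigned char checksum = 0;

        for (size_t i = 0; i < strlen(payload); i++)
                checksum += (unsigned char)payload[i];

        printf("$%s#%c%c\n", payload,
               hex_asc[checksum >> 4], hex_asc[checksum & 0x0f]);
}

int main(void)
{
        int status = 0;
        char payload[4];

        /* 'W' plus the exit status as two hex digits, like gdbstub_exit(). */
        payload[0] = 'W';
        payload[1] = hex_asc[(status >> 4) & 0x0f];
        payload[2] = hex_asc[status & 0x0f];
        payload[3] = '\0';

        send_packet(payload);   /* prints $W00#b7 */
        return 0;
}

This is also why the hex_byte_pack()/pack_hex_byte() rename touches so many call sites: every register dump, thread id and status byte goes over the wire as that two-digit hex encoding.
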
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5..20059ef4459 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,6 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
153 } else { 153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n", 154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr); 155 __func__, bp->bp_addr);
156#ifdef CONFIG_DEBUG_RODATA
157 if (!bp->bp_type) {
158 kdb_printf("Software breakpoints are unavailable.\n"
159 " Change the kernel CONFIG_DEBUG_RODATA=n\n"
160 " OR use hw breaks: help bph\n");
161 }
162#endif
163 return 1; 156 return 1;
164 } 157 }
165 return 0; 158 return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e4..7179eac7b41 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kdb.h> 16#include <linux/kdb.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <asm/system.h>
18#include "kdb_private.h" 19#include "kdb_private.h"
19 20
20 21
@@ -129,8 +130,6 @@ kdb_bt(int argc, const char **argv)
129 } 130 }
130 /* Now the inactive tasks */ 131 /* Now the inactive tasks */
131 kdb_do_each_thread(g, p) { 132 kdb_do_each_thread(g, p) {
132 if (KDB_FLAG(CMD_INTERRUPT))
133 return 0;
134 if (task_curr(p)) 133 if (task_curr(p))
135 continue; 134 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt)) 135 if (kdb_bt1(p, mask, argcount, btaprompt))
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d3..d9ca9aa481e 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,8 +11,6 @@
11#include <linux/kgdb.h> 11#include <linux/kgdb.h>
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h>
15#include <linux/hardirq.h>
16#include "kdb_private.h" 14#include "kdb_private.h"
17#include "../debug_core.h" 15#include "../debug_core.h"
18 16
@@ -53,9 +51,6 @@ int kdb_stub(struct kgdb_state *ks)
53 if (atomic_read(&kgdb_setting_breakpoint)) 51 if (atomic_read(&kgdb_setting_breakpoint))
54 reason = KDB_REASON_KEYBOARD; 52 reason = KDB_REASON_KEYBOARD;
55 53
56 if (in_nmi())
57 reason = KDB_REASON_NMI;
58
59 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 54 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
60 if ((bp->bp_enabled) && (bp->bp_addr == addr)) { 55 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
61 reason = KDB_REASON_BREAK; 56 reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262..4802eb5840e 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -552,7 +552,6 @@ int vkdb_printf(const char *fmt, va_list ap)
552{ 552{
553 int diag; 553 int diag;
554 int linecount; 554 int linecount;
555 int colcount;
556 int logging, saved_loglevel = 0; 555 int logging, saved_loglevel = 0;
557 int saved_trap_printk; 556 int saved_trap_printk;
558 int got_printf_lock = 0; 557 int got_printf_lock = 0;
@@ -585,10 +584,6 @@ int vkdb_printf(const char *fmt, va_list ap)
585 if (diag || linecount <= 1) 584 if (diag || linecount <= 1)
586 linecount = 24; 585 linecount = 24;
587 586
588 diag = kdbgetintenv("COLUMNS", &colcount);
589 if (diag || colcount <= 1)
590 colcount = 80;
591
592 diag = kdbgetintenv("LOGGING", &logging); 587 diag = kdbgetintenv("LOGGING", &logging);
593 if (diag) 588 if (diag)
594 logging = 0; 589 logging = 0;
@@ -694,8 +689,8 @@ kdb_printit:
694 if (!dbg_kdb_mode && kgdb_connected) { 689 if (!dbg_kdb_mode && kgdb_connected) {
695 gdbstub_msg_write(kdb_buffer, retlen); 690 gdbstub_msg_write(kdb_buffer, retlen);
696 } else { 691 } else {
697 if (dbg_io_ops && !dbg_io_ops->is_console) { 692 if (!dbg_io_ops->is_console) {
698 len = retlen; 693 len = strlen(kdb_buffer);
699 cp = kdb_buffer; 694 cp = kdb_buffer;
700 while (len--) { 695 while (len--) {
701 dbg_io_ops->write_char(*cp); 696 dbg_io_ops->write_char(*cp);
@@ -714,30 +709,15 @@ kdb_printit:
714 printk(KERN_INFO "%s", kdb_buffer); 709 printk(KERN_INFO "%s", kdb_buffer);
715 } 710 }
716 711
717 if (KDB_STATE(PAGER)) { 712 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
718 /* 713 kdb_nextline++;
719 * Check printed string to decide how to bump the
720 * kdb_nextline to control when the more prompt should
721 * show up.
722 */
723 int got = 0;
724 len = retlen;
725 while (len--) {
726 if (kdb_buffer[len] == '\n') {
727 kdb_nextline++;
728 got = 0;
729 } else if (kdb_buffer[len] == '\r') {
730 got = 0;
731 } else {
732 got++;
733 }
734 }
735 kdb_nextline += got / (colcount + 1);
736 }
737 714
738 /* check for having reached the LINES number of printed lines */ 715 /* check for having reached the LINES number of printed lines */
739 if (kdb_nextline >= linecount) { 716 if (kdb_nextline == linecount) {
740 char buf1[16] = ""; 717 char buf1[16] = "";
718#if defined(CONFIG_SMP)
719 char buf2[32];
720#endif
741 721
742 /* Watch out for recursion here. Any routine that calls 722 /* Watch out for recursion here. Any routine that calls
743 * kdb_printf will come back through here. And kdb_read 723 * kdb_printf will come back through here. And kdb_read
@@ -752,10 +732,18 @@ kdb_printit:
752 if (moreprompt == NULL) 732 if (moreprompt == NULL)
753 moreprompt = "more> "; 733 moreprompt = "more> ";
754 734
735#if defined(CONFIG_SMP)
736 if (strchr(moreprompt, '%')) {
737 sprintf(buf2, moreprompt, get_cpu());
738 put_cpu();
739 moreprompt = buf2;
740 }
741#endif
742
755 kdb_input_flush(); 743 kdb_input_flush();
756 c = console_drivers; 744 c = console_drivers;
757 745
758 if (dbg_io_ops && !dbg_io_ops->is_console) { 746 if (!dbg_io_ops->is_console) {
759 len = strlen(moreprompt); 747 len = strlen(moreprompt);
760 cp = moreprompt; 748 cp = moreprompt;
761 while (len--) { 749 while (len--) {
@@ -788,7 +776,7 @@ kdb_printit:
788 kdb_grepping_flag = 0; 776 kdb_grepping_flag = 0;
789 kdb_printf("\n"); 777 kdb_printf("\n");
790 } else if (buf1[0] == ' ') { 778 } else if (buf1[0] == ' ') {
791 kdb_printf("\r"); 779 kdb_printf("\n");
792 suspend_grep = 1; /* for this recursion */ 780 suspend_grep = 1; /* for this recursion */
793 } else if (buf1[0] == '\n') { 781 } else if (buf1[0] == '\n') {
794 kdb_nextline = linecount - 1; 782 kdb_nextline = linecount - 1;
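
Both versions of the vkdb_printf() pager above do the same bookkeeping: count how many screen lines the output has consumed (newlines, plus wraps of over-long lines against COLUMNS in the newer variant) and pause at a more> prompt once LINES is reached. The sketch below is only a loose userspace approximation of that accounting, not the kdb implementation:

#include <stdio.h>
#include <string.h>

static int next_line = 1;

static void pager_account(const char *buf, int linecount, int colcount)
{
        int col = 0;

        for (size_t i = 0; i < strlen(buf); i++) {
                if (buf[i] == '\n') {
                        next_line++;
                        col = 0;
                } else if (++col > colcount) {  /* long line wrapped on screen */
                        next_line++;
                        col = 0;
                }
        }
        if (next_line >= linecount) {
                printf("more> ");               /* real kdb waits for a key here */
                next_line = 1;
        }
}

int main(void)
{
        pager_account("line one\nline two\n", 24, 80);
        printf("next_line is now %d\n", next_line);
        return 0;
}
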
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 118527aa60e..4bca634975c 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,7 +25,6 @@
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ 25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26 26
27static int kbd_exists; 27static int kbd_exists;
28static int kbd_last_ret;
29 28
30/* 29/*
31 * Check if the keyboard controller has a keypress for us. 30 * Check if the keyboard controller has a keypress for us.
@@ -91,11 +90,8 @@ int kdb_get_kbd_char(void)
91 return -1; 90 return -1;
92 } 91 }
93 92
94 if ((scancode & 0x80) != 0) { 93 if ((scancode & 0x80) != 0)
95 if (scancode == 0x9c)
96 kbd_last_ret = 0;
97 return -1; 94 return -1;
98 }
99 95
100 scancode &= 0x7f; 96 scancode &= 0x7f;
101 97
@@ -182,82 +178,35 @@ int kdb_get_kbd_char(void)
182 return -1; /* ignore unprintables */ 178 return -1; /* ignore unprintables */
183 } 179 }
184 180
185 if (scancode == 0x1c) { 181 if ((scancode & 0x7f) == 0x1c) {
186 kbd_last_ret = 1; 182 /*
187 return 13; 183 * enter key. All done. Absorb the release scancode.
188 } 184 */
189
190 return keychar & 0xff;
191}
192EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
193
194/*
195 * Best effort cleanup of ENTER break codes on leaving KDB. Called on
196 * exiting KDB, when we know we processed an ENTER or KP ENTER scan
197 * code.
198 */
199void kdb_kbd_cleanup_state(void)
200{
201 int scancode, scanstatus;
202
203 /*
204 * Nothing to clean up, since either
205 * ENTER was never pressed, or has already
206 * gotten cleaned up.
207 */
208 if (!kbd_last_ret)
209 return;
210
211 kbd_last_ret = 0;
212 /*
213 * Enter key. Need to absorb the break code here, lest it gets
214 * leaked out if we exit KDB as the result of processing 'g'.
215 *
216 * This has several interesting implications:
217 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
218 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
219 * only get a break code at the end of the repeated
220 * sequence. This means we can't propagate the repeated key
221 * press, and must swallow it away.
222 * + Need to handle possible PS/2 mouse input.
223 * + Need to handle mashed keys.
224 */
225
226 while (1) {
227 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) 185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
228 cpu_relax(); 186 ;
229 187
230 /* 188 /*
231 * Fetch the scancode. 189 * Fetch the scancode
232 */ 190 */
233 scancode = inb(KBD_DATA_REG); 191 scancode = inb(KBD_DATA_REG);
234 scanstatus = inb(KBD_STATUS_REG); 192 scanstatus = inb(KBD_STATUS_REG);
235 193
236 /* 194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
237 * Skip mouse input. 195 scancode = inb(KBD_DATA_REG);
238 */ 196 scanstatus = inb(KBD_STATUS_REG);
239 if (scanstatus & KBD_STAT_MOUSE_OBF) 197 }
240 continue;
241 198
242 /* 199 if (scancode != 0x9c) {
243 * If we see 0xe0, this is either a break code for KP 200 /*
244 * ENTER, or a repeat make for KP ENTER. Either way, 201 * Wasn't an enter-release, why not?
245 * since the second byte is equivalent to an ENTER, 202 */
246 * skip the 0xe0 and try again. 203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
247 * 204 scancode, scanstatus);
248 * If we see 0x1c, this must be a repeat ENTER or KP 205 }
249 * ENTER (and we swallowed 0xe0 before). Try again.
250 *
251 * We can also see make and break codes for other keys
252 * mashed before or after pressing ENTER. Thus, if we
253 * see anything other than 0x9c, we have to try again.
254 *
255 * Note, if you held some key as ENTER was depressed,
256 * that break code would get leaked out.
257 */
258 if (scancode != 0x9c)
259 continue;
260 206
261 return; 207 return 13;
262 } 208 }
209
210 return keychar & 0xff;
263} 211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
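
The removed kdb_kbd_cleanup_state() above polls the i8042 status port until a byte is available, skips PS/2 mouse traffic, and keeps reading until it sees the 0x9c ENTER break code. A hedged sketch of that drain loop, with the inb(KBD_STATUS_REG)/inb(KBD_DATA_REG) pair replaced by a fixed simulated byte stream so it compiles and runs in userspace:

#include <stdint.h>
#include <stdio.h>

#define KBD_STAT_OBF       0x01   /* output buffer full */
#define KBD_STAT_MOUSE_OBF 0x20   /* byte came from the PS/2 mouse */

/* Simulated i8042 output: each entry stands for one inb(KBD_DATA_REG)
 * paired with the inb(KBD_STATUS_REG) that described it. */
struct kbd_byte { uint8_t status, data; };

static const struct kbd_byte stream[] = {
        { KBD_STAT_OBF | KBD_STAT_MOUSE_OBF, 0x12 },  /* mouse traffic   */
        { KBD_STAT_OBF,                      0xe0 },  /* KP ENTER prefix */
        { KBD_STAT_OBF,                      0x9c },  /* ENTER break code */
};
static unsigned int pos;

/* Drain bytes until the ENTER break code (0x9c) has been consumed, as the
 * removed kdb_kbd_cleanup_state() does before leaving kdb. */
static void absorb_enter_release(void)
{
        while (pos < sizeof(stream) / sizeof(stream[0])) {
                struct kbd_byte b = stream[pos++];

                if (b.status & KBD_STAT_MOUSE_OBF)
                        continue;       /* skip PS/2 mouse bytes */
                if (b.data != 0x9c)
                        continue;       /* 0xe0 prefix, repeats, mashed keys */

                printf("ENTER release absorbed after %u bytes\n", pos);
                return;
        }
}

int main(void)
{
        absorb_enter_release();
        return 0;
}
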
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f..63786e71a3c 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,14 +14,12 @@
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/kmsg_dump.h>
18#include <linux/reboot.h> 17#include <linux/reboot.h>
19#include <linux/sched.h> 18#include <linux/sched.h>
20#include <linux/sysrq.h> 19#include <linux/sysrq.h>
21#include <linux/smp.h> 20#include <linux/smp.h>
22#include <linux/utsname.h> 21#include <linux/utsname.h>
23#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
24#include <linux/atomic.h>
25#include <linux/module.h> 23#include <linux/module.h>
26#include <linux/mm.h> 24#include <linux/mm.h>
27#include <linux/init.h> 25#include <linux/init.h>
@@ -140,10 +138,11 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
140static char *__env[] = { 138static char *__env[] = {
141#if defined(CONFIG_SMP) 139#if defined(CONFIG_SMP)
142 "PROMPT=[%d]kdb> ", 140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
143#else 142#else
144 "PROMPT=kdb> ", 143 "PROMPT=kdb> ",
145#endif
146 "MOREPROMPT=more> ", 144 "MOREPROMPT=more> ",
145#endif
147 "RADIX=16", 146 "RADIX=16",
148 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
@@ -1236,6 +1235,18 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1236 *cmdbuf = '\0'; 1235 *cmdbuf = '\0';
1237 *(cmd_hist[cmd_head]) = '\0'; 1236 *(cmd_hist[cmd_head]) = '\0';
1238 1237
1238 if (KDB_FLAG(ONLY_DO_DUMP)) {
1239 /* kdb is off but a catastrophic error requires a dump.
1240 * Take the dump and reboot.
1241 * Turn on logging so the kdb output appears in the log
1242 * buffer in the dump.
1243 */
1244 const char *setargs[] = { "set", "LOGGING", "1" };
1245 kdb_set(2, setargs);
1246 kdb_reboot(0, NULL);
1247 /*NOTREACHED*/
1248 }
1249
1239do_full_getstr: 1250do_full_getstr:
1240#if defined(CONFIG_SMP) 1251#if defined(CONFIG_SMP)
1241 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), 1252 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
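
On SMP the PROMPT and MOREPROMPT environment strings carry a %d (see the __env hunk earlier) that gets expanded with the current CPU, as in the snprintf on kdbgetenv("PROMPT") above and the sprintf on moreprompt in kdb_io.c. A minimal sketch of that expansion with the CPU id hard-coded in place of get_cpu()/raw_smp_processor_id():

#include <stdio.h>

int main(void)
{
        const char *prompt = "[%d]kdb> ";      /* PROMPT value on SMP */
        char buf[32];
        int cpu = 3;                            /* get_cpu() in kdb */

        snprintf(buf, sizeof(buf), prompt, cpu);
        printf("%s\n", buf);                    /* prints "[3]kdb> " */
        return 0;
}
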
@@ -1389,9 +1400,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1389 if (KDB_STATE(DOING_SS)) 1400 if (KDB_STATE(DOING_SS))
1390 KDB_STATE_CLEAR(SSBPT); 1401 KDB_STATE_CLEAR(SSBPT);
1391 1402
1392 /* Clean up any keyboard devices before leaving */
1393 kdb_kbd_cleanup_state();
1394
1395 return result; 1403 return result;
1396} 1404}
1397 1405
@@ -1974,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1974 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1975 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1976#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1977 kdb_printf("%4ld ", module_refcount(mod)); 1985 kdb_printf("%4d ", module_refcount(mod));
1978#endif 1986#endif
1979 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1980 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
@@ -2029,15 +2037,8 @@ static int kdb_env(int argc, const char **argv)
2029 */ 2037 */
2030static int kdb_dmesg(int argc, const char **argv) 2038static int kdb_dmesg(int argc, const char **argv)
2031{ 2039{
2032 int diag; 2040 char *syslog_data[4], *start, *end, c = '\0', *p;
2033 int logging; 2041 int diag, logging, logsize, lines = 0, adjust = 0, n;
2034 int lines = 0;
2035 int adjust = 0;
2036 int n = 0;
2037 int skip = 0;
2038 struct kmsg_dumper dumper = { .active = 1 };
2039 size_t len;
2040 char buf[201];
2041 2042
2042 if (argc > 2) 2043 if (argc > 2)
2043 return KDB_ARGCOUNT; 2044 return KDB_ARGCOUNT;
@@ -2060,10 +2061,22 @@ static int kdb_dmesg(int argc, const char **argv)
2060 kdb_set(2, setargs); 2061 kdb_set(2, setargs);
2061 } 2062 }
2062 2063
2063 kmsg_dump_rewind_nolock(&dumper); 2064 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
2064 while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) 2065 * logical start, end+1. */
2065 n++; 2066 kdb_syslog_data(syslog_data);
2066 2067 if (syslog_data[2] == syslog_data[3])
2068 return 0;
2069 logsize = syslog_data[1] - syslog_data[0];
2070 start = syslog_data[2];
2071 end = syslog_data[3];
2072#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
2073 for (n = 0, p = start; p < end; ++p) {
2074 c = *KDB_WRAP(p);
2075 if (c == '\n')
2076 ++n;
2077 }
2078 if (c != '\n')
2079 ++n;
2067 if (lines < 0) { 2080 if (lines < 0) {
2068 if (adjust >= n) 2081 if (adjust >= n)
2069 kdb_printf("buffer only contains %d lines, nothing " 2082 kdb_printf("buffer only contains %d lines, nothing "
@@ -2071,11 +2084,21 @@ static int kdb_dmesg(int argc, const char **argv)
2071 else if (adjust - lines >= n) 2084 else if (adjust - lines >= n)
2072 kdb_printf("buffer only contains %d lines, last %d " 2085 kdb_printf("buffer only contains %d lines, last %d "
2073 "lines printed\n", n, n - adjust); 2086 "lines printed\n", n, n - adjust);
2074 skip = adjust; 2087 if (adjust) {
2075 lines = abs(lines); 2088 for (; start < end && adjust; ++start) {
2089 if (*KDB_WRAP(start) == '\n')
2090 --adjust;
2091 }
2092 if (start < end)
2093 ++start;
2094 }
2095 for (p = start; p < end && lines; ++p) {
2096 if (*KDB_WRAP(p) == '\n')
2097 ++lines;
2098 }
2099 end = p;
2076 } else if (lines > 0) { 2100 } else if (lines > 0) {
2077 skip = n - lines - adjust; 2101 int skip = n - (adjust + lines);
2078 lines = abs(lines);
2079 if (adjust >= n) { 2102 if (adjust >= n) {
2080 kdb_printf("buffer only contains %d lines, " 2103 kdb_printf("buffer only contains %d lines, "
2081 "nothing printed\n", n); 2104 "nothing printed\n", n);
@@ -2086,56 +2109,39 @@ static int kdb_dmesg(int argc, const char **argv)
2086 kdb_printf("buffer only contains %d lines, first " 2109 kdb_printf("buffer only contains %d lines, first "
2087 "%d lines printed\n", n, lines); 2110 "%d lines printed\n", n, lines);
2088 } 2111 }
2089 } else { 2112 for (; start < end && skip; ++start) {
2090 lines = n; 2113 if (*KDB_WRAP(start) == '\n')
2091 } 2114 --skip;
2092
2093 if (skip >= n || skip < 0)
2094 return 0;
2095
2096 kmsg_dump_rewind_nolock(&dumper);
2097 while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
2098 if (skip) {
2099 skip--;
2100 continue;
2101 } 2115 }
2102 if (!lines--) 2116 for (p = start; p < end && lines; ++p) {
2103 break; 2117 if (*KDB_WRAP(p) == '\n')
2118 --lines;
2119 }
2120 end = p;
2121 }
2122 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2123 c = '\n';
2124 while (start != end) {
2125 char buf[201];
2126 p = buf;
2104 if (KDB_FLAG(CMD_INTERRUPT)) 2127 if (KDB_FLAG(CMD_INTERRUPT))
2105 return 0; 2128 return 0;
2106 2129 while (start < end && (c = *KDB_WRAP(start)) &&
2107 kdb_printf("%.*s\n", (int)len - 1, buf); 2130 (p - buf) < sizeof(buf)-1) {
2131 ++start;
2132 *p++ = c;
2133 if (c == '\n')
2134 break;
2135 }
2136 *p = '\0';
2137 kdb_printf("%s", buf);
2108 } 2138 }
2139 if (c != '\n')
2140 kdb_printf("\n");
2109 2141
2110 return 0; 2142 return 0;
2111} 2143}
2112#endif /* CONFIG_PRINTK */ 2144#endif /* CONFIG_PRINTK */
2113
2114/* Make sure we balance enable/disable calls, must disable first. */
2115static atomic_t kdb_nmi_disabled;
2116
2117static int kdb_disable_nmi(int argc, const char *argv[])
2118{
2119 if (atomic_read(&kdb_nmi_disabled))
2120 return 0;
2121 atomic_set(&kdb_nmi_disabled, 1);
2122 arch_kgdb_ops.enable_nmi(0);
2123 return 0;
2124}
2125
2126static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
2127{
2128 if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
2129 return -EINVAL;
2130 arch_kgdb_ops.enable_nmi(1);
2131 return 0;
2132}
2133
2134static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
2135 .set = kdb_param_enable_nmi,
2136};
2137module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
2138
2139/* 2145/*
2140 * kdb_cpu - This function implements the 'cpu' command. 2146 * kdb_cpu - This function implements the 'cpu' command.
2141 * cpu [<cpunum>] 2147 * cpu [<cpunum>]
@@ -2880,10 +2886,6 @@ static void __init kdb_inittab(void)
2880 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", 2886 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2881 "Display syslog buffer", 0, KDB_REPEAT_NONE); 2887 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2882#endif 2888#endif
2883 if (arch_kgdb_ops.enable_nmi) {
2884 kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
2885 "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
2886 }
2887 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", 2889 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2888 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); 2890 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2889 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", 2891 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a2584..e381d105b40 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,6 +205,7 @@ extern char kdb_grep_string[];
205extern int kdb_grep_leading; 205extern int kdb_grep_leading;
206extern int kdb_grep_trailing; 206extern int kdb_grep_trailing;
207extern char *kdb_cmds[]; 207extern char *kdb_cmds[];
208extern void kdb_syslog_data(char *syslog_data[]);
208extern unsigned long kdb_task_state_string(const char *); 209extern unsigned long kdb_task_state_string(const char *);
209extern char kdb_task_state_char (const struct task_struct *); 210extern char kdb_task_state_char (const struct task_struct *);
210extern unsigned long kdb_task_state(const struct task_struct *p, 211extern unsigned long kdb_task_state(const struct task_struct *p,
@@ -245,13 +246,6 @@ extern void debug_kusage(void);
245 246
246extern void kdb_set_current_task(struct task_struct *); 247extern void kdb_set_current_task(struct task_struct *);
247extern struct task_struct *kdb_current_task; 248extern struct task_struct *kdb_current_task;
248
249#ifdef CONFIG_KDB_KEYBOARD
250extern void kdb_kbd_cleanup_state(void);
251#else /* ! CONFIG_KDB_KEYBOARD */
252#define kdb_kbd_cleanup_state()
253#endif /* ! CONFIG_KDB_KEYBOARD */
254
255#ifdef CONFIG_MODULES 249#ifdef CONFIG_MODULES
256extern struct list_head *kdb_modules; 250extern struct list_head *kdb_modules;
257#endif /* CONFIG_MODULES */ 251#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index d35cc2d3a4c..5532dd37aa8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page); 387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr); 389 kunmap_atomic(vaddr, KM_KDB);
390 390
391 return 0; 391 return 0;
392} 392}
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (is_idle_task(p)) { 639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
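
The kdb_getphys() hunk above validates the pfn, temporarily maps the page and memcpy's the requested bytes out of it; the change is only the kmap_atomic() calling convention (the older kernel on the right still passes a KM_KDB slot). As a rough, hedged userspace analogue of "read a few bytes at a physical address within one page", the sketch below maps /dev/mem instead; it needs root, a kernel that permits /dev/mem reads, and the address used is only an example.

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Rough analogue of kdb_getphys(): map the page containing `addr` and copy
 * `size` bytes out of it. Like the kernel helper, the read must not cross
 * a page boundary. */
static int read_phys(void *res, unsigned long addr, size_t size)
{
        long pagesz = sysconf(_SC_PAGESIZE);
        unsigned long page = addr & ~(unsigned long)(pagesz - 1);
        int fd = open("/dev/mem", O_RDONLY);
        if (fd < 0)
                return 1;

        void *vaddr = mmap(NULL, pagesz, PROT_READ, MAP_SHARED, fd, page);
        close(fd);
        if (vaddr == MAP_FAILED)
                return 1;

        memcpy(res, (char *)vaddr + (addr & (pagesz - 1)), size);
        munmap(vaddr, pagesz);
        return 0;
}

int main(void)
{
        uint32_t word;

        /* 0xF0000 is just an example (legacy BIOS area on x86). */
        if (read_phys(&word, 0x000f0000UL, sizeof(word)) == 0)
                printf("0x%08x\n", (unsigned)word);
        return 0;
}
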
diff --git a/kernel/dma.c b/kernel/dma.c
index 6c6262f86c1..f903189c530 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
9 * [It also happened to remove the sizeof(char *) == sizeof(int) 9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus] 10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */ 11 */
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
@@ -18,6 +18,7 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h>
21 22
22 23
23 24
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d147b2..89e5e8aa4c3 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,8 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o callchain.o 5obj-y := core.o ring_buffer.o
6
7obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
8obj-$(CONFIG_UPROBES) += uprobes.o
9
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
deleted file mode 100644
index c77206184b8..00000000000
--- a/kernel/events/callchain.c
+++ /dev/null
@@ -1,206 +0,0 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118exit:
119 mutex_unlock(&callchain_mutex);
120
121 return err;
122}
123
124void put_callchain_buffers(void)
125{
126 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
127 release_callchain_buffers();
128 mutex_unlock(&callchain_mutex);
129 }
130}
131
132static struct perf_callchain_entry *get_callchain_entry(int *rctx)
133{
134 int cpu;
135 struct callchain_cpus_entries *entries;
136
137 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
138 if (*rctx == -1)
139 return NULL;
140
141 entries = rcu_dereference(callchain_cpus_entries);
142 if (!entries)
143 return NULL;
144
145 cpu = smp_processor_id();
146
147 return &entries->cpu_entries[cpu][*rctx];
148}
149
150static void
151put_callchain_entry(int rctx)
152{
153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
154}
155
156struct perf_callchain_entry *
157perf_callchain(struct perf_event *event, struct pt_regs *regs)
158{
159 int rctx;
160 struct perf_callchain_entry *entry;
161
162 int kernel = !event->attr.exclude_callchain_kernel;
163 int user = !event->attr.exclude_callchain_user;
164
165 if (!kernel && !user)
166 return NULL;
167
168 entry = get_callchain_entry(&rctx);
169 if (rctx == -1)
170 return NULL;
171
172 if (!entry)
173 goto exit_put;
174
175 entry->nr = 0;
176
177 if (kernel && !user_mode(regs)) {
178 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
179 perf_callchain_kernel(entry, regs);
180 }
181
182 if (user) {
183 if (!user_mode(regs)) {
184 if (current->mm)
185 regs = task_pt_regs(current);
186 else
187 regs = NULL;
188 }
189
190 if (regs) {
191 /*
192 * Disallow cross-task user callchains.
193 */
194 if (event->ctx->task && event->ctx->task != current)
195 goto exit_put;
196
197 perf_callchain_store(entry, PERF_CONTEXT_USER);
198 perf_callchain_user(entry, regs);
199 }
200 }
201
202exit_put:
203 put_callchain_entry(rctx);
204
205 return entry;
206}
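
alloc_callchain_buffers() in the deleted file above sizes a single header allocation with offsetof() over a per-CPU pointer array, then hangs one callchain buffer off each slot. A hedged sketch of that layout that compiles in userspace; the struct and sizes are illustrative, and calloc()/malloc() stand in for kzalloc()/kmalloc_node().

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Mirrors struct callchain_cpus_entries: one header allocation whose tail
 * is an array of per-CPU buffer pointers, sized with offsetof() over the
 * trailing array exactly as alloc_callchain_buffers() does above. */
struct cpus_entries {
        long refcount;                  /* the kernel keeps an rcu_head here */
        void *cpu_entries[];            /* one buffer pointer per possible CPU */
};

int main(void)
{
        int nr_cpu_ids = 8;             /* nr_cpu_ids in the kernel */

        /* A variable index inside offsetof() is a GCC/Clang extension the
         * kernel relies on; it is cpu_entries' offset + n * sizeof(void *). */
        size_t size = offsetof(struct cpus_entries, cpu_entries[nr_cpu_ids]);

        struct cpus_entries *entries = calloc(1, size);   /* kzalloc() */
        if (!entries)
                return 1;

        for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
                entries->cpu_entries[cpu] = malloc(64);   /* per-CPU buffer */
                if (!entries->cpu_entries[cpu])
                        goto fail;
        }

        printf("header %zu bytes, %d per-cpu buffers\n", size, nr_cpu_ids);
        /* The kernel tears this down through call_rcu() so that readers which
         * already dereferenced the pointer can finish safely. */
        return 0;

fail:
        for (int cpu = 0; cpu < nr_cpu_ids; cpu++)
                free(entries->cpu_entries[cpu]);
        free(entries);
        return 1;
}
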
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 301079d06f2..0f857782d06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -25,7 +25,6 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/export.h>
29#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
30#include <linux/hardirq.h> 29#include <linux/hardirq.h>
31#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -36,7 +35,6 @@
36#include <linux/perf_event.h> 35#include <linux/perf_event.h>
37#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h>
40 38
41#include "internal.h" 39#include "internal.h"
42 40
@@ -119,13 +117,6 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 PERF_FLAG_FD_OUTPUT |\ 117 PERF_FLAG_FD_OUTPUT |\
120 PERF_FLAG_PID_CGROUP) 118 PERF_FLAG_PID_CGROUP)
121 119
122/*
123 * branch priv levels that need permission checks
124 */
125#define PERF_SAMPLE_BRANCH_PERM_PLM \
126 (PERF_SAMPLE_BRANCH_KERNEL |\
127 PERF_SAMPLE_BRANCH_HV)
128
129enum event_type_t { 120enum event_type_t {
130 EVENT_FLEXIBLE = 0x1, 121 EVENT_FLEXIBLE = 0x1,
131 EVENT_PINNED = 0x2, 122 EVENT_PINNED = 0x2,
@@ -136,9 +127,8 @@ enum event_type_t {
136 * perf_sched_events : >0 events exist 127 * perf_sched_events : >0 events exist
137 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 128 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
138 */ 129 */
139struct static_key_deferred perf_sched_events __read_mostly; 130struct jump_label_key perf_sched_events __read_mostly;
140static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 131static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
141static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
142 132
143static atomic_t nr_mmap_events __read_mostly; 133static atomic_t nr_mmap_events __read_mostly;
144static atomic_t nr_comm_events __read_mostly; 134static atomic_t nr_comm_events __read_mostly;
@@ -194,9 +184,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
194static void update_context_time(struct perf_event_context *ctx); 184static void update_context_time(struct perf_event_context *ctx);
195static u64 perf_event_time(struct perf_event *event); 185static u64 perf_event_time(struct perf_event *event);
196 186
197static void ring_buffer_attach(struct perf_event *event,
198 struct ring_buffer *rb);
199
200void __weak perf_event_print_debug(void) { } 187void __weak perf_event_print_debug(void) { }
201 188
202extern __weak const char *perf_pmu_name(void) 189extern __weak const char *perf_pmu_name(void)
@@ -254,9 +241,9 @@ perf_cgroup_match(struct perf_event *event)
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 241 return !event->cgrp || event->cgrp == cpuctx->cgrp;
255} 242}
256 243
257static inline bool perf_tryget_cgroup(struct perf_event *event) 244static inline void perf_get_cgroup(struct perf_event *event)
258{ 245{
259 return css_tryget(&event->cgrp->css); 246 css_get(&event->cgrp->css);
260} 247}
261 248
262static inline void perf_put_cgroup(struct perf_event *event) 249static inline void perf_put_cgroup(struct perf_event *event)
@@ -372,8 +359,6 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 359
373 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
374 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
375 if (cpuctx->unique_pmu != pmu)
376 continue; /* ensure we process each cpuctx once */
377 362
378 /* 363 /*
379 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
@@ -397,10 +382,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
397 382
398 if (mode & PERF_CGROUP_SWIN) { 383 if (mode & PERF_CGROUP_SWIN) {
399 WARN_ON_ONCE(cpuctx->cgrp); 384 WARN_ON_ONCE(cpuctx->cgrp);
400 /* 385 /* set cgrp before ctxsw in to
401 * set cgrp before ctxsw in to allow 386 * allow event_filter_match() to not
402 * event_filter_match() to not have to pass 387 * have to pass task around
403 * task around
404 */ 388 */
405 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
406 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -471,13 +455,14 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
471{ 455{
472 struct perf_cgroup *cgrp; 456 struct perf_cgroup *cgrp;
473 struct cgroup_subsys_state *css; 457 struct cgroup_subsys_state *css;
474 struct fd f = fdget(fd); 458 struct file *file;
475 int ret = 0; 459 int ret = 0, fput_needed;
476 460
477 if (!f.file) 461 file = fget_light(fd, &fput_needed);
462 if (!file)
478 return -EBADF; 463 return -EBADF;
479 464
480 css = cgroup_css_from_dir(f.file, perf_subsys_id); 465 css = cgroup_css_from_dir(file, perf_subsys_id);
481 if (IS_ERR(css)) { 466 if (IS_ERR(css)) {
482 ret = PTR_ERR(css); 467 ret = PTR_ERR(css);
483 goto out; 468 goto out;
@@ -487,11 +472,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
487 event->cgrp = cgrp; 472 event->cgrp = cgrp;
488 473
489 /* must be done before we fput() the file */ 474 /* must be done before we fput() the file */
490 if (!perf_tryget_cgroup(event)) { 475 perf_get_cgroup(event);
491 event->cgrp = NULL;
492 ret = -ENOENT;
493 goto out;
494 }
495 476
496 /* 477 /*
497 * all events in a group must monitor 478 * all events in a group must monitor
@@ -503,7 +484,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
503 ret = -EINVAL; 484 ret = -EINVAL;
504 } 485 }
505out: 486out:
506 fdput(f); 487 fput_light(file, fput_needed);
507 return ret; 488 return ret;
508} 489}
509 490
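
The right-hand (pre-fdget) side of the hunks above resolves the cgroup fd with fget_light(), which returns the struct file plus a fput_needed flag saying whether a reference was actually taken; every exit path must then hand that flag back to fput_light(). A hedged sketch of the calling pattern with the two kernel helpers replaced by fake bodies so it compiles standalone; only the shape of the API is meant to match.

#include <stdio.h>

/* Stand-ins for the kernel API used above: fget_light() returns the file for
 * an fd and sets *fput_needed when it took a reference that fput_light()
 * must later drop. The signatures mirror the kernel; the bodies are fakes. */
struct file { int refcount; const void *private_data; };

static struct file files[4] = { { 1, "perf-event-0" } };

static struct file *fget_light(unsigned int fd, int *fput_needed)
{
        if (fd >= 4 || !files[fd].refcount)
                return NULL;
        files[fd].refcount++;           /* single-threaded callers may skip this */
        *fput_needed = 1;
        return &files[fd];
}

static void fput_light(struct file *file, int fput_needed)
{
        if (fput_needed)
                file->refcount--;
}

static int use_fd(unsigned int fd)
{
        int fput_needed = 0;
        struct file *file = fget_light(fd, &fput_needed);

        if (!file)
                return -9;              /* -EBADF in the kernel */

        printf("using %s\n", (const char *)file->private_data);

        fput_light(file, fput_needed);  /* every exit path must do this */
        return 0;
}

int main(void)
{
        use_fd(0);                      /* valid fd */
        use_fd(3);                      /* fails like an unknown fd */
        return 0;
}
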
@@ -830,7 +811,7 @@ static void update_event_times(struct perf_event *event)
830 * here. 811 * here.
831 */ 812 */
832 if (is_cgroup_event(event)) 813 if (is_cgroup_event(event))
833 run_end = perf_cgroup_event_time(event); 814 run_end = perf_event_time(event);
834 else if (ctx->is_active) 815 else if (ctx->is_active)
835 run_end = ctx->time; 816 run_end = ctx->time;
836 else 817 else
@@ -896,9 +877,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
896 if (is_cgroup_event(event)) 877 if (is_cgroup_event(event))
897 ctx->nr_cgroups++; 878 ctx->nr_cgroups++;
898 879
899 if (has_branch_stack(event))
900 ctx->nr_branch_stack++;
901
902 list_add_rcu(&event->event_entry, &ctx->event_list); 880 list_add_rcu(&event->event_entry, &ctx->event_list);
903 if (!ctx->nr_events) 881 if (!ctx->nr_events)
904 perf_pmu_rotate_start(ctx->pmu); 882 perf_pmu_rotate_start(ctx->pmu);
@@ -1038,9 +1016,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1038 cpuctx->cgrp = NULL; 1016 cpuctx->cgrp = NULL;
1039 } 1017 }
1040 1018
1041 if (has_branch_stack(event))
1042 ctx->nr_branch_stack--;
1043
1044 ctx->nr_events--; 1019 ctx->nr_events--;
1045 if (event->attr.inherit_stat) 1020 if (event->attr.inherit_stat)
1046 ctx->nr_stat--; 1021 ctx->nr_stat--;
@@ -1151,8 +1126,6 @@ event_sched_out(struct perf_event *event,
1151 if (!is_software_event(event)) 1126 if (!is_software_event(event))
1152 cpuctx->active_oncpu--; 1127 cpuctx->active_oncpu--;
1153 ctx->nr_active--; 1128 ctx->nr_active--;
1154 if (event->attr.freq && event->attr.sample_freq)
1155 ctx->nr_freq--;
1156 if (event->attr.exclusive || !cpuctx->active_oncpu) 1129 if (event->attr.exclusive || !cpuctx->active_oncpu)
1157 cpuctx->exclusive = 0; 1130 cpuctx->exclusive = 0;
1158} 1131}
@@ -1256,7 +1229,7 @@ retry:
1256/* 1229/*
1257 * Cross CPU call to disable a performance event 1230 * Cross CPU call to disable a performance event
1258 */ 1231 */
1259int __perf_event_disable(void *info) 1232static int __perf_event_disable(void *info)
1260{ 1233{
1261 struct perf_event *event = info; 1234 struct perf_event *event = info;
1262 struct perf_event_context *ctx = event->ctx; 1235 struct perf_event_context *ctx = event->ctx;
@@ -1348,7 +1321,6 @@ retry:
1348 } 1321 }
1349 raw_spin_unlock_irq(&ctx->lock); 1322 raw_spin_unlock_irq(&ctx->lock);
1350} 1323}
1351EXPORT_SYMBOL_GPL(perf_event_disable);
1352 1324
1353static void perf_set_shadow_time(struct perf_event *event, 1325static void perf_set_shadow_time(struct perf_event *event,
1354 struct perf_event_context *ctx, 1326 struct perf_event_context *ctx,
@@ -1430,8 +1402,6 @@ event_sched_in(struct perf_event *event,
1430 if (!is_software_event(event)) 1402 if (!is_software_event(event))
1431 cpuctx->active_oncpu++; 1403 cpuctx->active_oncpu++;
1432 ctx->nr_active++; 1404 ctx->nr_active++;
1433 if (event->attr.freq && event->attr.sample_freq)
1434 ctx->nr_freq++;
1435 1405
1436 if (event->attr.exclusive) 1406 if (event->attr.exclusive)
1437 cpuctx->exclusive = 1; 1407 cpuctx->exclusive = 1;
@@ -1648,8 +1618,6 @@ perf_install_in_context(struct perf_event_context *ctx,
1648 lockdep_assert_held(&ctx->mutex); 1618 lockdep_assert_held(&ctx->mutex);
1649 1619
1650 event->ctx = ctx; 1620 event->ctx = ctx;
1651 if (event->cpu != -1)
1652 event->cpu = cpu;
1653 1621
1654 if (!task) { 1622 if (!task) {
1655 /* 1623 /*
@@ -1690,7 +1658,8 @@ retry:
1690 * Note: this works for group members as well as group leaders 1658 * Note: this works for group members as well as group leaders
1691 * since the non-leader members' sibling_lists will be empty. 1659 * since the non-leader members' sibling_lists will be empty.
1692 */ 1660 */
1693static void __perf_event_mark_enabled(struct perf_event *event) 1661static void __perf_event_mark_enabled(struct perf_event *event,
1662 struct perf_event_context *ctx)
1694{ 1663{
1695 struct perf_event *sub; 1664 struct perf_event *sub;
1696 u64 tstamp = perf_event_time(event); 1665 u64 tstamp = perf_event_time(event);
@@ -1728,7 +1697,7 @@ static int __perf_event_enable(void *info)
1728 */ 1697 */
1729 perf_cgroup_set_timestamp(current, ctx); 1698 perf_cgroup_set_timestamp(current, ctx);
1730 1699
1731 __perf_event_mark_enabled(event); 1700 __perf_event_mark_enabled(event, ctx);
1732 1701
1733 if (!event_filter_match(event)) { 1702 if (!event_filter_match(event)) {
1734 if (is_cgroup_event(event)) 1703 if (is_cgroup_event(event))
@@ -1809,7 +1778,7 @@ void perf_event_enable(struct perf_event *event)
1809 1778
1810retry: 1779retry:
1811 if (!ctx->is_active) { 1780 if (!ctx->is_active) {
1812 __perf_event_mark_enabled(event); 1781 __perf_event_mark_enabled(event, ctx);
1813 goto out; 1782 goto out;
1814 } 1783 }
1815 1784
@@ -1836,7 +1805,6 @@ retry:
1836out: 1805out:
1837 raw_spin_unlock_irq(&ctx->lock); 1806 raw_spin_unlock_irq(&ctx->lock);
1838} 1807}
1839EXPORT_SYMBOL_GPL(perf_event_enable);
1840 1808
1841int perf_event_refresh(struct perf_event *event, int refresh) 1809int perf_event_refresh(struct perf_event *event, int refresh)
1842{ 1810{
@@ -2202,10 +2170,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2202 */ 2170 */
2203 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2171 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2204 2172
2205 if (ctx->nr_events) 2173 perf_event_sched_in(cpuctx, ctx, task);
2206 cpuctx->task_ctx = ctx;
2207 2174
2208 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); 2175 cpuctx->task_ctx = ctx;
2209 2176
2210 perf_pmu_enable(ctx->pmu); 2177 perf_pmu_enable(ctx->pmu);
2211 perf_ctx_unlock(cpuctx, ctx); 2178 perf_ctx_unlock(cpuctx, ctx);
@@ -2218,66 +2185,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2218} 2185}
2219 2186
2220/* 2187/*
2221 * When sampling the branch stack in system-wide, it may be necessary
2222 * to flush the stack on context switch. This happens when the branch
2223 * stack does not tag its entries with the pid of the current task.
2224 * Otherwise it becomes impossible to associate a branch entry with a
2225 * task. This ambiguity is more likely to appear when the branch stack
2226 * supports priv level filtering and the user sets it to monitor only
2227 * at the user level (which could be a useful measurement in system-wide
2228 * mode). In that case, the risk is high of having a branch stack with
2229 * branch from multiple tasks. Flushing may mean dropping the existing
2230 * entries or stashing them somewhere in the PMU specific code layer.
2231 *
2232 * This function provides the context switch callback to the lower code
2233 * layer. It is invoked ONLY when there is at least one system-wide context
2234 * with at least one active event using taken branch sampling.
2235 */
2236static void perf_branch_stack_sched_in(struct task_struct *prev,
2237 struct task_struct *task)
2238{
2239 struct perf_cpu_context *cpuctx;
2240 struct pmu *pmu;
2241 unsigned long flags;
2242
2243 /* no need to flush branch stack if not changing task */
2244 if (prev == task)
2245 return;
2246
2247 local_irq_save(flags);
2248
2249 rcu_read_lock();
2250
2251 list_for_each_entry_rcu(pmu, &pmus, entry) {
2252 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2253
2254 /*
2255 * check if the context has at least one
2256 * event using PERF_SAMPLE_BRANCH_STACK
2257 */
2258 if (cpuctx->ctx.nr_branch_stack > 0
2259 && pmu->flush_branch_stack) {
2260
2261 pmu = cpuctx->ctx.pmu;
2262
2263 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2264
2265 perf_pmu_disable(pmu);
2266
2267 pmu->flush_branch_stack();
2268
2269 perf_pmu_enable(pmu);
2270
2271 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2272 }
2273 }
2274
2275 rcu_read_unlock();
2276
2277 local_irq_restore(flags);
2278}
2279
2280/*
2281 * Called from scheduler to add the events of the current task 2188 * Called from scheduler to add the events of the current task
2282 * with interrupts disabled. 2189 * with interrupts disabled.
2283 * 2190 *
@@ -2308,10 +2215,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2308 */ 2215 */
2309 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2216 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2310 perf_cgroup_sched_in(prev, task); 2217 perf_cgroup_sched_in(prev, task);
2311
2312 /* check for system-wide branch_stack events */
2313 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2314 perf_branch_stack_sched_in(prev, task);
2315} 2218}
2316 2219
2317static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2220static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2387,10 +2290,7 @@ do { \
2387 return div64_u64(dividend, divisor); 2290 return div64_u64(dividend, divisor);
2388} 2291}
2389 2292
2390static DEFINE_PER_CPU(int, perf_throttled_count); 2293static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
2391static DEFINE_PER_CPU(u64, perf_throttled_seq);
2392
2393static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2394{ 2294{
2395 struct hw_perf_event *hwc = &event->hw; 2295 struct hw_perf_event *hwc = &event->hw;
2396 s64 period, sample_period; 2296 s64 period, sample_period;
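
perf_adjust_period() above recomputes hw.sample_period from how many events (`count`) arrived during the last `nsec` of wall time versus the requested sample_freq: roughly, events per second = count * 1e9 / nsec, and the new period is that rate divided by the target frequency. A small worked sketch of the arithmetic with made-up numbers; the kernel version goes through div64_u64() and extra fixed-point care that this omits.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* New sample period so that `freq` samples/sec are taken when events arrive
 * at the observed rate (count events over nsec nanoseconds). */
static uint64_t adjust_period(uint64_t nsec, uint64_t count, uint64_t freq)
{
        uint64_t rate = count * NSEC_PER_SEC / nsec;    /* events per second */
        return rate / freq;                             /* events per sample */
}

int main(void)
{
        /* 4,000,000 events in one 4 ms tick, with sample_freq = 1000 Hz */
        uint64_t period = adjust_period(4000000ULL, 4000000ULL, 1000ULL);

        printf("new sample_period = %llu events\n",
               (unsigned long long)period);             /* 1000000 */
        return 0;
}
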
@@ -2409,40 +2309,19 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
2409 hwc->sample_period = sample_period; 2309 hwc->sample_period = sample_period;
2410 2310
2411 if (local64_read(&hwc->period_left) > 8*sample_period) { 2311 if (local64_read(&hwc->period_left) > 8*sample_period) {
2412 if (disable) 2312 event->pmu->stop(event, PERF_EF_UPDATE);
2413 event->pmu->stop(event, PERF_EF_UPDATE);
2414
2415 local64_set(&hwc->period_left, 0); 2313 local64_set(&hwc->period_left, 0);
2416 2314 event->pmu->start(event, PERF_EF_RELOAD);
2417 if (disable)
2418 event->pmu->start(event, PERF_EF_RELOAD);
2419 } 2315 }
2420} 2316}
2421 2317
2422/* 2318static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2423 * combine freq adjustment with unthrottling to avoid two passes over the
2424 * events. At the same time, make sure, having freq events does not change
2425 * the rate of unthrottling as that would introduce bias.
2426 */
2427static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2428 int needs_unthr)
2429{ 2319{
2430 struct perf_event *event; 2320 struct perf_event *event;
2431 struct hw_perf_event *hwc; 2321 struct hw_perf_event *hwc;
2432 u64 now, period = TICK_NSEC; 2322 u64 interrupts, now;
2433 s64 delta; 2323 s64 delta;
2434 2324
2435 /*
2436 * only need to iterate over all events iff:
2437 * - context have events in frequency mode (needs freq adjust)
2438 * - there are events to unthrottle on this cpu
2439 */
2440 if (!(ctx->nr_freq || needs_unthr))
2441 return;
2442
2443 raw_spin_lock(&ctx->lock);
2444 perf_pmu_disable(ctx->pmu);
2445
2446 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2325 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2447 if (event->state != PERF_EVENT_STATE_ACTIVE) 2326 if (event->state != PERF_EVENT_STATE_ACTIVE)
2448 continue; 2327 continue;
@@ -2452,8 +2331,13 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2452 2331
2453 hwc = &event->hw; 2332 hwc = &event->hw;
2454 2333
2455 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2334 interrupts = hwc->interrupts;
2456 hwc->interrupts = 0; 2335 hwc->interrupts = 0;
2336
2337 /*
2338 * unthrottle events on the tick
2339 */
2340 if (interrupts == MAX_INTERRUPTS) {
2457 perf_log_throttle(event, 1); 2341 perf_log_throttle(event, 1);
2458 event->pmu->start(event, 0); 2342 event->pmu->start(event, 0);
2459 } 2343 }
@@ -2461,30 +2345,14 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2461 if (!event->attr.freq || !event->attr.sample_freq) 2345 if (!event->attr.freq || !event->attr.sample_freq)
2462 continue; 2346 continue;
2463 2347
2464 /* 2348 event->pmu->read(event);
2465 * stop the event and update event->count
2466 */
2467 event->pmu->stop(event, PERF_EF_UPDATE);
2468
2469 now = local64_read(&event->count); 2349 now = local64_read(&event->count);
2470 delta = now - hwc->freq_count_stamp; 2350 delta = now - hwc->freq_count_stamp;
2471 hwc->freq_count_stamp = now; 2351 hwc->freq_count_stamp = now;
2472 2352
2473 /*
2474 * restart the event
2475 * reload only if value has changed
2476 * we have stopped the event so tell that
2477 * to perf_adjust_period() to avoid stopping it
2478 * twice.
2479 */
2480 if (delta > 0) 2353 if (delta > 0)
2481 perf_adjust_period(event, period, delta, false); 2354 perf_adjust_period(event, period, delta);
2482
2483 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2484 } 2355 }
2485
2486 perf_pmu_enable(ctx->pmu);
2487 raw_spin_unlock(&ctx->lock);
2488} 2356}
2489 2357
2490/* 2358/*
@@ -2507,6 +2375,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
2507 */ 2375 */
2508static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2376static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2509{ 2377{
2378 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2510 struct perf_event_context *ctx = NULL; 2379 struct perf_event_context *ctx = NULL;
2511 int rotate = 0, remove = 1; 2380 int rotate = 0, remove = 1;
2512 2381
@@ -2523,11 +2392,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2523 rotate = 1; 2392 rotate = 1;
2524 } 2393 }
2525 2394
2526 if (!rotate)
2527 goto done;
2528
2529 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2395 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2530 perf_pmu_disable(cpuctx->ctx.pmu); 2396 perf_pmu_disable(cpuctx->ctx.pmu);
2397 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2398 if (ctx)
2399 perf_ctx_adjust_freq(ctx, interval);
2400
2401 if (!rotate)
2402 goto done;
2531 2403
2532 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2404 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2533 if (ctx) 2405 if (ctx)
@@ -2539,33 +2411,22 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2539 2411
2540 perf_event_sched_in(cpuctx, ctx, current); 2412 perf_event_sched_in(cpuctx, ctx, current);
2541 2413
2542 perf_pmu_enable(cpuctx->ctx.pmu);
2543 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2544done: 2414done:
2545 if (remove) 2415 if (remove)
2546 list_del_init(&cpuctx->rotation_list); 2416 list_del_init(&cpuctx->rotation_list);
2417
2418 perf_pmu_enable(cpuctx->ctx.pmu);
2419 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2547} 2420}
2548 2421
2549void perf_event_task_tick(void) 2422void perf_event_task_tick(void)
2550{ 2423{
2551 struct list_head *head = &__get_cpu_var(rotation_list); 2424 struct list_head *head = &__get_cpu_var(rotation_list);
2552 struct perf_cpu_context *cpuctx, *tmp; 2425 struct perf_cpu_context *cpuctx, *tmp;
2553 struct perf_event_context *ctx;
2554 int throttled;
2555 2426
2556 WARN_ON(!irqs_disabled()); 2427 WARN_ON(!irqs_disabled());
2557 2428
2558 __this_cpu_inc(perf_throttled_seq);
2559 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2560
2561 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 2429 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2562 ctx = &cpuctx->ctx;
2563 perf_adjust_freq_unthr_context(ctx, throttled);
2564
2565 ctx = cpuctx->task_ctx;
2566 if (ctx)
2567 perf_adjust_freq_unthr_context(ctx, throttled);
2568
2569 if (cpuctx->jiffies_interval == 1 || 2430 if (cpuctx->jiffies_interval == 1 ||
2570 !(jiffies % cpuctx->jiffies_interval)) 2431 !(jiffies % cpuctx->jiffies_interval))
2571 perf_rotate_context(cpuctx); 2432 perf_rotate_context(cpuctx);
@@ -2582,7 +2443,7 @@ static int event_enable_on_exec(struct perf_event *event,
2582 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2443 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2583 return 0; 2444 return 0;
2584 2445
2585 __perf_event_mark_enabled(event); 2446 __perf_event_mark_enabled(event, ctx);
2586 2447
2587 return 1; 2448 return 1;
2588} 2449}
@@ -2614,7 +2475,13 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2614 raw_spin_lock(&ctx->lock); 2475 raw_spin_lock(&ctx->lock);
2615 task_ctx_sched_out(ctx); 2476 task_ctx_sched_out(ctx);
2616 2477
2617 list_for_each_entry(event, &ctx->event_list, event_entry) { 2478 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2479 ret = event_enable_on_exec(event, ctx);
2480 if (ret)
2481 enabled = 1;
2482 }
2483
2484 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2618 ret = event_enable_on_exec(event, ctx); 2485 ret = event_enable_on_exec(event, ctx);
2619 if (ret) 2486 if (ret)
2620 enabled = 1; 2487 enabled = 1;
@@ -2702,6 +2569,215 @@ static u64 perf_event_read(struct perf_event *event)
2702} 2569}
2703 2570
2704/* 2571/*
2572 * Callchain support
2573 */
2574
2575struct callchain_cpus_entries {
2576 struct rcu_head rcu_head;
2577 struct perf_callchain_entry *cpu_entries[0];
2578};
2579
2580static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2581static atomic_t nr_callchain_events;
2582static DEFINE_MUTEX(callchain_mutex);
2583struct callchain_cpus_entries *callchain_cpus_entries;
2584
2585
2586__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2587 struct pt_regs *regs)
2588{
2589}
2590
2591__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596static void release_callchain_buffers_rcu(struct rcu_head *head)
2597{
2598 struct callchain_cpus_entries *entries;
2599 int cpu;
2600
2601 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2602
2603 for_each_possible_cpu(cpu)
2604 kfree(entries->cpu_entries[cpu]);
2605
2606 kfree(entries);
2607}
2608
2609static void release_callchain_buffers(void)
2610{
2611 struct callchain_cpus_entries *entries;
2612
2613 entries = callchain_cpus_entries;
2614 rcu_assign_pointer(callchain_cpus_entries, NULL);
2615 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2616}
2617
2618static int alloc_callchain_buffers(void)
2619{
2620 int cpu;
2621 int size;
2622 struct callchain_cpus_entries *entries;
2623
2624 /*
2625 * We can't use the percpu allocation API for data that can be
2626 * accessed from NMI. Use a temporary manual per cpu allocation
2627 * until that gets sorted out.
2628 */
2629 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2630
2631 entries = kzalloc(size, GFP_KERNEL);
2632 if (!entries)
2633 return -ENOMEM;
2634
2635 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2636
2637 for_each_possible_cpu(cpu) {
2638 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2639 cpu_to_node(cpu));
2640 if (!entries->cpu_entries[cpu])
2641 goto fail;
2642 }
2643
2644 rcu_assign_pointer(callchain_cpus_entries, entries);
2645
2646 return 0;
2647
2648fail:
2649 for_each_possible_cpu(cpu)
2650 kfree(entries->cpu_entries[cpu]);
2651 kfree(entries);
2652
2653 return -ENOMEM;
2654}
2655
2656static int get_callchain_buffers(void)
2657{
2658 int err = 0;
2659 int count;
2660
2661 mutex_lock(&callchain_mutex);
2662
2663 count = atomic_inc_return(&nr_callchain_events);
2664 if (WARN_ON_ONCE(count < 1)) {
2665 err = -EINVAL;
2666 goto exit;
2667 }
2668
2669 if (count > 1) {
2670 /* If the allocation failed, give up */
2671 if (!callchain_cpus_entries)
2672 err = -ENOMEM;
2673 goto exit;
2674 }
2675
2676 err = alloc_callchain_buffers();
2677 if (err)
2678 release_callchain_buffers();
2679exit:
2680 mutex_unlock(&callchain_mutex);
2681
2682 return err;
2683}
2684
2685static void put_callchain_buffers(void)
2686{
2687 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2688 release_callchain_buffers();
2689 mutex_unlock(&callchain_mutex);
2690 }
2691}
2692
2693static int get_recursion_context(int *recursion)
2694{
2695 int rctx;
2696
2697 if (in_nmi())
2698 rctx = 3;
2699 else if (in_irq())
2700 rctx = 2;
2701 else if (in_softirq())
2702 rctx = 1;
2703 else
2704 rctx = 0;
2705
2706 if (recursion[rctx])
2707 return -1;
2708
2709 recursion[rctx]++;
2710 barrier();
2711
2712 return rctx;
2713}
2714
2715static inline void put_recursion_context(int *recursion, int rctx)
2716{
2717 barrier();
2718 recursion[rctx]--;
2719}
2720
2721static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2722{
2723 int cpu;
2724 struct callchain_cpus_entries *entries;
2725
2726 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2727 if (*rctx == -1)
2728 return NULL;
2729
2730 entries = rcu_dereference(callchain_cpus_entries);
2731 if (!entries)
2732 return NULL;
2733
2734 cpu = smp_processor_id();
2735
2736 return &entries->cpu_entries[cpu][*rctx];
2737}
2738
2739static void
2740put_callchain_entry(int rctx)
2741{
2742 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2743}
2744
2745static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2746{
2747 int rctx;
2748 struct perf_callchain_entry *entry;
2749
2750
2751 entry = get_callchain_entry(&rctx);
2752 if (rctx == -1)
2753 return NULL;
2754
2755 if (!entry)
2756 goto exit_put;
2757
2758 entry->nr = 0;
2759
2760 if (!user_mode(regs)) {
2761 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2762 perf_callchain_kernel(entry, regs);
2763 if (current->mm)
2764 regs = task_pt_regs(current);
2765 else
2766 regs = NULL;
2767 }
2768
2769 if (regs) {
2770 perf_callchain_store(entry, PERF_CONTEXT_USER);
2771 perf_callchain_user(entry, regs);
2772 }
2773
2774exit_put:
2775 put_callchain_entry(rctx);
2776
2777 return entry;
2778}
2779
2780/*
2705 * Initialize the perf_event context in a task_struct: 2781 * Initialize the perf_event context in a task_struct:
2706 */ 2782 */
2707static void __perf_event_init_context(struct perf_event_context *ctx) 2783static void __perf_event_init_context(struct perf_event_context *ctx)
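
get_recursion_context()/put_recursion_context() above pick one of four per-CPU slots (task, softirq, hardirq, NMI) and refuse to nest within the same slot, so a callchain capture that interrupts another capture at the same level simply bails out. A hedged userspace sketch of that guard, with the in_nmi()/in_irq()/in_softirq() detection replaced by an explicit argument and the per-CPU array reduced to a single global:

#include <stdio.h>

/* Slot 0 = task, 1 = softirq, 2 = hardirq, 3 = NMI, as in the code above. */
static int recursion[4];

static int get_recursion_context(int rctx)
{
        if (recursion[rctx])
                return -1;              /* already capturing at this level */
        recursion[rctx]++;
        return rctx;
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int a = get_recursion_context(2);   /* hardirq capture starts */
        int b = get_recursion_context(2);   /* nested hardirq capture: refused */
        int c = get_recursion_context(3);   /* NMI on top of it: fine */

        printf("a=%d b=%d c=%d\n", a, b, c);  /* a=2 b=-1 c=3 */

        if (c >= 0)
                put_recursion_context(c);
        if (a >= 0)
                put_recursion_context(a);
        return 0;
}
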
@@ -2865,7 +2941,7 @@ static void free_event(struct perf_event *event)
2865 2941
2866 if (!event->parent) { 2942 if (!event->parent) {
2867 if (event->attach_state & PERF_ATTACH_TASK) 2943 if (event->attach_state & PERF_ATTACH_TASK)
2868 static_key_slow_dec_deferred(&perf_sched_events); 2944 jump_label_dec(&perf_sched_events);
2869 if (event->attr.mmap || event->attr.mmap_data) 2945 if (event->attr.mmap || event->attr.mmap_data)
2870 atomic_dec(&nr_mmap_events); 2946 atomic_dec(&nr_mmap_events);
2871 if (event->attr.comm) 2947 if (event->attr.comm)
@@ -2876,15 +2952,7 @@ static void free_event(struct perf_event *event)
2876 put_callchain_buffers(); 2952 put_callchain_buffers();
2877 if (is_cgroup_event(event)) { 2953 if (is_cgroup_event(event)) {
2878 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2954 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2879 static_key_slow_dec_deferred(&perf_sched_events); 2955 jump_label_dec(&perf_sched_events);
2880 }
2881
2882 if (has_branch_stack(event)) {
2883 static_key_slow_dec_deferred(&perf_sched_events);
2884 /* is system-wide event */
2885 if (!(event->attach_state & PERF_ATTACH_TASK))
2886 atomic_dec(&per_cpu(perf_branch_stack_events,
2887 event->cpu));
2888 } 2956 }
2889 } 2957 }
2890 2958
@@ -2938,12 +3006,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2938/* 3006/*
2939 * Called when the last reference to the file is gone. 3007 * Called when the last reference to the file is gone.
2940 */ 3008 */
2941static void put_event(struct perf_event *event) 3009static int perf_release(struct inode *inode, struct file *file)
2942{ 3010{
3011 struct perf_event *event = file->private_data;
2943 struct task_struct *owner; 3012 struct task_struct *owner;
2944 3013
2945 if (!atomic_long_dec_and_test(&event->refcount)) 3014 file->private_data = NULL;
2946 return;
2947 3015
2948 rcu_read_lock(); 3016 rcu_read_lock();
2949 owner = ACCESS_ONCE(event->owner); 3017 owner = ACCESS_ONCE(event->owner);
@@ -2978,13 +3046,7 @@ static void put_event(struct perf_event *event)
2978 put_task_struct(owner); 3046 put_task_struct(owner);
2979 } 3047 }
2980 3048
2981 perf_event_release_kernel(event); 3049 return perf_event_release_kernel(event);
2982}
2983
2984static int perf_release(struct inode *inode, struct file *file)
2985{
2986 put_event(file->private_data);
2987 return 0;
2988} 3050}
2989 3051
2990u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3052u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3127,33 +3189,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3127 struct ring_buffer *rb; 3189 struct ring_buffer *rb;
3128 unsigned int events = POLL_HUP; 3190 unsigned int events = POLL_HUP;
3129 3191
3130 /*
3131 * Race between perf_event_set_output() and perf_poll(): perf_poll()
3132 * grabs the rb reference but perf_event_set_output() overrides it.
3133 * Here is the timeline for two threads T1, T2:
3134 * t0: T1, rb = rcu_dereference(event->rb)
3135 * t1: T2, old_rb = event->rb
3136 * t2: T2, event->rb = new rb
3137 * t3: T2, ring_buffer_detach(old_rb)
3138 * t4: T1, ring_buffer_attach(rb1)
3139 * t5: T1, poll_wait(event->waitq)
3140 *
3141 * To avoid this problem, we grab mmap_mutex in perf_poll()
3142 * thereby ensuring that the assignment of the new ring buffer
3143 * and the detachment of the old buffer appear atomic to perf_poll()
3144 */
3145 mutex_lock(&event->mmap_mutex);
3146
3147 rcu_read_lock(); 3192 rcu_read_lock();
3148 rb = rcu_dereference(event->rb); 3193 rb = rcu_dereference(event->rb);
3149 if (rb) { 3194 if (rb)
3150 ring_buffer_attach(event, rb);
3151 events = atomic_xchg(&rb->poll, 0); 3195 events = atomic_xchg(&rb->poll, 0);
3152 }
3153 rcu_read_unlock(); 3196 rcu_read_unlock();
3154 3197
3155 mutex_unlock(&event->mmap_mutex);
3156
3157 poll_wait(file, &event->waitq, wait); 3198 poll_wait(file, &event->waitq, wait);
3158 3199
3159 return events; 3200 return events;
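
The removed comment above spells out the T1/T2 timeline: perf_poll() dereferences event->rb while perf_event_set_output() may swap in a new ring buffer and detach the old one, so the left-hand code takes event->mmap_mutex around the dereference-and-attach to make the swap appear atomic to the poller. A hedged pthreads sketch of that shape (compile with -pthread); the kernel additionally keeps RCU for lockless readers, which this leaves out, and all names are illustrative.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the kernel objects: the event publishes a ring-buffer
 * pointer, and mmap_mutex plays the role the removed comment gives it. */
struct ring_buffer { int id; };

struct event {
        struct ring_buffer *rb;
        pthread_mutex_t mmap_mutex;
};

/* Poll side: look up and "attach" to the current buffer under the mutex. */
static void poll_event(struct event *ev)
{
        pthread_mutex_lock(&ev->mmap_mutex);
        if (ev->rb)
                printf("poll: attached to rb %d\n", ev->rb->id);
        pthread_mutex_unlock(&ev->mmap_mutex);
}

/* set_output side: detach the old buffer and publish the new one under the
 * same mutex, so a concurrent poller never attaches to a buffer that is in
 * the middle of being detached (the t0..t5 timeline above). */
static void set_output(struct event *ev, struct ring_buffer *new_rb)
{
        pthread_mutex_lock(&ev->mmap_mutex);
        if (ev->rb)
                printf("set_output: detached rb %d\n", ev->rb->id);
        ev->rb = new_rb;
        pthread_mutex_unlock(&ev->mmap_mutex);
}

int main(void)
{
        struct ring_buffer rb1 = { 1 }, rb2 = { 2 };
        struct event ev = { &rb1, PTHREAD_MUTEX_INITIALIZER };

        poll_event(&ev);        /* in the kernel this races with set_output() */
        set_output(&ev, &rb2);
        poll_event(&ev);
        return 0;
}
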
@@ -3196,8 +3237,9 @@ static void perf_event_for_each(struct perf_event *event,
3196 event = event->group_leader; 3237 event = event->group_leader;
3197 3238
3198 perf_event_for_each_child(event, func); 3239 perf_event_for_each_child(event, func);
3240 func(event);
3199 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3241 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3200 perf_event_for_each_child(sibling, func); 3242 perf_event_for_each_child(event, func);
3201 mutex_unlock(&ctx->mutex); 3243 mutex_unlock(&ctx->mutex);
3202} 3244}
3203 3245
@@ -3236,18 +3278,21 @@ unlock:
3236 3278
3237static const struct file_operations perf_fops; 3279static const struct file_operations perf_fops;
3238 3280
3239static inline int perf_fget_light(int fd, struct fd *p) 3281static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3240{ 3282{
3241 struct fd f = fdget(fd); 3283 struct file *file;
3242 if (!f.file)
3243 return -EBADF;
3244 3284
3245 if (f.file->f_op != &perf_fops) { 3285 file = fget_light(fd, fput_needed);
3246 fdput(f); 3286 if (!file)
3247 return -EBADF; 3287 return ERR_PTR(-EBADF);
3288
3289 if (file->f_op != &perf_fops) {
3290 fput_light(file, *fput_needed);
3291 *fput_needed = 0;
3292 return ERR_PTR(-EBADF);
3248 } 3293 }
3249 *p = f; 3294
3250 return 0; 3295 return file->private_data;
3251} 3296}
3252 3297
3253static int perf_event_set_output(struct perf_event *event, 3298static int perf_event_set_output(struct perf_event *event,
@@ -3279,19 +3324,20 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3279 3324
3280 case PERF_EVENT_IOC_SET_OUTPUT: 3325 case PERF_EVENT_IOC_SET_OUTPUT:
3281 { 3326 {
3327 struct perf_event *output_event = NULL;
3328 int fput_needed = 0;
3282 int ret; 3329 int ret;
3330
3283 if (arg != -1) { 3331 if (arg != -1) {
3284 struct perf_event *output_event; 3332 output_event = perf_fget_light(arg, &fput_needed);
3285 struct fd output; 3333 if (IS_ERR(output_event))
3286 ret = perf_fget_light(arg, &output); 3334 return PTR_ERR(output_event);
3287 if (ret)
3288 return ret;
3289 output_event = output.file->private_data;
3290 ret = perf_event_set_output(event, output_event);
3291 fdput(output);
3292 } else {
3293 ret = perf_event_set_output(event, NULL);
3294 } 3335 }
3336
3337 ret = perf_event_set_output(event, output_event);
3338 if (output_event)
3339 fput_light(output_event->filp, fput_needed);
3340
3295 return ret; 3341 return ret;
3296 } 3342 }
3297 3343
@@ -3334,6 +3380,10 @@ int perf_event_task_disable(void)
3334 return 0; 3380 return 0;
3335} 3381}
3336 3382
3383#ifndef PERF_EVENT_INDEX_OFFSET
3384# define PERF_EVENT_INDEX_OFFSET 0
3385#endif
3386
3337static int perf_event_index(struct perf_event *event) 3387static int perf_event_index(struct perf_event *event)
3338{ 3388{
3339 if (event->hw.state & PERF_HES_STOPPED) 3389 if (event->hw.state & PERF_HES_STOPPED)
@@ -3342,26 +3392,21 @@ static int perf_event_index(struct perf_event *event)
3342 if (event->state != PERF_EVENT_STATE_ACTIVE) 3392 if (event->state != PERF_EVENT_STATE_ACTIVE)
3343 return 0; 3393 return 0;
3344 3394
3345 return event->pmu->event_idx(event); 3395 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3346} 3396}
3347 3397
3348static void calc_timer_values(struct perf_event *event, 3398static void calc_timer_values(struct perf_event *event,
3349 u64 *now,
3350 u64 *enabled, 3399 u64 *enabled,
3351 u64 *running) 3400 u64 *running)
3352{ 3401{
3353 u64 ctx_time; 3402 u64 now, ctx_time;
3354 3403
3355 *now = perf_clock(); 3404 now = perf_clock();
3356 ctx_time = event->shadow_ctx_time + *now; 3405 ctx_time = event->shadow_ctx_time + now;
3357 *enabled = ctx_time - event->tstamp_enabled; 3406 *enabled = ctx_time - event->tstamp_enabled;
3358 *running = ctx_time - event->tstamp_running; 3407 *running = ctx_time - event->tstamp_running;
3359} 3408}
3360 3409
3361void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3362{
3363}
3364
3365/* 3410/*
3366 * Callers need to ensure there can be no nesting of this function, otherwise 3411 * Callers need to ensure there can be no nesting of this function, otherwise
3367 * the seqlock logic goes bad. We can not serialize this because the arch 3412 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3371,7 +3416,7 @@ void perf_event_update_userpage(struct perf_event *event)
3371{ 3416{
3372 struct perf_event_mmap_page *userpg; 3417 struct perf_event_mmap_page *userpg;
3373 struct ring_buffer *rb; 3418 struct ring_buffer *rb;
3374 u64 enabled, running, now; 3419 u64 enabled, running;
3375 3420
3376 rcu_read_lock(); 3421 rcu_read_lock();
3377 /* 3422 /*
@@ -3383,7 +3428,7 @@ void perf_event_update_userpage(struct perf_event *event)
3383 * because of locking issue as we can be called in 3428 * because of locking issue as we can be called in
3384 * NMI context 3429 * NMI context
3385 */ 3430 */
3386 calc_timer_values(event, &now, &enabled, &running); 3431 calc_timer_values(event, &enabled, &running);
3387 rb = rcu_dereference(event->rb); 3432 rb = rcu_dereference(event->rb);
3388 if (!rb) 3433 if (!rb)
3389 goto unlock; 3434 goto unlock;
@@ -3399,7 +3444,7 @@ void perf_event_update_userpage(struct perf_event *event)
3399 barrier(); 3444 barrier();
3400 userpg->index = perf_event_index(event); 3445 userpg->index = perf_event_index(event);
3401 userpg->offset = perf_event_count(event); 3446 userpg->offset = perf_event_count(event);
3402 if (userpg->index) 3447 if (event->state == PERF_EVENT_STATE_ACTIVE)
3403 userpg->offset -= local64_read(&event->hw.prev_count); 3448 userpg->offset -= local64_read(&event->hw.prev_count);
3404 3449
3405 userpg->time_enabled = enabled + 3450 userpg->time_enabled = enabled +
@@ -3408,8 +3453,6 @@ void perf_event_update_userpage(struct perf_event *event)
3408 userpg->time_running = running + 3453 userpg->time_running = running +
3409 atomic64_read(&event->child_total_time_running); 3454 atomic64_read(&event->child_total_time_running);
3410 3455
3411 arch_perf_update_userpage(userpg, now);
3412
3413 barrier(); 3456 barrier();
3414 ++userpg->lock; 3457 ++userpg->lock;
3415 preempt_enable(); 3458 preempt_enable();
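
Aside (not part of the diff): the ++userpg->lock / barrier() pairs in this hunk form a sequence count, so user space can read the mmap'ed page without taking a lock and simply retry if a write was in flight. The user-space analogue below follows the standard C11 seqlock recipe, with atomics and fences standing in for the kernel's barrier() and ordering conventions; the struct and field names are invented, and it assumes a single, non-nesting writer, as the comment in the surrounding function warns.

#include <stdatomic.h>
#include <stdio.h>

/* Single-writer sequence count: the writer bumps 'seq' to an odd value,
 * publishes the payload, then bumps it back to even; readers retry if they
 * saw an odd count or a count that changed while they were reading. */
struct sample_page {
        atomic_uint  seq;
        atomic_ulong index;
        atomic_ulong offset;
};

static void writer_update(struct sample_page *p, unsigned long idx,
                          unsigned long off)
{
        unsigned int s = atomic_load_explicit(&p->seq, memory_order_relaxed);

        atomic_store_explicit(&p->seq, s + 1, memory_order_relaxed); /* odd: update in progress */
        atomic_thread_fence(memory_order_release);

        atomic_store_explicit(&p->index,  idx, memory_order_relaxed);
        atomic_store_explicit(&p->offset, off, memory_order_relaxed);

        atomic_store_explicit(&p->seq, s + 2, memory_order_release); /* even: update complete */
}

static void reader_snapshot(struct sample_page *p,
                            unsigned long *idx, unsigned long *off)
{
        unsigned int s1, s2;

        do {
                s1   = atomic_load_explicit(&p->seq, memory_order_acquire);
                *idx = atomic_load_explicit(&p->index,  memory_order_relaxed);
                *off = atomic_load_explicit(&p->offset, memory_order_relaxed);
                atomic_thread_fence(memory_order_acquire);
                s2   = atomic_load_explicit(&p->seq, memory_order_relaxed);
        } while ((s1 & 1) || s1 != s2);        /* retry on a torn read */
}

int main(void)
{
        struct sample_page p = { 0 };
        unsigned long idx, off;

        writer_update(&p, 3, 4096);
        reader_snapshot(&p, &idx, &off);
        printf("index=%lu offset=%lu\n", idx, off);
        return 0;
}
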
@@ -3452,53 +3495,6 @@ unlock:
3452 return ret; 3495 return ret;
3453} 3496}
3454 3497
3455static void ring_buffer_attach(struct perf_event *event,
3456 struct ring_buffer *rb)
3457{
3458 unsigned long flags;
3459
3460 if (!list_empty(&event->rb_entry))
3461 return;
3462
3463 spin_lock_irqsave(&rb->event_lock, flags);
3464 if (!list_empty(&event->rb_entry))
3465 goto unlock;
3466
3467 list_add(&event->rb_entry, &rb->event_list);
3468unlock:
3469 spin_unlock_irqrestore(&rb->event_lock, flags);
3470}
3471
3472static void ring_buffer_detach(struct perf_event *event,
3473 struct ring_buffer *rb)
3474{
3475 unsigned long flags;
3476
3477 if (list_empty(&event->rb_entry))
3478 return;
3479
3480 spin_lock_irqsave(&rb->event_lock, flags);
3481 list_del_init(&event->rb_entry);
3482 wake_up_all(&event->waitq);
3483 spin_unlock_irqrestore(&rb->event_lock, flags);
3484}
3485
3486static void ring_buffer_wakeup(struct perf_event *event)
3487{
3488 struct ring_buffer *rb;
3489
3490 rcu_read_lock();
3491 rb = rcu_dereference(event->rb);
3492 if (!rb)
3493 goto unlock;
3494
3495 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3496 wake_up_all(&event->waitq);
3497
3498unlock:
3499 rcu_read_unlock();
3500}
3501
3502static void rb_free_rcu(struct rcu_head *rcu_head) 3498static void rb_free_rcu(struct rcu_head *rcu_head)
3503{ 3499{
3504 struct ring_buffer *rb; 3500 struct ring_buffer *rb;
@@ -3524,19 +3520,9 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3524 3520
3525static void ring_buffer_put(struct ring_buffer *rb) 3521static void ring_buffer_put(struct ring_buffer *rb)
3526{ 3522{
3527 struct perf_event *event, *n;
3528 unsigned long flags;
3529
3530 if (!atomic_dec_and_test(&rb->refcount)) 3523 if (!atomic_dec_and_test(&rb->refcount))
3531 return; 3524 return;
3532 3525
3533 spin_lock_irqsave(&rb->event_lock, flags);
3534 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3535 list_del_init(&event->rb_entry);
3536 wake_up_all(&event->waitq);
3537 }
3538 spin_unlock_irqrestore(&rb->event_lock, flags);
3539
3540 call_rcu(&rb->rcu_head, rb_free_rcu); 3526 call_rcu(&rb->rcu_head, rb_free_rcu);
3541} 3527}
3542 3528
@@ -3557,9 +3543,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3557 struct ring_buffer *rb = event->rb; 3543 struct ring_buffer *rb = event->rb;
3558 3544
3559 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3545 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3560 vma->vm_mm->pinned_vm -= event->mmap_locked; 3546 vma->vm_mm->locked_vm -= event->mmap_locked;
3561 rcu_assign_pointer(event->rb, NULL); 3547 rcu_assign_pointer(event->rb, NULL);
3562 ring_buffer_detach(event, rb);
3563 mutex_unlock(&event->mmap_mutex); 3548 mutex_unlock(&event->mmap_mutex);
3564 3549
3565 ring_buffer_put(rb); 3550 ring_buffer_put(rb);
@@ -3639,7 +3624,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3639 3624
3640 lock_limit = rlimit(RLIMIT_MEMLOCK); 3625 lock_limit = rlimit(RLIMIT_MEMLOCK);
3641 lock_limit >>= PAGE_SHIFT; 3626 lock_limit >>= PAGE_SHIFT;
3642 locked = vma->vm_mm->pinned_vm + extra; 3627 locked = vma->vm_mm->locked_vm + extra;
3643 3628
3644 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 3629 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3645 !capable(CAP_IPC_LOCK)) { 3630 !capable(CAP_IPC_LOCK)) {
@@ -3665,16 +3650,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3665 atomic_long_add(user_extra, &user->locked_vm); 3650 atomic_long_add(user_extra, &user->locked_vm);
3666 event->mmap_locked = extra; 3651 event->mmap_locked = extra;
3667 event->mmap_user = get_current_user(); 3652 event->mmap_user = get_current_user();
3668 vma->vm_mm->pinned_vm += event->mmap_locked; 3653 vma->vm_mm->locked_vm += event->mmap_locked;
3669
3670 perf_event_update_userpage(event);
3671 3654
3672unlock: 3655unlock:
3673 if (!ret) 3656 if (!ret)
3674 atomic_inc(&event->mmap_count); 3657 atomic_inc(&event->mmap_count);
3675 mutex_unlock(&event->mmap_mutex); 3658 mutex_unlock(&event->mmap_mutex);
3676 3659
3677 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3660 vma->vm_flags |= VM_RESERVED;
3678 vma->vm_ops = &perf_mmap_vmops; 3661 vma->vm_ops = &perf_mmap_vmops;
3679 3662
3680 return ret; 3663 return ret;
@@ -3716,7 +3699,7 @@ static const struct file_operations perf_fops = {
3716 3699
3717void perf_event_wakeup(struct perf_event *event) 3700void perf_event_wakeup(struct perf_event *event)
3718{ 3701{
3719 ring_buffer_wakeup(event); 3702 wake_up_all(&event->waitq);
3720 3703
3721 if (event->pending_kill) { 3704 if (event->pending_kill) {
3722 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 3705 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -3761,132 +3744,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3761} 3744}
3762EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3745EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3763 3746
3764static void
3765perf_output_sample_regs(struct perf_output_handle *handle,
3766 struct pt_regs *regs, u64 mask)
3767{
3768 int bit;
3769
3770 for_each_set_bit(bit, (const unsigned long *) &mask,
3771 sizeof(mask) * BITS_PER_BYTE) {
3772 u64 val;
3773
3774 val = perf_reg_value(regs, bit);
3775 perf_output_put(handle, val);
3776 }
3777}
3778
3779static void perf_sample_regs_user(struct perf_regs_user *regs_user,
3780 struct pt_regs *regs)
3781{
3782 if (!user_mode(regs)) {
3783 if (current->mm)
3784 regs = task_pt_regs(current);
3785 else
3786 regs = NULL;
3787 }
3788
3789 if (regs) {
3790 regs_user->regs = regs;
3791 regs_user->abi = perf_reg_abi(current);
3792 }
3793}
3794
3795/*
3796 * Get remaining task size from user stack pointer.
3797 *
3798 * It'd be better to take stack vma map and limit this more
3799 * precisly, but there's no way to get it safely under interrupt,
3800 * so using TASK_SIZE as limit.
3801 */
3802static u64 perf_ustack_task_size(struct pt_regs *regs)
3803{
3804 unsigned long addr = perf_user_stack_pointer(regs);
3805
3806 if (!addr || addr >= TASK_SIZE)
3807 return 0;
3808
3809 return TASK_SIZE - addr;
3810}
3811
3812static u16
3813perf_sample_ustack_size(u16 stack_size, u16 header_size,
3814 struct pt_regs *regs)
3815{
3816 u64 task_size;
3817
3818 /* No regs, no stack pointer, no dump. */
3819 if (!regs)
3820 return 0;
3821
3822 /*
3823 * Check if we fit in with the requested stack size into the:
3824 * - TASK_SIZE
3825 * If we don't, we limit the size to the TASK_SIZE.
3826 *
3827 * - remaining sample size
3828 * If we don't, we customize the stack size to
3829 * fit in to the remaining sample size.
3830 */
3831
3832 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
3833 stack_size = min(stack_size, (u16) task_size);
3834
3835 /* Current header size plus static size and dynamic size. */
3836 header_size += 2 * sizeof(u64);
3837
3838 /* Do we fit in with the current stack dump size? */
3839 if ((u16) (header_size + stack_size) < header_size) {
3840 /*
3841 * If we overflow the maximum size for the sample,
3842 * we customize the stack dump size to fit in.
3843 */
3844 stack_size = USHRT_MAX - header_size - sizeof(u64);
3845 stack_size = round_up(stack_size, sizeof(u64));
3846 }
3847
3848 return stack_size;
3849}
3850
3851static void
3852perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
3853 struct pt_regs *regs)
3854{
3855 /* Case of a kernel thread, nothing to dump */
3856 if (!regs) {
3857 u64 size = 0;
3858 perf_output_put(handle, size);
3859 } else {
3860 unsigned long sp;
3861 unsigned int rem;
3862 u64 dyn_size;
3863
3864 /*
3865 * We dump:
3866 * static size
3867 * - the size requested by user or the best one we can fit
3868 * in to the sample max size
3869 * data
3870 * - user stack dump data
3871 * dynamic size
3872 * - the actual dumped size
3873 */
3874
3875 /* Static size. */
3876 perf_output_put(handle, dump_size);
3877
3878 /* Data. */
3879 sp = perf_user_stack_pointer(regs);
3880 rem = __output_copy_user(handle, (void *) sp, dump_size);
3881 dyn_size = dump_size - rem;
3882
3883 perf_output_skip(handle, rem);
3884
3885 /* Dynamic size. */
3886 perf_output_put(handle, dyn_size);
3887 }
3888}
3889
3890static void __perf_event_header__init_id(struct perf_event_header *header, 3747static void __perf_event_header__init_id(struct perf_event_header *header,
3891 struct perf_sample_data *data, 3748 struct perf_sample_data *data,
3892 struct perf_event *event) 3749 struct perf_event *event)
@@ -4026,7 +3883,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4026static void perf_output_read(struct perf_output_handle *handle, 3883static void perf_output_read(struct perf_output_handle *handle,
4027 struct perf_event *event) 3884 struct perf_event *event)
4028{ 3885{
4029 u64 enabled = 0, running = 0, now; 3886 u64 enabled = 0, running = 0;
4030 u64 read_format = event->attr.read_format; 3887 u64 read_format = event->attr.read_format;
4031 3888
4032 /* 3889 /*
@@ -4039,7 +3896,7 @@ static void perf_output_read(struct perf_output_handle *handle,
4039 * NMI context 3896 * NMI context
4040 */ 3897 */
4041 if (read_format & PERF_FORMAT_TOTAL_TIMES) 3898 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4042 calc_timer_values(event, &now, &enabled, &running); 3899 calc_timer_values(event, &enabled, &running);
4043 3900
4044 if (event->attr.read_format & PERF_FORMAT_GROUP) 3901 if (event->attr.read_format & PERF_FORMAT_GROUP)
4045 perf_output_read_group(handle, event, enabled, running); 3902 perf_output_read_group(handle, event, enabled, running);
@@ -4129,46 +3986,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4129 } 3986 }
4130 } 3987 }
4131 } 3988 }
4132
4133 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4134 if (data->br_stack) {
4135 size_t size;
4136
4137 size = data->br_stack->nr
4138 * sizeof(struct perf_branch_entry);
4139
4140 perf_output_put(handle, data->br_stack->nr);
4141 perf_output_copy(handle, data->br_stack->entries, size);
4142 } else {
4143 /*
4144 * we always store at least the value of nr
4145 */
4146 u64 nr = 0;
4147 perf_output_put(handle, nr);
4148 }
4149 }
4150
4151 if (sample_type & PERF_SAMPLE_REGS_USER) {
4152 u64 abi = data->regs_user.abi;
4153
4154 /*
4155 * If there are no regs to dump, notice it through
4156 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4157 */
4158 perf_output_put(handle, abi);
4159
4160 if (abi) {
4161 u64 mask = event->attr.sample_regs_user;
4162 perf_output_sample_regs(handle,
4163 data->regs_user.regs,
4164 mask);
4165 }
4166 }
4167
4168 if (sample_type & PERF_SAMPLE_STACK_USER)
4169 perf_output_sample_ustack(handle,
4170 data->stack_user_size,
4171 data->regs_user.regs);
4172} 3989}
4173 3990
4174void perf_prepare_sample(struct perf_event_header *header, 3991void perf_prepare_sample(struct perf_event_header *header,
@@ -4192,7 +4009,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4192 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4009 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4193 int size = 1; 4010 int size = 1;
4194 4011
4195 data->callchain = perf_callchain(event, regs); 4012 data->callchain = perf_callchain(regs);
4196 4013
4197 if (data->callchain) 4014 if (data->callchain)
4198 size += data->callchain->nr; 4015 size += data->callchain->nr;
@@ -4211,58 +4028,6 @@ void perf_prepare_sample(struct perf_event_header *header,
4211 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4028 WARN_ON_ONCE(size & (sizeof(u64)-1));
4212 header->size += size; 4029 header->size += size;
4213 } 4030 }
4214
4215 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4216 int size = sizeof(u64); /* nr */
4217 if (data->br_stack) {
4218 size += data->br_stack->nr
4219 * sizeof(struct perf_branch_entry);
4220 }
4221 header->size += size;
4222 }
4223
4224 if (sample_type & PERF_SAMPLE_REGS_USER) {
4225 /* regs dump ABI info */
4226 int size = sizeof(u64);
4227
4228 perf_sample_regs_user(&data->regs_user, regs);
4229
4230 if (data->regs_user.regs) {
4231 u64 mask = event->attr.sample_regs_user;
4232 size += hweight64(mask) * sizeof(u64);
4233 }
4234
4235 header->size += size;
4236 }
4237
4238 if (sample_type & PERF_SAMPLE_STACK_USER) {
4239 /*
4240 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
4241 * processed as the last one or have additional check added
4242 * in case new sample type is added, because we could eat
4243 * up the rest of the sample size.
4244 */
4245 struct perf_regs_user *uregs = &data->regs_user;
4246 u16 stack_size = event->attr.sample_stack_user;
4247 u16 size = sizeof(u64);
4248
4249 if (!uregs->abi)
4250 perf_sample_regs_user(uregs, regs);
4251
4252 stack_size = perf_sample_ustack_size(stack_size, header->size,
4253 uregs->regs);
4254
4255 /*
4256 * If there is something to dump, add space for the dump
4257 * itself and for the field that tells the dynamic size,
4258 * which is how many have been actually dumped.
4259 */
4260 if (stack_size)
4261 size += sizeof(u64) + stack_size;
4262
4263 data->stack_user_size = stack_size;
4264 header->size += size;
4265 }
4266} 4031}
4267 4032
4268static void perf_event_output(struct perf_event *event, 4033static void perf_event_output(struct perf_event *event,
@@ -4415,7 +4180,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4415 rcu_read_lock(); 4180 rcu_read_lock();
4416 list_for_each_entry_rcu(pmu, &pmus, entry) { 4181 list_for_each_entry_rcu(pmu, &pmus, entry) {
4417 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4182 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4418 if (cpuctx->unique_pmu != pmu) 4183 if (cpuctx->active_pmu != pmu)
4419 goto next; 4184 goto next;
4420 perf_event_task_ctx(&cpuctx->ctx, task_event); 4185 perf_event_task_ctx(&cpuctx->ctx, task_event);
4421 4186
@@ -4561,7 +4326,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4561 rcu_read_lock(); 4326 rcu_read_lock();
4562 list_for_each_entry_rcu(pmu, &pmus, entry) { 4327 list_for_each_entry_rcu(pmu, &pmus, entry) {
4563 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4328 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4564 if (cpuctx->unique_pmu != pmu) 4329 if (cpuctx->active_pmu != pmu)
4565 goto next; 4330 goto next;
4566 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4331 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4567 4332
@@ -4757,7 +4522,7 @@ got_name:
4757 rcu_read_lock(); 4522 rcu_read_lock();
4758 list_for_each_entry_rcu(pmu, &pmus, entry) { 4523 list_for_each_entry_rcu(pmu, &pmus, entry) {
4759 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4524 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4760 if (cpuctx->unique_pmu != pmu) 4525 if (cpuctx->active_pmu != pmu)
4761 goto next; 4526 goto next;
4762 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4527 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4763 vma->vm_flags & VM_EXEC); 4528 vma->vm_flags & VM_EXEC);
@@ -4858,7 +4623,6 @@ static int __perf_event_overflow(struct perf_event *event,
4858{ 4623{
4859 int events = atomic_read(&event->event_limit); 4624 int events = atomic_read(&event->event_limit);
4860 struct hw_perf_event *hwc = &event->hw; 4625 struct hw_perf_event *hwc = &event->hw;
4861 u64 seq;
4862 int ret = 0; 4626 int ret = 0;
4863 4627
4864 /* 4628 /*
@@ -4868,20 +4632,14 @@ static int __perf_event_overflow(struct perf_event *event,
4868 if (unlikely(!is_sampling_event(event))) 4632 if (unlikely(!is_sampling_event(event)))
4869 return 0; 4633 return 0;
4870 4634
4871 seq = __this_cpu_read(perf_throttled_seq); 4635 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4872 if (seq != hwc->interrupts_seq) { 4636 if (throttle) {
4873 hwc->interrupts_seq = seq;
4874 hwc->interrupts = 1;
4875 } else {
4876 hwc->interrupts++;
4877 if (unlikely(throttle
4878 && hwc->interrupts >= max_samples_per_tick)) {
4879 __this_cpu_inc(perf_throttled_count);
4880 hwc->interrupts = MAX_INTERRUPTS; 4637 hwc->interrupts = MAX_INTERRUPTS;
4881 perf_log_throttle(event, 0); 4638 perf_log_throttle(event, 0);
4882 ret = 1; 4639 ret = 1;
4883 } 4640 }
4884 } 4641 } else
4642 hwc->interrupts++;
4885 4643
4886 if (event->attr.freq) { 4644 if (event->attr.freq) {
4887 u64 now = perf_clock(); 4645 u64 now = perf_clock();
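
Aside (not part of the diff): the two sides of the overflow hunk above account for throttling differently; one keys off a per-CPU perf_throttled_seq that resets the count each tick, the other simply counts interrupts against max_samples_per_tick. The toy model below shows the counting variant's behaviour, a fixed budget of samples per tick with everything beyond it dropped until the next tick; the budget value and all names are made up.

#include <stdio.h>

#define MAX_SAMPLES_PER_TICK 4

struct hw_counter {
        int interrupts;   /* samples taken in the current tick */
        int throttled;    /* set once the budget is exhausted  */
};

/* Called on every overflow "interrupt"; returns 1 if the sample is kept. */
static int counter_overflow(struct hw_counter *c)
{
        if (c->throttled)
                return 0;

        if (c->interrupts >= MAX_SAMPLES_PER_TICK) {
                c->throttled = 1;       /* plays the role of MAX_INTERRUPTS  */
                return 0;               /* and of perf_log_throttle()        */
        }

        c->interrupts++;
        return 1;
}

/* Called once per timer tick; the budget (and any throttle) is reset. */
static void counter_tick(struct hw_counter *c)
{
        c->interrupts = 0;
        c->throttled  = 0;
}

int main(void)
{
        struct hw_counter c = { 0, 0 };
        int tick, i, kept = 0;

        for (tick = 0; tick < 3; tick++) {
                counter_tick(&c);
                for (i = 0; i < 10; i++)
                        kept += counter_overflow(&c);
        }
        printf("kept %d of 30 samples\n", kept);
        return 0;
}
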
@@ -4890,7 +4648,7 @@ static int __perf_event_overflow(struct perf_event *event,
4890 hwc->freq_time_stamp = now; 4648 hwc->freq_time_stamp = now;
4891 4649
4892 if (delta > 0 && delta < 2*TICK_NSEC) 4650 if (delta > 0 && delta < 2*TICK_NSEC)
4893 perf_adjust_period(event, delta, hwc->last_period, true); 4651 perf_adjust_period(event, delta, hwc->last_period);
4894 } 4652 }
4895 4653
4896 /* 4654 /*
@@ -4978,6 +4736,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4978 struct hw_perf_event *hwc = &event->hw; 4736 struct hw_perf_event *hwc = &event->hw;
4979 int throttle = 0; 4737 int throttle = 0;
4980 4738
4739 data->period = event->hw.last_period;
4981 if (!overflow) 4740 if (!overflow)
4982 overflow = perf_swevent_set_period(event); 4741 overflow = perf_swevent_set_period(event);
4983 4742
@@ -5011,12 +4770,6 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5011 if (!is_sampling_event(event)) 4770 if (!is_sampling_event(event))
5012 return; 4771 return;
5013 4772
5014 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
5015 data->period = nr;
5016 return perf_swevent_overflow(event, 1, data, regs);
5017 } else
5018 data->period = event->hw.last_period;
5019
5020 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4773 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5021 return perf_swevent_overflow(event, 1, data, regs); 4774 return perf_swevent_overflow(event, 1, data, regs);
5022 4775
@@ -5158,7 +4911,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5158 if (rctx < 0) 4911 if (rctx < 0)
5159 return; 4912 return;
5160 4913
5161 perf_sample_data_init(&data, addr, 0); 4914 perf_sample_data_init(&data, addr);
5162 4915
5163 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); 4916 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5164 4917
@@ -5305,7 +5058,7 @@ fail:
5305 return err; 5058 return err;
5306} 5059}
5307 5060
5308struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5061struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5309 5062
5310static void sw_perf_event_destroy(struct perf_event *event) 5063static void sw_perf_event_destroy(struct perf_event *event)
5311{ 5064{
@@ -5313,7 +5066,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
5313 5066
5314 WARN_ON(event->parent); 5067 WARN_ON(event->parent);
5315 5068
5316 static_key_slow_dec(&perf_swevent_enabled[event_id]); 5069 jump_label_dec(&perf_swevent_enabled[event_id]);
5317 swevent_hlist_put(event); 5070 swevent_hlist_put(event);
5318} 5071}
5319 5072
@@ -5324,12 +5077,6 @@ static int perf_swevent_init(struct perf_event *event)
5324 if (event->attr.type != PERF_TYPE_SOFTWARE) 5077 if (event->attr.type != PERF_TYPE_SOFTWARE)
5325 return -ENOENT; 5078 return -ENOENT;
5326 5079
5327 /*
5328 * no branch sampling for software events
5329 */
5330 if (has_branch_stack(event))
5331 return -EOPNOTSUPP;
5332
5333 switch (event_id) { 5080 switch (event_id) {
5334 case PERF_COUNT_SW_CPU_CLOCK: 5081 case PERF_COUNT_SW_CPU_CLOCK:
5335 case PERF_COUNT_SW_TASK_CLOCK: 5082 case PERF_COUNT_SW_TASK_CLOCK:
@@ -5349,18 +5096,13 @@ static int perf_swevent_init(struct perf_event *event)
5349 if (err) 5096 if (err)
5350 return err; 5097 return err;
5351 5098
5352 static_key_slow_inc(&perf_swevent_enabled[event_id]); 5099 jump_label_inc(&perf_swevent_enabled[event_id]);
5353 event->destroy = sw_perf_event_destroy; 5100 event->destroy = sw_perf_event_destroy;
5354 } 5101 }
5355 5102
5356 return 0; 5103 return 0;
5357} 5104}
5358 5105
5359static int perf_swevent_event_idx(struct perf_event *event)
5360{
5361 return 0;
5362}
5363
5364static struct pmu perf_swevent = { 5106static struct pmu perf_swevent = {
5365 .task_ctx_nr = perf_sw_context, 5107 .task_ctx_nr = perf_sw_context,
5366 5108
@@ -5370,8 +5112,6 @@ static struct pmu perf_swevent = {
5370 .start = perf_swevent_start, 5112 .start = perf_swevent_start,
5371 .stop = perf_swevent_stop, 5113 .stop = perf_swevent_stop,
5372 .read = perf_swevent_read, 5114 .read = perf_swevent_read,
5373
5374 .event_idx = perf_swevent_event_idx,
5375}; 5115};
5376 5116
5377#ifdef CONFIG_EVENT_TRACING 5117#ifdef CONFIG_EVENT_TRACING
@@ -5405,8 +5145,7 @@ static int perf_tp_event_match(struct perf_event *event,
5405} 5145}
5406 5146
5407void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5147void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5408 struct pt_regs *regs, struct hlist_head *head, int rctx, 5148 struct pt_regs *regs, struct hlist_head *head, int rctx)
5409 struct task_struct *task)
5410{ 5149{
5411 struct perf_sample_data data; 5150 struct perf_sample_data data;
5412 struct perf_event *event; 5151 struct perf_event *event;
@@ -5417,7 +5156,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5417 .data = record, 5156 .data = record,
5418 }; 5157 };
5419 5158
5420 perf_sample_data_init(&data, addr, 0); 5159 perf_sample_data_init(&data, addr);
5421 data.raw = &raw; 5160 data.raw = &raw;
5422 5161
5423 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5162 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
@@ -5425,31 +5164,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5425 perf_swevent_event(event, count, &data, regs); 5164 perf_swevent_event(event, count, &data, regs);
5426 } 5165 }
5427 5166
5428 /*
5429 * If we got specified a target task, also iterate its context and
5430 * deliver this event there too.
5431 */
5432 if (task && task != current) {
5433 struct perf_event_context *ctx;
5434 struct trace_entry *entry = record;
5435
5436 rcu_read_lock();
5437 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5438 if (!ctx)
5439 goto unlock;
5440
5441 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5442 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5443 continue;
5444 if (event->attr.config != entry->type)
5445 continue;
5446 if (perf_tp_event_match(event, &data, regs))
5447 perf_swevent_event(event, count, &data, regs);
5448 }
5449unlock:
5450 rcu_read_unlock();
5451 }
5452
5453 perf_swevent_put_recursion_context(rctx); 5167 perf_swevent_put_recursion_context(rctx);
5454} 5168}
5455EXPORT_SYMBOL_GPL(perf_tp_event); 5169EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -5466,12 +5180,6 @@ static int perf_tp_event_init(struct perf_event *event)
5466 if (event->attr.type != PERF_TYPE_TRACEPOINT) 5180 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5467 return -ENOENT; 5181 return -ENOENT;
5468 5182
5469 /*
5470 * no branch sampling for tracepoint events
5471 */
5472 if (has_branch_stack(event))
5473 return -EOPNOTSUPP;
5474
5475 err = perf_trace_init(event); 5183 err = perf_trace_init(event);
5476 if (err) 5184 if (err)
5477 return err; 5185 return err;
@@ -5490,8 +5198,6 @@ static struct pmu perf_tracepoint = {
5490 .start = perf_swevent_start, 5198 .start = perf_swevent_start,
5491 .stop = perf_swevent_stop, 5199 .stop = perf_swevent_stop,
5492 .read = perf_swevent_read, 5200 .read = perf_swevent_read,
5493
5494 .event_idx = perf_swevent_event_idx,
5495}; 5201};
5496 5202
5497static inline void perf_tp_register(void) 5203static inline void perf_tp_register(void)
@@ -5545,7 +5251,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5545 struct perf_sample_data sample; 5251 struct perf_sample_data sample;
5546 struct pt_regs *regs = data; 5252 struct pt_regs *regs = data;
5547 5253
5548 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 5254 perf_sample_data_init(&sample, bp->attr.bp_addr);
5549 5255
5550 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5256 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5551 perf_swevent_event(bp, 1, &sample, regs); 5257 perf_swevent_event(bp, 1, &sample, regs);
@@ -5571,12 +5277,13 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5571 5277
5572 event->pmu->read(event); 5278 event->pmu->read(event);
5573 5279
5574 perf_sample_data_init(&data, 0, event->hw.last_period); 5280 perf_sample_data_init(&data, 0);
5281 data.period = event->hw.last_period;
5575 regs = get_irq_regs(); 5282 regs = get_irq_regs();
5576 5283
5577 if (regs && !perf_exclude_event(event, regs)) { 5284 if (regs && !perf_exclude_event(event, regs)) {
5578 if (!(event->attr.exclude_idle && is_idle_task(current))) 5285 if (!(event->attr.exclude_idle && current->pid == 0))
5579 if (__perf_event_overflow(event, 1, &data, regs)) 5286 if (perf_event_overflow(event, &data, regs))
5580 ret = HRTIMER_NORESTART; 5287 ret = HRTIMER_NORESTART;
5581 } 5288 }
5582 5289
@@ -5696,12 +5403,6 @@ static int cpu_clock_event_init(struct perf_event *event)
5696 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5403 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5697 return -ENOENT; 5404 return -ENOENT;
5698 5405
5699 /*
5700 * no branch sampling for software events
5701 */
5702 if (has_branch_stack(event))
5703 return -EOPNOTSUPP;
5704
5705 perf_swevent_init_hrtimer(event); 5406 perf_swevent_init_hrtimer(event);
5706 5407
5707 return 0; 5408 return 0;
@@ -5716,8 +5417,6 @@ static struct pmu perf_cpu_clock = {
5716 .start = cpu_clock_event_start, 5417 .start = cpu_clock_event_start,
5717 .stop = cpu_clock_event_stop, 5418 .stop = cpu_clock_event_stop,
5718 .read = cpu_clock_event_read, 5419 .read = cpu_clock_event_read,
5719
5720 .event_idx = perf_swevent_event_idx,
5721}; 5420};
5722 5421
5723/* 5422/*
@@ -5776,12 +5475,6 @@ static int task_clock_event_init(struct perf_event *event)
5776 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5475 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5777 return -ENOENT; 5476 return -ENOENT;
5778 5477
5779 /*
5780 * no branch sampling for software events
5781 */
5782 if (has_branch_stack(event))
5783 return -EOPNOTSUPP;
5784
5785 perf_swevent_init_hrtimer(event); 5478 perf_swevent_init_hrtimer(event);
5786 5479
5787 return 0; 5480 return 0;
@@ -5796,8 +5489,6 @@ static struct pmu perf_task_clock = {
5796 .start = task_clock_event_start, 5489 .start = task_clock_event_start,
5797 .stop = task_clock_event_stop, 5490 .stop = task_clock_event_stop,
5798 .read = task_clock_event_read, 5491 .read = task_clock_event_read,
5799
5800 .event_idx = perf_swevent_event_idx,
5801}; 5492};
5802 5493
5803static void perf_pmu_nop_void(struct pmu *pmu) 5494static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5825,11 +5516,6 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
5825 perf_pmu_enable(pmu); 5516 perf_pmu_enable(pmu);
5826} 5517}
5827 5518
5828static int perf_event_idx_default(struct perf_event *event)
5829{
5830 return event->hw.idx + 1;
5831}
5832
5833/* 5519/*
5834 * Ensures all contexts with the same task_ctx_nr have the same 5520 * Ensures all contexts with the same task_ctx_nr have the same
5835 * pmu_cpu_context too. 5521 * pmu_cpu_context too.
@@ -5858,8 +5544,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5858 5544
5859 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5545 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5860 5546
5861 if (cpuctx->unique_pmu == old_pmu) 5547 if (cpuctx->active_pmu == old_pmu)
5862 cpuctx->unique_pmu = pmu; 5548 cpuctx->active_pmu = pmu;
5863 } 5549 }
5864} 5550}
5865 5551
@@ -5916,7 +5602,6 @@ static int pmu_dev_alloc(struct pmu *pmu)
5916 if (!pmu->dev) 5602 if (!pmu->dev)
5917 goto out; 5603 goto out;
5918 5604
5919 pmu->dev->groups = pmu->attr_groups;
5920 device_initialize(pmu->dev); 5605 device_initialize(pmu->dev);
5921 ret = dev_set_name(pmu->dev, "%s", pmu->name); 5606 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5922 if (ret) 5607 if (ret)
@@ -5994,7 +5679,7 @@ skip_type:
5994 cpuctx->ctx.pmu = pmu; 5679 cpuctx->ctx.pmu = pmu;
5995 cpuctx->jiffies_interval = 1; 5680 cpuctx->jiffies_interval = 1;
5996 INIT_LIST_HEAD(&cpuctx->rotation_list); 5681 INIT_LIST_HEAD(&cpuctx->rotation_list);
5997 cpuctx->unique_pmu = pmu; 5682 cpuctx->active_pmu = pmu;
5998 } 5683 }
5999 5684
6000got_cpu_context: 5685got_cpu_context:
@@ -6020,9 +5705,6 @@ got_cpu_context:
6020 pmu->pmu_disable = perf_pmu_nop_void; 5705 pmu->pmu_disable = perf_pmu_nop_void;
6021 } 5706 }
6022 5707
6023 if (!pmu->event_idx)
6024 pmu->event_idx = perf_event_idx_default;
6025
6026 list_add_rcu(&pmu->entry, &pmus); 5708 list_add_rcu(&pmu->entry, &pmus);
6027 ret = 0; 5709 ret = 0;
6028unlock: 5710unlock:
@@ -6076,7 +5758,6 @@ struct pmu *perf_init_event(struct perf_event *event)
6076 pmu = idr_find(&pmu_idr, event->attr.type); 5758 pmu = idr_find(&pmu_idr, event->attr.type);
6077 rcu_read_unlock(); 5759 rcu_read_unlock();
6078 if (pmu) { 5760 if (pmu) {
6079 event->pmu = pmu;
6080 ret = pmu->event_init(event); 5761 ret = pmu->event_init(event);
6081 if (ret) 5762 if (ret)
6082 pmu = ERR_PTR(ret); 5763 pmu = ERR_PTR(ret);
@@ -6084,7 +5765,6 @@ struct pmu *perf_init_event(struct perf_event *event)
6084 } 5765 }
6085 5766
6086 list_for_each_entry_rcu(pmu, &pmus, entry) { 5767 list_for_each_entry_rcu(pmu, &pmus, entry) {
6087 event->pmu = pmu;
6088 ret = pmu->event_init(event); 5768 ret = pmu->event_init(event);
6089 if (!ret) 5769 if (!ret)
6090 goto unlock; 5770 goto unlock;
@@ -6139,14 +5819,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6139 INIT_LIST_HEAD(&event->group_entry); 5819 INIT_LIST_HEAD(&event->group_entry);
6140 INIT_LIST_HEAD(&event->event_entry); 5820 INIT_LIST_HEAD(&event->event_entry);
6141 INIT_LIST_HEAD(&event->sibling_list); 5821 INIT_LIST_HEAD(&event->sibling_list);
6142 INIT_LIST_HEAD(&event->rb_entry);
6143
6144 init_waitqueue_head(&event->waitq); 5822 init_waitqueue_head(&event->waitq);
6145 init_irq_work(&event->pending, perf_pending_event); 5823 init_irq_work(&event->pending, perf_pending_event);
6146 5824
6147 mutex_init(&event->mmap_mutex); 5825 mutex_init(&event->mmap_mutex);
6148 5826
6149 atomic_long_set(&event->refcount, 1);
6150 event->cpu = cpu; 5827 event->cpu = cpu;
6151 event->attr = *attr; 5828 event->attr = *attr;
6152 event->group_leader = group_leader; 5829 event->group_leader = group_leader;
@@ -6155,7 +5832,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 5832
6156 event->parent = parent_event; 5833 event->parent = parent_event;
6157 5834
6158 event->ns = get_pid_ns(task_active_pid_ns(current)); 5835 event->ns = get_pid_ns(current->nsproxy->pid_ns);
6159 event->id = atomic64_inc_return(&perf_event_id); 5836 event->id = atomic64_inc_return(&perf_event_id);
6160 5837
6161 event->state = PERF_EVENT_STATE_INACTIVE; 5838 event->state = PERF_EVENT_STATE_INACTIVE;
@@ -6214,9 +5891,11 @@ done:
6214 return ERR_PTR(err); 5891 return ERR_PTR(err);
6215 } 5892 }
6216 5893
5894 event->pmu = pmu;
5895
6217 if (!event->parent) { 5896 if (!event->parent) {
6218 if (event->attach_state & PERF_ATTACH_TASK) 5897 if (event->attach_state & PERF_ATTACH_TASK)
6219 static_key_slow_inc(&perf_sched_events.key); 5898 jump_label_inc(&perf_sched_events);
6220 if (event->attr.mmap || event->attr.mmap_data) 5899 if (event->attr.mmap || event->attr.mmap_data)
6221 atomic_inc(&nr_mmap_events); 5900 atomic_inc(&nr_mmap_events);
6222 if (event->attr.comm) 5901 if (event->attr.comm)
@@ -6230,12 +5909,6 @@ done:
6230 return ERR_PTR(err); 5909 return ERR_PTR(err);
6231 } 5910 }
6232 } 5911 }
6233 if (has_branch_stack(event)) {
6234 static_key_slow_inc(&perf_sched_events.key);
6235 if (!(event->attach_state & PERF_ATTACH_TASK))
6236 atomic_inc(&per_cpu(perf_branch_stack_events,
6237 event->cpu));
6238 }
6239 } 5912 }
6240 5913
6241 return event; 5914 return event;
@@ -6305,62 +5978,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6305 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 5978 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6306 return -EINVAL; 5979 return -EINVAL;
6307 5980
6308 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6309 u64 mask = attr->branch_sample_type;
6310
6311 /* only using defined bits */
6312 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6313 return -EINVAL;
6314
6315 /* at least one branch bit must be set */
6316 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6317 return -EINVAL;
6318
6319 /* kernel level capture: check permissions */
6320 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6321 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6322 return -EACCES;
6323
6324 /* propagate priv level, when not set for branch */
6325 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6326
6327 /* exclude_kernel checked on syscall entry */
6328 if (!attr->exclude_kernel)
6329 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6330
6331 if (!attr->exclude_user)
6332 mask |= PERF_SAMPLE_BRANCH_USER;
6333
6334 if (!attr->exclude_hv)
6335 mask |= PERF_SAMPLE_BRANCH_HV;
6336 /*
6337 * adjust user setting (for HW filter setup)
6338 */
6339 attr->branch_sample_type = mask;
6340 }
6341 }
6342
6343 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
6344 ret = perf_reg_validate(attr->sample_regs_user);
6345 if (ret)
6346 return ret;
6347 }
6348
6349 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
6350 if (!arch_perf_have_user_stack_dump())
6351 return -ENOSYS;
6352
6353 /*
6354 * We have __u32 type for the size, but so far
6355 * we can only use __u16 as maximum due to the
6356 * __u16 sample size limit.
6357 */
6358 if (attr->sample_stack_user >= USHRT_MAX)
6359 ret = -EINVAL;
6360 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
6361 ret = -EINVAL;
6362 }
6363
6364out: 5981out:
6365 return ret; 5982 return ret;
6366 5983
@@ -6410,8 +6027,6 @@ set:
6410 6027
6411 old_rb = event->rb; 6028 old_rb = event->rb;
6412 rcu_assign_pointer(event->rb, rb); 6029 rcu_assign_pointer(event->rb, rb);
6413 if (old_rb)
6414 ring_buffer_detach(event, old_rb);
6415 ret = 0; 6030 ret = 0;
6416unlock: 6031unlock:
6417 mutex_unlock(&event->mmap_mutex); 6032 mutex_unlock(&event->mmap_mutex);
@@ -6439,11 +6054,12 @@ SYSCALL_DEFINE5(perf_event_open,
6439 struct perf_event_attr attr; 6054 struct perf_event_attr attr;
6440 struct perf_event_context *ctx; 6055 struct perf_event_context *ctx;
6441 struct file *event_file = NULL; 6056 struct file *event_file = NULL;
6442 struct fd group = {NULL, 0}; 6057 struct file *group_file = NULL;
6443 struct task_struct *task = NULL; 6058 struct task_struct *task = NULL;
6444 struct pmu *pmu; 6059 struct pmu *pmu;
6445 int event_fd; 6060 int event_fd;
6446 int move_group = 0; 6061 int move_group = 0;
6062 int fput_needed = 0;
6447 int err; 6063 int err;
6448 6064
6449 /* for future expandability... */ 6065 /* for future expandability... */
@@ -6473,15 +6089,17 @@ SYSCALL_DEFINE5(perf_event_open,
6473 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) 6089 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6474 return -EINVAL; 6090 return -EINVAL;
6475 6091
6476 event_fd = get_unused_fd(); 6092 event_fd = get_unused_fd_flags(O_RDWR);
6477 if (event_fd < 0) 6093 if (event_fd < 0)
6478 return event_fd; 6094 return event_fd;
6479 6095
6480 if (group_fd != -1) { 6096 if (group_fd != -1) {
6481 err = perf_fget_light(group_fd, &group); 6097 group_leader = perf_fget_light(group_fd, &fput_needed);
6482 if (err) 6098 if (IS_ERR(group_leader)) {
6099 err = PTR_ERR(group_leader);
6483 goto err_fd; 6100 goto err_fd;
6484 group_leader = group.file->private_data; 6101 }
6102 group_file = group_leader->filp;
6485 if (flags & PERF_FLAG_FD_OUTPUT) 6103 if (flags & PERF_FLAG_FD_OUTPUT)
6486 output_event = group_leader; 6104 output_event = group_leader;
6487 if (flags & PERF_FLAG_FD_NO_GROUP) 6105 if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6496,8 +6114,6 @@ SYSCALL_DEFINE5(perf_event_open,
6496 } 6114 }
6497 } 6115 }
6498 6116
6499 get_online_cpus();
6500
6501 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 6117 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6502 NULL, NULL); 6118 NULL, NULL);
6503 if (IS_ERR(event)) { 6119 if (IS_ERR(event)) {
@@ -6515,7 +6131,7 @@ SYSCALL_DEFINE5(perf_event_open,
6515 * - that may need work on context switch 6131 * - that may need work on context switch
6516 */ 6132 */
6517 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6133 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6518 static_key_slow_inc(&perf_sched_events.key); 6134 jump_label_inc(&perf_sched_events);
6519 } 6135 }
6520 6136
6521 /* 6137 /*
@@ -6550,7 +6166,7 @@ SYSCALL_DEFINE5(perf_event_open,
6550 /* 6166 /*
6551 * Get the target context (task or percpu): 6167 * Get the target context (task or percpu):
6552 */ 6168 */
6553 ctx = find_get_context(pmu, task, event->cpu); 6169 ctx = find_get_context(pmu, task, cpu);
6554 if (IS_ERR(ctx)) { 6170 if (IS_ERR(ctx)) {
6555 err = PTR_ERR(ctx); 6171 err = PTR_ERR(ctx);
6556 goto err_alloc; 6172 goto err_alloc;
@@ -6618,27 +6234,25 @@ SYSCALL_DEFINE5(perf_event_open,
6618 put_ctx(gctx); 6234 put_ctx(gctx);
6619 } 6235 }
6620 6236
6237 event->filp = event_file;
6621 WARN_ON_ONCE(ctx->parent_ctx); 6238 WARN_ON_ONCE(ctx->parent_ctx);
6622 mutex_lock(&ctx->mutex); 6239 mutex_lock(&ctx->mutex);
6623 6240
6624 if (move_group) { 6241 if (move_group) {
6625 synchronize_rcu(); 6242 perf_install_in_context(ctx, group_leader, cpu);
6626 perf_install_in_context(ctx, group_leader, event->cpu);
6627 get_ctx(ctx); 6243 get_ctx(ctx);
6628 list_for_each_entry(sibling, &group_leader->sibling_list, 6244 list_for_each_entry(sibling, &group_leader->sibling_list,
6629 group_entry) { 6245 group_entry) {
6630 perf_install_in_context(ctx, sibling, event->cpu); 6246 perf_install_in_context(ctx, sibling, cpu);
6631 get_ctx(ctx); 6247 get_ctx(ctx);
6632 } 6248 }
6633 } 6249 }
6634 6250
6635 perf_install_in_context(ctx, event, event->cpu); 6251 perf_install_in_context(ctx, event, cpu);
6636 ++ctx->generation; 6252 ++ctx->generation;
6637 perf_unpin_context(ctx); 6253 perf_unpin_context(ctx);
6638 mutex_unlock(&ctx->mutex); 6254 mutex_unlock(&ctx->mutex);
6639 6255
6640 put_online_cpus();
6641
6642 event->owner = current; 6256 event->owner = current;
6643 6257
6644 mutex_lock(&current->perf_event_mutex); 6258 mutex_lock(&current->perf_event_mutex);
@@ -6657,7 +6271,7 @@ SYSCALL_DEFINE5(perf_event_open,
6657 * of the group leader will find the pointer to itself in 6271 * of the group leader will find the pointer to itself in
6658 * perf_group_detach(). 6272 * perf_group_detach().
6659 */ 6273 */
6660 fdput(group); 6274 fput_light(group_file, fput_needed);
6661 fd_install(event_fd, event_file); 6275 fd_install(event_fd, event_file);
6662 return event_fd; 6276 return event_fd;
6663 6277
@@ -6667,11 +6281,10 @@ err_context:
6667err_alloc: 6281err_alloc:
6668 free_event(event); 6282 free_event(event);
6669err_task: 6283err_task:
6670 put_online_cpus();
6671 if (task) 6284 if (task)
6672 put_task_struct(task); 6285 put_task_struct(task);
6673err_group_fd: 6286err_group_fd:
6674 fdput(group); 6287 fput_light(group_file, fput_needed);
6675err_fd: 6288err_fd:
6676 put_unused_fd(event_fd); 6289 put_unused_fd(event_fd);
6677 return err; 6290 return err;
@@ -6711,6 +6324,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6711 goto err_free; 6324 goto err_free;
6712 } 6325 }
6713 6326
6327 event->filp = NULL;
6714 WARN_ON_ONCE(ctx->parent_ctx); 6328 WARN_ON_ONCE(ctx->parent_ctx);
6715 mutex_lock(&ctx->mutex); 6329 mutex_lock(&ctx->mutex);
6716 perf_install_in_context(ctx, event, cpu); 6330 perf_install_in_context(ctx, event, cpu);
@@ -6727,39 +6341,6 @@ err:
6727} 6341}
6728EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6342EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6729 6343
6730void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6731{
6732 struct perf_event_context *src_ctx;
6733 struct perf_event_context *dst_ctx;
6734 struct perf_event *event, *tmp;
6735 LIST_HEAD(events);
6736
6737 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6738 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6739
6740 mutex_lock(&src_ctx->mutex);
6741 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6742 event_entry) {
6743 perf_remove_from_context(event);
6744 put_ctx(src_ctx);
6745 list_add(&event->event_entry, &events);
6746 }
6747 mutex_unlock(&src_ctx->mutex);
6748
6749 synchronize_rcu();
6750
6751 mutex_lock(&dst_ctx->mutex);
6752 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6753 list_del(&event->event_entry);
6754 if (event->state >= PERF_EVENT_STATE_OFF)
6755 event->state = PERF_EVENT_STATE_INACTIVE;
6756 perf_install_in_context(dst_ctx, event, dst_cpu);
6757 get_ctx(dst_ctx);
6758 }
6759 mutex_unlock(&dst_ctx->mutex);
6760}
6761EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
6762
6763static void sync_child_event(struct perf_event *child_event, 6344static void sync_child_event(struct perf_event *child_event,
6764 struct task_struct *child) 6345 struct task_struct *child)
6765{ 6346{
@@ -6792,7 +6373,7 @@ static void sync_child_event(struct perf_event *child_event,
6792 * Release the parent event, if this was the last 6373 * Release the parent event, if this was the last
6793 * reference to it. 6374 * reference to it.
6794 */ 6375 */
6795 put_event(parent_event); 6376 fput(parent_event->filp);
6796} 6377}
6797 6378
6798static void 6379static void
@@ -6868,8 +6449,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6868 * 6449 *
6869 * __perf_event_exit_task() 6450 * __perf_event_exit_task()
6870 * sync_child_event() 6451 * sync_child_event()
6871 * put_event() 6452 * fput(parent_event->filp)
6872 * mutex_lock(&ctx->mutex) 6453 * perf_release()
6454 * mutex_lock(&ctx->mutex)
6873 * 6455 *
6874 * But since its the parent context it won't be the same instance. 6456 * But since its the parent context it won't be the same instance.
6875 */ 6457 */
@@ -6937,7 +6519,7 @@ static void perf_free_event(struct perf_event *event,
6937 list_del_init(&event->child_list); 6519 list_del_init(&event->child_list);
6938 mutex_unlock(&parent->child_mutex); 6520 mutex_unlock(&parent->child_mutex);
6939 6521
6940 put_event(parent); 6522 fput(parent->filp);
6941 6523
6942 perf_group_detach(event); 6524 perf_group_detach(event);
6943 list_del_event(event, ctx); 6525 list_del_event(event, ctx);
@@ -7017,12 +6599,6 @@ inherit_event(struct perf_event *parent_event,
7017 NULL, NULL); 6599 NULL, NULL);
7018 if (IS_ERR(child_event)) 6600 if (IS_ERR(child_event))
7019 return child_event; 6601 return child_event;
7020
7021 if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
7022 free_event(child_event);
7023 return NULL;
7024 }
7025
7026 get_ctx(child_ctx); 6602 get_ctx(child_ctx);
7027 6603
7028 /* 6604 /*
@@ -7064,6 +6640,14 @@ inherit_event(struct perf_event *parent_event,
7064 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 6640 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7065 6641
7066 /* 6642 /*
6643 * Get a reference to the parent filp - we will fput it
6644 * when the child event exits. This is safe to do because
6645 * we are in the parent and we know that the filp still
6646 * exists and has a nonzero count:
6647 */
6648 atomic_long_inc(&parent_event->filp->f_count);
6649
6650 /*
7067 * Link this into the parent event's child list 6651 * Link this into the parent event's child list
7068 */ 6652 */
7069 WARN_ON_ONCE(parent_event->ctx->parent_ctx); 6653 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
@@ -7393,16 +6977,6 @@ void __init perf_event_init(void)
7393 6977
7394 ret = init_hw_breakpoint(); 6978 ret = init_hw_breakpoint();
7395 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6979 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
7396
7397 /* do not patch jump label more than once per second */
7398 jump_label_rate_limit(&perf_sched_events, HZ);
7399
7400 /*
7401 * Build time assertion that we keep the data_head at the intended
7402 * location. IOW, validation we got the __reserved[] size right.
7403 */
7404 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7405 != 1024);
7406} 6980}
7407 6981
7408static int __init perf_event_sysfs_init(void) 6982static int __init perf_event_sysfs_init(void)
@@ -7434,7 +7008,8 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7008device_initcall(perf_event_sysfs_init);
7435 7009
7436#ifdef CONFIG_CGROUP_PERF 7010#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7011static struct cgroup_subsys_state *perf_cgroup_create(
7012 struct cgroup_subsys *ss, struct cgroup *cont)
7438{ 7013{
7439 struct perf_cgroup *jc; 7014 struct perf_cgroup *jc;
7440 7015
@@ -7451,7 +7026,8 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7451 return &jc->css; 7026 return &jc->css;
7452} 7027}
7453 7028
7454static void perf_cgroup_css_free(struct cgroup *cont) 7029static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7030 struct cgroup *cont)
7455{ 7031{
7456 struct perf_cgroup *jc; 7032 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7033 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7467,16 +7043,14 @@ static int __perf_cgroup_move(void *info)
7467 return 0; 7043 return 0;
7468} 7044}
7469 7045
7470static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7046static void
7047perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
7471{ 7048{
7472 struct task_struct *task; 7049 task_function_call(task, __perf_cgroup_move, task);
7473
7474 cgroup_taskset_for_each(task, cgrp, tset)
7475 task_function_call(task, __perf_cgroup_move, task);
7476} 7050}
7477 7051
7478static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7052static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7479 struct task_struct *task) 7053 struct cgroup *old_cgrp, struct task_struct *task)
7480{ 7054{
7481 /* 7055 /*
7482 * cgroup_exit() is called in the copy_process() failure path. 7056 * cgroup_exit() is called in the copy_process() failure path.
@@ -7486,22 +7060,15 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7486 if (!(task->flags & PF_EXITING)) 7060 if (!(task->flags & PF_EXITING))
7487 return; 7061 return;
7488 7062
7489 task_function_call(task, __perf_cgroup_move, task); 7063 perf_cgroup_attach_task(cgrp, task);
7490} 7064}
7491 7065
7492struct cgroup_subsys perf_subsys = { 7066struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7067 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7068 .subsys_id = perf_subsys_id,
7495 .css_alloc = perf_cgroup_css_alloc, 7069 .create = perf_cgroup_create,
7496 .css_free = perf_cgroup_css_free, 7070 .destroy = perf_cgroup_destroy,
7497 .exit = perf_cgroup_exit, 7071 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7072 .attach_task = perf_cgroup_attach_task,
7499
7500 /*
7501 * perf_event cgroup doesn't handle nesting correctly.
7502 * ctx->nr_cgroups adjustments should be propagated through the
7503 * cgroup hierarchy. Fix it and remove the following.
7504 */
7505 .broken_hierarchy = true,
7506}; 7073};
7507#endif /* CONFIG_CGROUP_PERF */ 7074#endif /* CONFIG_CGROUP_PERF */
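
Aside (not part of the diff): several hunks in this file trade static_key_slow_inc()/dec() for the older jump_label_inc()/dec(). Both are counted switches that keep rarely needed instrumentation out of hot paths. The user-space stand-in below captures only the counting semantics, an atomic counter consulted on the fast path, whereas the kernel primitives patch the branch in the instruction stream; every name here is invented.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int sched_events_enabled;     /* counted enable for the hook */

static void event_hook_inc(void) { atomic_fetch_add(&sched_events_enabled, 1); }
static void event_hook_dec(void) { atomic_fetch_sub(&sched_events_enabled, 1); }

static void context_switch(int from, int to)
{
        /* Fast path: skip the instrumentation entirely while no one cares. */
        if (atomic_load(&sched_events_enabled) > 0)
                printf("perf hook: switch %d -> %d\n", from, to);
}

int main(void)
{
        context_switch(1, 2);   /* hook disabled: prints nothing */
        event_hook_inc();       /* first user enables the hook   */
        context_switch(2, 3);   /* hook enabled: prints          */
        event_hook_dec();       /* last user disables it again   */
        context_switch(3, 1);
        return 0;
}
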
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507e..b7971d6f38b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,16 +111,14 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
111 * Count the number of breakpoints of the same type and same task. 111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list. 112 * The given event must be not on the list.
113 */ 113 */
114static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct task_struct *tsk = bp->hw.bp_target; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu)
124 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
125 } 123 }
126 124
@@ -143,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
143 if (!tsk) 141 if (!tsk)
144 slots->pinned += max_task_bp_pinned(cpu, type); 142 slots->pinned += max_task_bp_pinned(cpu, type);
145 else 143 else
146 slots->pinned += task_bp_pinned(cpu, bp, type); 144 slots->pinned += task_bp_pinned(bp, type);
147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
148 146
149 return; 147 return;
@@ -156,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
156 if (!tsk) 154 if (!tsk)
157 nr += max_task_bp_pinned(cpu, type); 155 nr += max_task_bp_pinned(cpu, type);
158 else 156 else
159 nr += task_bp_pinned(cpu, bp, type); 157 nr += task_bp_pinned(bp, type);
160 158
161 if (nr > slots->pinned) 159 if (nr > slots->pinned)
162 slots->pinned = nr; 160 slots->pinned = nr;
@@ -190,7 +188,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
190 int old_idx = 0; 188 int old_idx = 0;
191 int idx = 0; 189 int idx = 0;
192 190
193 old_count = task_bp_pinned(cpu, bp, type); 191 old_count = task_bp_pinned(bp, type);
194 old_idx = old_count - 1; 192 old_idx = old_count - 1;
195 idx = old_idx + weight; 193 idx = old_idx + weight;
196 194
@@ -455,16 +453,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
455 int old_type = bp->attr.bp_type; 453 int old_type = bp->attr.bp_type;
456 int err = 0; 454 int err = 0;
457 455
458 /* 456 perf_event_disable(bp);
459 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
460 * will not be possible to raise IPIs that invoke __perf_event_disable.
461 * So call the function directly after making sure we are targeting the
462 * current task.
463 */
464 if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
465 __perf_event_disable(bp);
466 else
467 perf_event_disable(bp);
468 457
469 bp->attr.bp_addr = attr->bp_addr; 458 bp->attr.bp_addr = attr->bp_addr;
470 bp->attr.bp_type = attr->bp_type; 459 bp->attr.bp_type = attr->bp_type;
@@ -592,12 +581,6 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
592 if (bp->attr.type != PERF_TYPE_BREAKPOINT) 581 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
593 return -ENOENT; 582 return -ENOENT;
594 583
595 /*
596 * no branch sampling for breakpoint events
597 */
598 if (has_branch_stack(bp))
599 return -EOPNOTSUPP;
600
601 err = register_perf_hw_breakpoint(bp); 584 err = register_perf_hw_breakpoint(bp);
602 if (err) 585 if (err)
603 return err; 586 return err;
@@ -630,11 +613,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
630 bp->hw.state = PERF_HES_STOPPED; 613 bp->hw.state = PERF_HES_STOPPED;
631} 614}
632 615
633static int hw_breakpoint_event_idx(struct perf_event *bp)
634{
635 return 0;
636}
637
638static struct pmu perf_breakpoint = { 616static struct pmu perf_breakpoint = {
639 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 617 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
640 618
@@ -644,8 +622,6 @@ static struct pmu perf_breakpoint = {
644 .start = hw_breakpoint_start, 622 .start = hw_breakpoint_start,
645 .stop = hw_breakpoint_stop, 623 .stop = hw_breakpoint_stop,
646 .read = hw_breakpoint_pmu_read, 624 .read = hw_breakpoint_pmu_read,
647
648 .event_idx = hw_breakpoint_event_idx,
649}; 625};
650 626
651int __init init_hw_breakpoint(void) 627int __init init_hw_breakpoint(void)
@@ -675,10 +651,10 @@ int __init init_hw_breakpoint(void)
675 651
676 err_alloc: 652 err_alloc:
677 for_each_possible_cpu(err_cpu) { 653 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
680 if (err_cpu == cpu) 654 if (err_cpu == cpu)
681 break; 655 break;
656 for (i = 0; i < TYPE_MAX; i++)
657 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
682 } 658 }
683 659
684 return -ENOMEM; 660 return -ENOMEM;
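
Aside (not part of the diff): the err_alloc loop at the end of the hw_breakpoint hunk is the common unwind-what-was-allocated-so-far pattern for per-CPU allocations; the two sides of that hunk differ only in whether the failing CPU's partial allocations are also freed before breaking out. A stand-alone version of the pattern, with malloc()/free() and an arbitrary slot count, is sketched below.

#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 8

static void *slot_buf[NR_SLOTS];

static int alloc_all(size_t size)
{
        int i, j;

        for (i = 0; i < NR_SLOTS; i++) {
                slot_buf[i] = malloc(size);
                if (!slot_buf[i])
                        goto err_alloc;
        }
        return 0;

err_alloc:
        /* Undo only the slots that were successfully allocated (0 .. i-1);
         * whether the failing slot itself also gets cleaned up is the
         * detail the two sides of the hunk above handle differently. */
        for (j = 0; j < i; j++) {
                free(slot_buf[j]);
                slot_buf[j] = NULL;
        }
        return -1;
}

int main(void)
{
        int i;

        if (alloc_all(64) == 0) {
                printf("all %d slots allocated\n", NR_SLOTS);
                for (i = 0; i < NR_SLOTS; i++)
                        free(slot_buf[i]);
        }
        return 0;
}
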
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8..09097dd8116 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,11 +1,6 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5#include <linux/uaccess.h>
6
7/* Buffer handling */
8
9#define RING_BUFFER_WRITABLE 0x01 4#define RING_BUFFER_WRITABLE 0x01
10 5
11struct ring_buffer { 6struct ring_buffer {
@@ -27,9 +22,6 @@ struct ring_buffer {
27 local_t lost; /* nr records lost */ 22 local_t lost; /* nr records lost */
28 23
29 long watermark; /* wakeup watermark */ 24 long watermark; /* wakeup watermark */
30 /* poll crap */
31 spinlock_t event_lock;
32 struct list_head event_list;
33 25
34 struct perf_event_mmap_page *user_page; 26 struct perf_event_mmap_page *user_page;
35 void *data_pages[0]; 27 void *data_pages[0];
@@ -72,106 +64,33 @@ static inline int page_order(struct ring_buffer *rb)
72} 64}
73#endif 65#endif
74 66
75static inline unsigned long perf_data_size(struct ring_buffer *rb) 67static unsigned long perf_data_size(struct ring_buffer *rb)
76{ 68{
77 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
78} 70}
79 71
80#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 72static inline void
81static inline unsigned int \ 73__output_copy(struct perf_output_handle *handle,
82func_name(struct perf_output_handle *handle, \ 74 const void *buf, unsigned int len)
83 const void *buf, unsigned int len) \
84{ \
85 unsigned long size, written; \
86 \
87 do { \
88 size = min_t(unsigned long, handle->size, len); \
89 \
90 written = memcpy_func(handle->addr, buf, size); \
91 \
92 len -= written; \
93 handle->addr += written; \
94 buf += written; \
95 handle->size -= written; \
96 if (!handle->size) { \
97 struct ring_buffer *rb = handle->rb; \
98 \
99 handle->page++; \
100 handle->page &= rb->nr_pages - 1; \
101 handle->addr = rb->data_pages[handle->page]; \
102 handle->size = PAGE_SIZE << page_order(rb); \
103 } \
104 } while (len && written == size); \
105 \
106 return len; \
107}
108
109static inline int memcpy_common(void *dst, const void *src, size_t n)
110{ 75{
111 memcpy(dst, src, n); 76 do {
112 return n; 77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
113} 94}
114 95
115DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
116
117#define MEMCPY_SKIP(dst, src, n) (n)
118
119DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
120
121#ifndef arch_perf_out_copy_user
122#define arch_perf_out_copy_user __copy_from_user_inatomic
123#endif
124
125DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
126
127/* Callchain handling */
128extern struct perf_callchain_entry *
129perf_callchain(struct perf_event *event, struct pt_regs *regs);
130extern int get_callchain_buffers(void);
131extern void put_callchain_buffers(void);
132
133static inline int get_recursion_context(int *recursion)
134{
135 int rctx;
136
137 if (in_nmi())
138 rctx = 3;
139 else if (in_irq())
140 rctx = 2;
141 else if (in_softirq())
142 rctx = 1;
143 else
144 rctx = 0;
145
146 if (recursion[rctx])
147 return -1;
148
149 recursion[rctx]++;
150 barrier();
151
152 return rctx;
153}
154
155static inline void put_recursion_context(int *recursion, int rctx)
156{
157 barrier();
158 recursion[rctx]--;
159}
160
161#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
162static inline bool arch_perf_have_user_stack_dump(void)
163{
164 return true;
165}
166
167#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
168#else
169static inline bool arch_perf_have_user_stack_dump(void)
170{
171 return false;
172}
173
174#define perf_user_stack_pointer(regs) 0
175#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
176
177#endif /* _KERNEL_EVENTS_INTERNAL_H */ 96#endif /* _KERNEL_EVENTS_INTERNAL_H */
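
The internal.h hunk above trades the DEFINE_OUTPUT_COPY() macro family for a single open-coded __output_copy(). Either way the copy walks the ring buffer's data_pages[] array and wraps the page index with a mask, which only works because nr_pages is a power of two. A standalone sketch of that wrap-around copy, using invented demo_* types in place of the kernel's ring_buffer and perf_output_handle:

    #include <stddef.h>
    #include <string.h>

    struct demo_rb {
            void   **data_pages;   /* nr_pages entries; nr_pages is a power of two */
            size_t   nr_pages;
            size_t   page_size;
    };

    struct demo_handle {
            struct demo_rb *rb;
            size_t page;           /* index of the page being written */
            char  *addr;           /* write cursor inside that page */
            size_t size;           /* bytes left in that page */
    };

    /* Copy len bytes, advancing page by page and wrapping with a mask,
     * mirroring the loop in __output_copy() above. */
    static void demo_output_copy(struct demo_handle *h, const void *buf, size_t len)
    {
            do {
                    size_t chunk = h->size < len ? h->size : len;

                    memcpy(h->addr, buf, chunk);
                    len     -= chunk;
                    buf      = (const char *)buf + chunk;
                    h->addr += chunk;
                    h->size -= chunk;

                    if (!h->size) {        /* current page exhausted: move on, wrap */
                            h->page = (h->page + 1) & (h->rb->nr_pages - 1);
                            h->addr = h->rb->data_pages[h->page];
                            h->size = h->rb->page_size;
                    }
            } while (len);
    }
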
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff397..a2a29205cc0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -182,16 +182,10 @@ out:
182 return -ENOSPC; 182 return -ENOSPC;
183} 183}
184 184
185unsigned int perf_output_copy(struct perf_output_handle *handle, 185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len) 186 const void *buf, unsigned int len)
187{ 187{
188 return __output_copy(handle, buf, len); 188 __output_copy(handle, buf, len);
189}
190
191unsigned int perf_output_skip(struct perf_output_handle *handle,
192 unsigned int len)
193{
194 return __output_skip(handle, NULL, len);
195} 189}
196 190
197void perf_output_end(struct perf_output_handle *handle) 191void perf_output_end(struct perf_output_handle *handle)
@@ -215,9 +209,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
215 rb->writable = 1; 209 rb->writable = 1;
216 210
217 atomic_set(&rb->refcount, 1); 211 atomic_set(&rb->refcount, 1);
218
219 INIT_LIST_HEAD(&rb->event_list);
220 spin_lock_init(&rb->event_lock);
221} 212}
222 213
223#ifndef CONFIG_PERF_USE_VMALLOC 214#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
deleted file mode 100644
index dea7acfbb07..00000000000
--- a/kernel/events/uprobes.c
+++ /dev/null
@@ -1,1627 +0,0 @@
1/*
2 * User-space Probes (UProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2012
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
23 */
24
25#include <linux/kernel.h>
26#include <linux/highmem.h>
27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h>
29#include <linux/sched.h>
30#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
36#include <linux/percpu-rwsem.h>
37
38#include <linux/uprobes.h>
39
40#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
41#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
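
For scale: assuming a 4 KiB page and the 128-byte UPROBE_XOL_SLOT_BYTES that x86 used around this time (the slot size is per-architecture, so treat 128 as an assumption), UINSNS_PER_PAGE works out to 4096 / 128 = 32, i.e. 32 out-of-line instruction slots per XOL page.
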
42
43static struct rb_root uprobes_tree = RB_ROOT;
44
45static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46
47#define UPROBES_HASH_SZ 13
48
49/*
50 * We need separate register/unregister and mmap/munmap lock hashes because
51 * of mmap_sem nesting.
52 *
53 * uprobe_register() needs to install probes on (potentially) all processes
 54 * and thus needs to acquire multiple mmap_sems (consecutively, not
55 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
56 * for the particular process doing the mmap.
57 *
58 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
59 * because of lock order against i_mmap_mutex. This means there's a hole in
60 * the register vma iteration where a mmap() can happen.
61 *
62 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
63 * install a probe where one is already installed.
64 */
65
66/* serialize (un)register */
67static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
68
69#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
70
71/* serialize uprobe->pending_list */
72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
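
The two hash macros above simply fold a pointer value into one of UPROBES_HASH_SZ mutexes, so unrelated inodes may land in the same bucket; a collision only costs extra serialization, never correctness. A minimal standalone sketch of the same pattern (user-space pthread names, purely illustrative, not kernel code):

    #include <pthread.h>

    #define DEMO_HASH_SZ 13

    static pthread_mutex_t demo_lock[DEMO_HASH_SZ];

    static void demo_locks_init(void)
    {
            int i;

            for (i = 0; i < DEMO_HASH_SZ; i++)
                    pthread_mutex_init(&demo_lock[i], NULL);
    }

    /* Pick a bucket from an opaque key, as uprobes_hash() does with an inode pointer. */
    static pthread_mutex_t *demo_hash(const void *key)
    {
            return &demo_lock[(unsigned long)key % DEMO_HASH_SZ];
    }

    /* All register/unregister work for keys hashing to the same bucket is serialized. */
    static void demo_register(const void *inode_like)
    {
            pthread_mutex_t *m = demo_hash(inode_like);

            pthread_mutex_lock(m);
            /* ... install or remove probes for this inode ... */
            pthread_mutex_unlock(m);
    }
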
74
75static struct percpu_rw_semaphore dup_mmap_sem;
76
77/*
78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
 79 * events active at this time. Probably a fine-grained per-inode count is
80 * better?
81 */
82static atomic_t uprobe_events = ATOMIC_INIT(0);
83
84/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0
 86/* Don't run handlers while the first register / last unregister is in progress */
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2
90
91struct uprobe {
92 struct rb_node rb_node; /* node in the rb tree */
93 atomic_t ref;
94 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
96 struct list_head pending_list;
97 struct uprobe_consumer *consumers;
98 struct inode *inode; /* Also hold a ref to inode */
99 loff_t offset;
100 unsigned long flags;
101 struct arch_uprobe arch;
102};
103
104/*
105 * valid_vma: Verify if the specified vma is an executable vma
106 * Relax restrictions while unregistering: vm_flags might have
107 * changed after breakpoint was inserted.
108 * - is_register: indicates if we are in register context.
109 * - Return 1 if the specified virtual address is in an
110 * executable vma.
111 */
112static bool valid_vma(struct vm_area_struct *vma, bool is_register)
113{
114 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
115
116 if (is_register)
117 flags |= VM_WRITE;
118
119 return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
120}
121
122static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
123{
124 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
125}
126
127static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
128{
129 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
130}
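
A quick worked example with made-up numbers: for a vma with vm_start = 0x400000 and vm_pgoff = 2 on 4 KiB pages, the mapping begins at file offset 2 << 12 = 0x2000, so offset_to_vaddr() turns file offset 0x2350 into 0x400000 + 0x2350 - 0x2000 = 0x400350, and vaddr_to_offset() inverts that back to 0x2350.
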
131
132/**
133 * __replace_page - replace page in vma by new page.
134 * based on replace_page in mm/ksm.c
135 *
136 * @vma: vma that holds the pte pointing to page
137 * @addr: address the old @page is mapped at
 138 * @page: the COWed page we are replacing by kpage
139 * @kpage: the modified page we replace page by
140 *
141 * Returns 0 on success, -EFAULT on failure.
142 */
143static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
144 struct page *page, struct page *kpage)
145{
146 struct mm_struct *mm = vma->vm_mm;
147 spinlock_t *ptl;
148 pte_t *ptep;
149 int err;
150 /* For mmu_notifiers */
151 const unsigned long mmun_start = addr;
152 const unsigned long mmun_end = addr + PAGE_SIZE;
153
154 /* For try_to_free_swap() and munlock_vma_page() below */
155 lock_page(page);
156
157 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
158 err = -EAGAIN;
159 ptep = page_check_address(page, mm, addr, &ptl, 0);
160 if (!ptep)
161 goto unlock;
162
163 get_page(kpage);
164 page_add_new_anon_rmap(kpage, vma, addr);
165
166 if (!PageAnon(page)) {
167 dec_mm_counter(mm, MM_FILEPAGES);
168 inc_mm_counter(mm, MM_ANONPAGES);
169 }
170
171 flush_cache_page(vma, addr, pte_pfn(*ptep));
172 ptep_clear_flush(vma, addr, ptep);
173 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
174
175 page_remove_rmap(page);
176 if (!page_mapped(page))
177 try_to_free_swap(page);
178 pte_unmap_unlock(ptep, ptl);
179
180 if (vma->vm_flags & VM_LOCKED)
181 munlock_vma_page(page);
182 put_page(page);
183
184 err = 0;
185 unlock:
186 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
187 unlock_page(page);
188 return err;
189}
190
191/**
192 * is_swbp_insn - check if instruction is breakpoint instruction.
193 * @insn: instruction to be checked.
194 * Default implementation of is_swbp_insn
195 * Returns true if @insn is a breakpoint instruction.
196 */
197bool __weak is_swbp_insn(uprobe_opcode_t *insn)
198{
199 return *insn == UPROBE_SWBP_INSN;
200}
201
202static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
203{
204 void *kaddr = kmap_atomic(page);
205 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
206 kunmap_atomic(kaddr);
207}
208
209static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
210{
211 uprobe_opcode_t old_opcode;
212 bool is_swbp;
213
214 copy_opcode(page, vaddr, &old_opcode);
215 is_swbp = is_swbp_insn(&old_opcode);
216
217 if (is_swbp_insn(new_opcode)) {
218 if (is_swbp) /* register: already installed? */
219 return 0;
220 } else {
221 if (!is_swbp) /* unregister: was it changed by us? */
222 return 0;
223 }
224
225 return 1;
226}
227
228/*
229 * NOTE:
230 * Expect the breakpoint instruction to be the smallest size instruction for
 231 * the architecture. If an arch has variable-length instructions and the
 232 * breakpoint instruction is not the smallest-length instruction
 233 * supported by that architecture, then we need to modify is_swbp_at_addr and
 234 * write_opcode accordingly. This would never be a problem for archs that
 235 * have fixed-length instructions.
236 */
237
238/*
239 * write_opcode - write the opcode at a given virtual address.
240 * @mm: the probed process address space.
241 * @vaddr: the virtual address to store the opcode.
242 * @opcode: opcode to be written at @vaddr.
243 *
244 * Called with mm->mmap_sem held (for read and with a reference to
245 * mm).
246 *
247 * For mm @mm, write the opcode at @vaddr.
248 * Return 0 (success) or a negative errno.
249 */
250static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
251 uprobe_opcode_t opcode)
252{
253 struct page *old_page, *new_page;
254 void *vaddr_old, *vaddr_new;
255 struct vm_area_struct *vma;
256 int ret;
257
258retry:
259 /* Read the page with vaddr into memory */
260 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
261 if (ret <= 0)
262 return ret;
263
264 ret = verify_opcode(old_page, vaddr, &opcode);
265 if (ret <= 0)
266 goto put_old;
267
268 ret = -ENOMEM;
269 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
270 if (!new_page)
271 goto put_old;
272
273 __SetPageUptodate(new_page);
274
275 /* copy the page now that we've got it stable */
276 vaddr_old = kmap_atomic(old_page);
277 vaddr_new = kmap_atomic(new_page);
278
279 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
280 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
281
282 kunmap_atomic(vaddr_new);
283 kunmap_atomic(vaddr_old);
284
285 ret = anon_vma_prepare(vma);
286 if (ret)
287 goto put_new;
288
289 ret = __replace_page(vma, vaddr, old_page, new_page);
290
291put_new:
292 page_cache_release(new_page);
293put_old:
294 put_page(old_page);
295
296 if (unlikely(ret == -EAGAIN))
297 goto retry;
298 return ret;
299}
300
301/**
302 * set_swbp - store breakpoint at a given address.
303 * @auprobe: arch specific probepoint information.
304 * @mm: the probed process address space.
305 * @vaddr: the virtual address to insert the opcode.
306 *
307 * For mm @mm, store the breakpoint instruction at @vaddr.
308 * Return 0 (success) or a negative errno.
309 */
310int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
311{
312 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
313}
314
315/**
316 * set_orig_insn - Restore the original instruction.
317 * @mm: the probed process address space.
318 * @auprobe: arch specific probepoint information.
319 * @vaddr: the virtual address to insert the opcode.
320 *
321 * For mm @mm, restore the original opcode (opcode) at @vaddr.
322 * Return 0 (success) or a negative errno.
323 */
324int __weak
325set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
326{
327 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
328}
329
330static int match_uprobe(struct uprobe *l, struct uprobe *r)
331{
332 if (l->inode < r->inode)
333 return -1;
334
335 if (l->inode > r->inode)
336 return 1;
337
338 if (l->offset < r->offset)
339 return -1;
340
341 if (l->offset > r->offset)
342 return 1;
343
344 return 0;
345}
346
347static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
348{
349 struct uprobe u = { .inode = inode, .offset = offset };
350 struct rb_node *n = uprobes_tree.rb_node;
351 struct uprobe *uprobe;
352 int match;
353
354 while (n) {
355 uprobe = rb_entry(n, struct uprobe, rb_node);
356 match = match_uprobe(&u, uprobe);
357 if (!match) {
358 atomic_inc(&uprobe->ref);
359 return uprobe;
360 }
361
362 if (match < 0)
363 n = n->rb_left;
364 else
365 n = n->rb_right;
366 }
367 return NULL;
368}
369
370/*
371 * Find a uprobe corresponding to a given inode:offset
372 * Acquires uprobes_treelock
373 */
374static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
375{
376 struct uprobe *uprobe;
377
378 spin_lock(&uprobes_treelock);
379 uprobe = __find_uprobe(inode, offset);
380 spin_unlock(&uprobes_treelock);
381
382 return uprobe;
383}
384
385static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
386{
387 struct rb_node **p = &uprobes_tree.rb_node;
388 struct rb_node *parent = NULL;
389 struct uprobe *u;
390 int match;
391
392 while (*p) {
393 parent = *p;
394 u = rb_entry(parent, struct uprobe, rb_node);
395 match = match_uprobe(uprobe, u);
396 if (!match) {
397 atomic_inc(&u->ref);
398 return u;
399 }
400
401 if (match < 0)
402 p = &parent->rb_left;
403 else
404 p = &parent->rb_right;
405
406 }
407
408 u = NULL;
409 rb_link_node(&uprobe->rb_node, parent, p);
410 rb_insert_color(&uprobe->rb_node, &uprobes_tree);
411 /* get access + creation ref */
412 atomic_set(&uprobe->ref, 2);
413
414 return u;
415}
416
417/*
418 * Acquire uprobes_treelock.
419 * Matching uprobe already exists in rbtree;
420 * increment (access refcount) and return the matching uprobe.
421 *
422 * No matching uprobe; insert the uprobe in rb_tree;
423 * get a double refcount (access + creation) and return NULL.
424 */
425static struct uprobe *insert_uprobe(struct uprobe *uprobe)
426{
427 struct uprobe *u;
428
429 spin_lock(&uprobes_treelock);
430 u = __insert_uprobe(uprobe);
431 spin_unlock(&uprobes_treelock);
432
433 /* For now assume that the instruction need not be single-stepped */
434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
435
436 return u;
437}
438
439static void put_uprobe(struct uprobe *uprobe)
440{
441 if (atomic_dec_and_test(&uprobe->ref))
442 kfree(uprobe);
443}
444
445static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
446{
447 struct uprobe *uprobe, *cur_uprobe;
448
449 uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
450 if (!uprobe)
451 return NULL;
452
453 uprobe->inode = igrab(inode);
454 uprobe->offset = offset;
455 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex);
457
458 /* add to uprobes_tree, sorted on inode:offset */
459 cur_uprobe = insert_uprobe(uprobe);
460
461 /* a uprobe exists for this inode:offset combination */
462 if (cur_uprobe) {
463 kfree(uprobe);
464 uprobe = cur_uprobe;
465 iput(inode);
466 } else {
467 atomic_inc(&uprobe_events);
468 }
469
470 return uprobe;
471}
472
473static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
474{
475 struct uprobe_consumer *uc;
476
477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
478 return;
479
480 down_read(&uprobe->consumer_rwsem);
481 for (uc = uprobe->consumers; uc; uc = uc->next) {
482 if (!uc->filter || uc->filter(uc, current))
483 uc->handler(uc, regs);
484 }
485 up_read(&uprobe->consumer_rwsem);
486}
487
488/* Returns the previous consumer */
489static struct uprobe_consumer *
490consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
491{
492 down_write(&uprobe->consumer_rwsem);
493 uc->next = uprobe->consumers;
494 uprobe->consumers = uc;
495 up_write(&uprobe->consumer_rwsem);
496
497 return uc->next;
498}
499
500/*
501 * For uprobe @uprobe, delete the consumer @uc.
502 * Return true if the @uc is deleted successfully
503 * or return false.
504 */
505static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
506{
507 struct uprobe_consumer **con;
508 bool ret = false;
509
510 down_write(&uprobe->consumer_rwsem);
511 for (con = &uprobe->consumers; *con; con = &(*con)->next) {
512 if (*con == uc) {
513 *con = uc->next;
514 ret = true;
515 break;
516 }
517 }
518 up_write(&uprobe->consumer_rwsem);
519
520 return ret;
521}
522
523static int
524__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
525 unsigned long nbytes, loff_t offset)
526{
527 struct page *page;
528 void *vaddr;
529 unsigned long off;
530 pgoff_t idx;
531
532 if (!filp)
533 return -EINVAL;
534
535 if (!mapping->a_ops->readpage)
536 return -EIO;
537
538 idx = offset >> PAGE_CACHE_SHIFT;
539 off = offset & ~PAGE_MASK;
540
541 /*
542 * Ensure that the page that has the original instruction is
543 * populated and in page-cache.
544 */
545 page = read_mapping_page(mapping, idx, filp);
546 if (IS_ERR(page))
547 return PTR_ERR(page);
548
549 vaddr = kmap_atomic(page);
550 memcpy(insn, vaddr + off, nbytes);
551 kunmap_atomic(vaddr);
552 page_cache_release(page);
553
554 return 0;
555}
556
557static int copy_insn(struct uprobe *uprobe, struct file *filp)
558{
559 struct address_space *mapping;
560 unsigned long nbytes;
561 int bytes;
562
563 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
564 mapping = uprobe->inode->i_mapping;
565
566 /* Instruction at end of binary; copy only available bytes */
567 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
568 bytes = uprobe->inode->i_size - uprobe->offset;
569 else
570 bytes = MAX_UINSN_BYTES;
571
572 /* Instruction at the page-boundary; copy bytes in second page */
573 if (nbytes < bytes) {
574 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
575 bytes - nbytes, uprobe->offset + nbytes);
576 if (err)
577 return err;
578 bytes = nbytes;
579 }
580 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
581}
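
To make the page-boundary case in copy_insn() concrete (illustrative numbers, assuming 4 KiB pages and MAX_UINSN_BYTES = 16 as on x86): with uprobe->offset = 0xff8, nbytes = 0x1000 - 0xff8 = 8, which is less than 16, so the trailing 8 bytes are copied first from the second page into arch.insn + 8, then bytes is clamped to 8 and the leading 8 bytes are copied from the first page.
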
582
583static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
584 struct mm_struct *mm, unsigned long vaddr)
585{
586 int ret = 0;
587
588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
589 return ret;
590
591 mutex_lock(&uprobe->copy_mutex);
592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
593 goto out;
594
595 ret = copy_insn(uprobe, file);
596 if (ret)
597 goto out;
598
599 ret = -ENOTSUPP;
600 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
601 goto out;
602
603 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
604 if (ret)
605 goto out;
606
607 /* write_opcode() assumes we don't cross page boundary */
608 BUG_ON((uprobe->offset & ~PAGE_MASK) +
609 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
610
611 smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613
614 out:
615 mutex_unlock(&uprobe->copy_mutex);
616
617 return ret;
618}
619
620static int
621install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
622 struct vm_area_struct *vma, unsigned long vaddr)
623{
624 bool first_uprobe;
625 int ret;
626
627 /*
 628 * If the probe is being deleted, the unregistering thread could already be
 629 * done with its vma-rmap walk-through. Adding a probe now can be fatal since
 630 * nobody would be able to clean it up. Also we could be on the fork or
 631 * mremap path, where the probe might have already been inserted.
 632 * Hence behave as if the probe already existed.
633 */
634 if (!uprobe->consumers)
635 return 0;
636
637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
638 if (ret)
639 return ret;
640
641 /*
642 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
643 * the task can hit this breakpoint right after __replace_page().
644 */
645 first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
646 if (first_uprobe)
647 set_bit(MMF_HAS_UPROBES, &mm->flags);
648
649 ret = set_swbp(&uprobe->arch, mm, vaddr);
650 if (!ret)
651 clear_bit(MMF_RECALC_UPROBES, &mm->flags);
652 else if (first_uprobe)
653 clear_bit(MMF_HAS_UPROBES, &mm->flags);
654
655 return ret;
656}
657
658static int
659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
660{
661 /* can happen if uprobe_register() fails */
662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr);
667}
668
669/*
670 * There could be threads that have already hit the breakpoint. They
671 * will recheck the current insn and restart if find_uprobe() fails.
672 * See find_active_uprobe().
673 */
674static void delete_uprobe(struct uprobe *uprobe)
675{
676 spin_lock(&uprobes_treelock);
677 rb_erase(&uprobe->rb_node, &uprobes_tree);
678 spin_unlock(&uprobes_treelock);
679 iput(uprobe->inode);
680 put_uprobe(uprobe);
681 atomic_dec(&uprobe_events);
682}
683
684struct map_info {
685 struct map_info *next;
686 struct mm_struct *mm;
687 unsigned long vaddr;
688};
689
690static inline struct map_info *free_map_info(struct map_info *info)
691{
692 struct map_info *next = info->next;
693 kfree(info);
694 return next;
695}
696
697static struct map_info *
698build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
699{
700 unsigned long pgoff = offset >> PAGE_SHIFT;
701 struct vm_area_struct *vma;
702 struct map_info *curr = NULL;
703 struct map_info *prev = NULL;
704 struct map_info *info;
705 int more = 0;
706
707 again:
708 mutex_lock(&mapping->i_mmap_mutex);
709 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
710 if (!valid_vma(vma, is_register))
711 continue;
712
713 if (!prev && !more) {
714 /*
715 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
716 * reclaim. This is optimistic, no harm done if it fails.
717 */
718 prev = kmalloc(sizeof(struct map_info),
719 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
720 if (prev)
721 prev->next = NULL;
722 }
723 if (!prev) {
724 more++;
725 continue;
726 }
727
728 if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
729 continue;
730
731 info = prev;
732 prev = prev->next;
733 info->next = curr;
734 curr = info;
735
736 info->mm = vma->vm_mm;
737 info->vaddr = offset_to_vaddr(vma, offset);
738 }
739 mutex_unlock(&mapping->i_mmap_mutex);
740
741 if (!more)
742 goto out;
743
744 prev = curr;
745 while (curr) {
746 mmput(curr->mm);
747 curr = curr->next;
748 }
749
750 do {
751 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
752 if (!info) {
753 curr = ERR_PTR(-ENOMEM);
754 goto out;
755 }
756 info->next = prev;
757 prev = info;
758 } while (--more);
759
760 goto again;
761 out:
762 while (prev)
763 prev = free_map_info(prev);
764 return curr;
765}
766
767static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
768{
769 struct map_info *info;
770 int err = 0;
771
772 percpu_down_write(&dup_mmap_sem);
773 info = build_map_info(uprobe->inode->i_mapping,
774 uprobe->offset, is_register);
775 if (IS_ERR(info)) {
776 err = PTR_ERR(info);
777 goto out;
778 }
779
780 while (info) {
781 struct mm_struct *mm = info->mm;
782 struct vm_area_struct *vma;
783
784 if (err && is_register)
785 goto free;
786
787 down_write(&mm->mmap_sem);
788 vma = find_vma(mm, info->vaddr);
789 if (!vma || !valid_vma(vma, is_register) ||
790 vma->vm_file->f_mapping->host != uprobe->inode)
791 goto unlock;
792
793 if (vma->vm_start > info->vaddr ||
794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
795 goto unlock;
796
797 if (is_register)
798 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
799 else
800 err |= remove_breakpoint(uprobe, mm, info->vaddr);
801
802 unlock:
803 up_write(&mm->mmap_sem);
804 free:
805 mmput(mm);
806 info = free_map_info(info);
807 }
808 out:
809 percpu_up_write(&dup_mmap_sem);
810 return err;
811}
812
813static int __uprobe_register(struct uprobe *uprobe)
814{
815 return register_for_each_vma(uprobe, true);
816}
817
818static void __uprobe_unregister(struct uprobe *uprobe)
819{
820 if (!register_for_each_vma(uprobe, false))
821 delete_uprobe(uprobe);
822
 823 /* TODO: can't unregister? schedule a worker thread */
824}
825
826/*
827 * uprobe_register - register a probe
828 * @inode: the file in which the probe has to be placed.
829 * @offset: offset from the start of the file.
 830 * @uc: information on how to handle the probe.
831 *
832 * Apart from the access refcount, uprobe_register() takes a creation
 833 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
 834 * inserted into the rbtree (i.e. first consumer for a @inode:@offset
835 * tuple). Creation refcount stops uprobe_unregister from freeing the
836 * @uprobe even before the register operation is complete. Creation
837 * refcount is released when the last @uc for the @uprobe
838 * unregisters.
839 *
 840 * Return errno if it cannot successfully install probes
841 * else return 0 (success)
842 */
843int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
844{
845 struct uprobe *uprobe;
846 int ret;
847
848 if (!inode || !uc || uc->next)
849 return -EINVAL;
850
851 if (offset > i_size_read(inode))
852 return -EINVAL;
853
854 ret = 0;
855 mutex_lock(uprobes_hash(inode));
856 uprobe = alloc_uprobe(inode, offset);
857
858 if (!uprobe) {
859 ret = -ENOMEM;
860 } else if (!consumer_add(uprobe, uc)) {
861 ret = __uprobe_register(uprobe);
862 if (ret) {
863 uprobe->consumers = NULL;
864 __uprobe_unregister(uprobe);
865 } else {
866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
867 }
868 }
869
870 mutex_unlock(uprobes_hash(inode));
871 if (uprobe)
872 put_uprobe(uprobe);
873
874 return ret;
875}
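
A hedged sketch of how an in-kernel client might drive this register/unregister pair. The uprobe_consumer layout matches the uc->handler()/uc->filter() calls in handler_chain() above, but the module, the probed file, and the probe offset are all invented for illustration:

    #include <linux/module.h>
    #include <linux/fs.h>
    #include <linux/namei.h>
    #include <linux/ptrace.h>
    #include <linux/uprobes.h>

    static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
    {
            pr_info("uprobe hit at ip=%lx\n", instruction_pointer(regs));
            return 0;
    }

    static struct uprobe_consumer demo_uc = {
            .handler = demo_handler,
            /* .filter left NULL: the handler runs for every task */
    };

    static struct inode *demo_inode;
    static loff_t demo_offset = 0x400;     /* hypothetical offset of the probed insn */

    static int __init demo_init(void)
    {
            struct path path;
            int ret = kern_path("/bin/true", LOOKUP_FOLLOW, &path);

            if (ret)
                    return ret;
            demo_inode = igrab(path.dentry->d_inode);
            path_put(&path);

            ret = uprobe_register(demo_inode, demo_offset, &demo_uc);
            if (ret)
                    iput(demo_inode);
            return ret;
    }

    static void __exit demo_exit(void)
    {
            uprobe_unregister(demo_inode, demo_offset, &demo_uc);
            iput(demo_inode);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
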
876
877/*
 878 * uprobe_unregister - unregister an already registered probe.
879 * @inode: the file in which the probe has to be removed.
880 * @offset: offset from the start of the file.
881 * @uc: identify which probe if multiple probes are colocated.
882 */
883void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
884{
885 struct uprobe *uprobe;
886
887 if (!inode || !uc)
888 return;
889
890 uprobe = find_uprobe(inode, offset);
891 if (!uprobe)
892 return;
893
894 mutex_lock(uprobes_hash(inode));
895
896 if (consumer_del(uprobe, uc)) {
897 if (!uprobe->consumers) {
898 __uprobe_unregister(uprobe);
899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
900 }
901 }
902
903 mutex_unlock(uprobes_hash(inode));
904 if (uprobe)
905 put_uprobe(uprobe);
906}
907
908static struct rb_node *
909find_node_in_range(struct inode *inode, loff_t min, loff_t max)
910{
911 struct rb_node *n = uprobes_tree.rb_node;
912
913 while (n) {
914 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
915
916 if (inode < u->inode) {
917 n = n->rb_left;
918 } else if (inode > u->inode) {
919 n = n->rb_right;
920 } else {
921 if (max < u->offset)
922 n = n->rb_left;
923 else if (min > u->offset)
924 n = n->rb_right;
925 else
926 break;
927 }
928 }
929
930 return n;
931}
932
933/*
934 * For a given range in vma, build a list of probes that need to be inserted.
935 */
936static void build_probe_list(struct inode *inode,
937 struct vm_area_struct *vma,
938 unsigned long start, unsigned long end,
939 struct list_head *head)
940{
941 loff_t min, max;
942 struct rb_node *n, *t;
943 struct uprobe *u;
944
945 INIT_LIST_HEAD(head);
946 min = vaddr_to_offset(vma, start);
947 max = min + (end - start) - 1;
948
949 spin_lock(&uprobes_treelock);
950 n = find_node_in_range(inode, min, max);
951 if (n) {
952 for (t = n; t; t = rb_prev(t)) {
953 u = rb_entry(t, struct uprobe, rb_node);
954 if (u->inode != inode || u->offset < min)
955 break;
956 list_add(&u->pending_list, head);
957 atomic_inc(&u->ref);
958 }
959 for (t = n; (t = rb_next(t)); ) {
960 u = rb_entry(t, struct uprobe, rb_node);
961 if (u->inode != inode || u->offset > max)
962 break;
963 list_add(&u->pending_list, head);
964 atomic_inc(&u->ref);
965 }
966 }
967 spin_unlock(&uprobes_treelock);
968}
969
970/*
971 * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
972 *
973 * Currently we ignore all errors and always return 0, the callers
974 * can't handle the failure anyway.
975 */
976int uprobe_mmap(struct vm_area_struct *vma)
977{
978 struct list_head tmp_list;
979 struct uprobe *uprobe, *u;
980 struct inode *inode;
981
982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
983 return 0;
984
985 inode = vma->vm_file->f_mapping->host;
986 if (!inode)
987 return 0;
988
989 mutex_lock(uprobes_mmap_hash(inode));
990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
991
992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
993 if (!fatal_signal_pending(current)) {
994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
996 }
997 put_uprobe(uprobe);
998 }
999 mutex_unlock(uprobes_mmap_hash(inode));
1000
1001 return 0;
1002}
1003
1004static bool
1005vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1006{
1007 loff_t min, max;
1008 struct inode *inode;
1009 struct rb_node *n;
1010
1011 inode = vma->vm_file->f_mapping->host;
1012
1013 min = vaddr_to_offset(vma, start);
1014 max = min + (end - start) - 1;
1015
1016 spin_lock(&uprobes_treelock);
1017 n = find_node_in_range(inode, min, max);
1018 spin_unlock(&uprobes_treelock);
1019
1020 return !!n;
1021}
1022
1023/*
1024 * Called in context of a munmap of a vma.
1025 */
1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1027{
1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1029 return;
1030
1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1032 return;
1033
1034 if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1035 test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1036 return;
1037
1038 if (vma_has_uprobes(vma, start, end))
1039 set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1040}
1041
1042/* Slot allocation for XOL */
1043static int xol_add_vma(struct xol_area *area)
1044{
1045 struct mm_struct *mm;
1046 int ret;
1047
1048 area->page = alloc_page(GFP_HIGHUSER);
1049 if (!area->page)
1050 return -ENOMEM;
1051
1052 ret = -EALREADY;
1053 mm = current->mm;
1054
1055 down_write(&mm->mmap_sem);
1056 if (mm->uprobes_state.xol_area)
1057 goto fail;
1058
1059 ret = -ENOMEM;
1060
1061 /* Try to map as high as possible, this is only a hint. */
1062 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1063 if (area->vaddr & ~PAGE_MASK) {
1064 ret = area->vaddr;
1065 goto fail;
1066 }
1067
1068 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1069 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
1070 if (ret)
1071 goto fail;
1072
1073 smp_wmb(); /* pairs with get_xol_area() */
1074 mm->uprobes_state.xol_area = area;
1075 ret = 0;
1076
1077fail:
1078 up_write(&mm->mmap_sem);
1079 if (ret)
1080 __free_page(area->page);
1081
1082 return ret;
1083}
1084
1085static struct xol_area *get_xol_area(struct mm_struct *mm)
1086{
1087 struct xol_area *area;
1088
1089 area = mm->uprobes_state.xol_area;
1090 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1091
1092 return area;
1093}
1094
1095/*
1096 * xol_alloc_area - Allocate process's xol_area.
1097 * This area will be used for storing instructions for execution out of
1098 * line.
1099 *
1100 * Returns the allocated area or NULL.
1101 */
1102static struct xol_area *xol_alloc_area(void)
1103{
1104 struct xol_area *area;
1105
1106 area = kzalloc(sizeof(*area), GFP_KERNEL);
1107 if (unlikely(!area))
1108 return NULL;
1109
1110 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1111
1112 if (!area->bitmap)
1113 goto fail;
1114
1115 init_waitqueue_head(&area->wq);
1116 if (!xol_add_vma(area))
1117 return area;
1118
1119fail:
1120 kfree(area->bitmap);
1121 kfree(area);
1122
1123 return get_xol_area(current->mm);
1124}
1125
1126/*
1127 * uprobe_clear_state - Free the area allocated for slots.
1128 */
1129void uprobe_clear_state(struct mm_struct *mm)
1130{
1131 struct xol_area *area = mm->uprobes_state.xol_area;
1132
1133 if (!area)
1134 return;
1135
1136 put_page(area->page);
1137 kfree(area->bitmap);
1138 kfree(area);
1139}
1140
1141void uprobe_start_dup_mmap(void)
1142{
1143 percpu_down_read(&dup_mmap_sem);
1144}
1145
1146void uprobe_end_dup_mmap(void)
1147{
1148 percpu_up_read(&dup_mmap_sem);
1149}
1150
1151void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1152{
1153 newmm->uprobes_state.xol_area = NULL;
1154
1155 if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1156 set_bit(MMF_HAS_UPROBES, &newmm->flags);
1157 /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1158 set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1159 }
1160}
1161
1162/*
1163 * - search for a free slot.
1164 */
1165static unsigned long xol_take_insn_slot(struct xol_area *area)
1166{
1167 unsigned long slot_addr;
1168 int slot_nr;
1169
1170 do {
1171 slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1172 if (slot_nr < UINSNS_PER_PAGE) {
1173 if (!test_and_set_bit(slot_nr, area->bitmap))
1174 break;
1175
1176 slot_nr = UINSNS_PER_PAGE;
1177 continue;
1178 }
1179 wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1180 } while (slot_nr >= UINSNS_PER_PAGE);
1181
1182 slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1183 atomic_inc(&area->slot_count);
1184
1185 return slot_addr;
1186}
1187
1188/*
 1189 * xol_get_insn_slot - if the task has not been allocated a slot yet,
 1190 * allocate one.
1191 * Returns the allocated slot address or 0.
1192 */
1193static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
1194{
1195 struct xol_area *area;
1196 unsigned long offset;
1197 void *vaddr;
1198
1199 area = get_xol_area(current->mm);
1200 if (!area) {
1201 area = xol_alloc_area();
1202 if (!area)
1203 return 0;
1204 }
1205 current->utask->xol_vaddr = xol_take_insn_slot(area);
1206
1207 /*
 1208 * Initialize the slot if xol_vaddr points to a valid
1209 * instruction slot.
1210 */
1211 if (unlikely(!current->utask->xol_vaddr))
1212 return 0;
1213
1214 current->utask->vaddr = slot_addr;
1215 offset = current->utask->xol_vaddr & ~PAGE_MASK;
1216 vaddr = kmap_atomic(area->page);
1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1218 kunmap_atomic(vaddr);
1219 /*
1220 * We probably need flush_icache_user_range() but it needs vma.
1221 * This should work on supported architectures too.
1222 */
1223 flush_dcache_page(area->page);
1224
1225 return current->utask->xol_vaddr;
1226}
1227
1228/*
1229 * xol_free_insn_slot - If slot was earlier allocated by
1230 * @xol_get_insn_slot(), make the slot available for
1231 * subsequent requests.
1232 */
1233static void xol_free_insn_slot(struct task_struct *tsk)
1234{
1235 struct xol_area *area;
1236 unsigned long vma_end;
1237 unsigned long slot_addr;
1238
1239 if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1240 return;
1241
1242 slot_addr = tsk->utask->xol_vaddr;
1243
1244 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1245 return;
1246
1247 area = tsk->mm->uprobes_state.xol_area;
1248 vma_end = area->vaddr + PAGE_SIZE;
1249 if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1250 unsigned long offset;
1251 int slot_nr;
1252
1253 offset = slot_addr - area->vaddr;
1254 slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1255 if (slot_nr >= UINSNS_PER_PAGE)
1256 return;
1257
1258 clear_bit(slot_nr, area->bitmap);
1259 atomic_dec(&area->slot_count);
1260 if (waitqueue_active(&area->wq))
1261 wake_up(&area->wq);
1262
1263 tsk->utask->xol_vaddr = 0;
1264 }
1265}
1266
1267/**
1268 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1269 * @regs: Reflects the saved state of the task after it has hit a breakpoint
1270 * instruction.
1271 * Return the address of the breakpoint instruction.
1272 */
1273unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1274{
1275 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1276}
1277
1278/*
1279 * Called with no locks held.
 1280 * Called in context of an exiting or an exec-ing thread.
1281 */
1282void uprobe_free_utask(struct task_struct *t)
1283{
1284 struct uprobe_task *utask = t->utask;
1285
1286 if (!utask)
1287 return;
1288
1289 if (utask->active_uprobe)
1290 put_uprobe(utask->active_uprobe);
1291
1292 xol_free_insn_slot(t);
1293 kfree(utask);
1294 t->utask = NULL;
1295}
1296
1297/*
1298 * Called in context of a new clone/fork from copy_process.
1299 */
1300void uprobe_copy_process(struct task_struct *t)
1301{
1302 t->utask = NULL;
1303}
1304
1305/*
1306 * Allocate a uprobe_task object for the task.
1307 * Called when the thread hits a breakpoint for the first time.
1308 *
1309 * Returns:
1310 * - pointer to new uprobe_task on success
1311 * - NULL otherwise
1312 */
1313static struct uprobe_task *add_utask(void)
1314{
1315 struct uprobe_task *utask;
1316
1317 utask = kzalloc(sizeof *utask, GFP_KERNEL);
1318 if (unlikely(!utask))
1319 return NULL;
1320
1321 current->utask = utask;
1322 return utask;
1323}
1324
1325/* Prepare to single-step probed instruction out of line. */
1326static int
1327pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
1328{
1329 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
1330 return 0;
1331
1332 return -EFAULT;
1333}
1334
1335/*
1336 * If we are singlestepping, then ensure this thread is not connected to
1337 * non-fatal signals until completion of singlestep. When xol insn itself
1338 * triggers the signal, restart the original insn even if the task is
1339 * already SIGKILL'ed (since coredump should report the correct ip). This
 1340 * is even more important if the task has a handler for SIGSEGV/etc. The
1341 * _same_ instruction should be repeated again after return from the signal
1342 * handler, and SSTEP can never finish in this case.
1343 */
1344bool uprobe_deny_signal(void)
1345{
1346 struct task_struct *t = current;
1347 struct uprobe_task *utask = t->utask;
1348
1349 if (likely(!utask || !utask->active_uprobe))
1350 return false;
1351
1352 WARN_ON_ONCE(utask->state != UTASK_SSTEP);
1353
1354 if (signal_pending(t)) {
1355 spin_lock_irq(&t->sighand->siglock);
1356 clear_tsk_thread_flag(t, TIF_SIGPENDING);
1357 spin_unlock_irq(&t->sighand->siglock);
1358
1359 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1360 utask->state = UTASK_SSTEP_TRAPPED;
1361 set_tsk_thread_flag(t, TIF_UPROBE);
1362 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1363 }
1364 }
1365
1366 return true;
1367}
1368
1369/*
1370 * Avoid singlestepping the original instruction if the original instruction
1371 * is a NOP or can be emulated.
1372 */
1373static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1374{
1375 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1376 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1377 return true;
1378 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1379 }
1380 return false;
1381}
1382
1383static void mmf_recalc_uprobes(struct mm_struct *mm)
1384{
1385 struct vm_area_struct *vma;
1386
1387 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1388 if (!valid_vma(vma, false))
1389 continue;
1390 /*
1391 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called.
1394 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return;
1397 }
1398
1399 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1400}
1401
1402static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1403{
1404 struct page *page;
1405 uprobe_opcode_t opcode;
1406 int result;
1407
1408 pagefault_disable();
1409 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1410 sizeof(opcode));
1411 pagefault_enable();
1412
1413 if (likely(result == 0))
1414 goto out;
1415
1416 result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1417 if (result < 0)
1418 return result;
1419
1420 copy_opcode(page, vaddr, &opcode);
1421 put_page(page);
1422 out:
1423 return is_swbp_insn(&opcode);
1424}
1425
1426static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1427{
1428 struct mm_struct *mm = current->mm;
1429 struct uprobe *uprobe = NULL;
1430 struct vm_area_struct *vma;
1431
1432 down_read(&mm->mmap_sem);
1433 vma = find_vma(mm, bp_vaddr);
1434 if (vma && vma->vm_start <= bp_vaddr) {
1435 if (valid_vma(vma, false)) {
1436 struct inode *inode = vma->vm_file->f_mapping->host;
1437 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1438
1439 uprobe = find_uprobe(inode, offset);
1440 }
1441
1442 if (!uprobe)
1443 *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
1444 } else {
1445 *is_swbp = -EFAULT;
1446 }
1447
1448 if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
1449 mmf_recalc_uprobes(mm);
1450 up_read(&mm->mmap_sem);
1451
1452 return uprobe;
1453}
1454
1455/*
1456 * Run handler and ask thread to singlestep.
1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1458 */
1459static void handle_swbp(struct pt_regs *regs)
1460{
1461 struct uprobe_task *utask;
1462 struct uprobe *uprobe;
1463 unsigned long bp_vaddr;
1464 int uninitialized_var(is_swbp);
1465
1466 bp_vaddr = uprobe_get_swbp_addr(regs);
1467 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1468
1469 if (!uprobe) {
1470 if (is_swbp > 0) {
1471 /* No matching uprobe; signal SIGTRAP. */
1472 send_sig(SIGTRAP, current, 0);
1473 } else {
1474 /*
1475 * Either we raced with uprobe_unregister() or we can't
1476 * access this memory. The latter is only possible if
1477 * another thread plays with our ->mm. In both cases
1478 * we can simply restart. If this vma was unmapped we
1479 * can pretend this insn was not executed yet and get
1480 * the (correct) SIGSEGV after restart.
1481 */
1482 instruction_pointer_set(regs, bp_vaddr);
1483 }
1484 return;
1485 }
1486 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the
1489 * new and not-yet-analyzed uprobe at the same address, restart.
1490 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart;
1494
1495 utask = current->utask;
1496 if (!utask) {
1497 utask = add_utask();
1498 /* Cannot allocate; re-execute the instruction. */
1499 if (!utask)
1500 goto restart;
1501 }
1502
1503 handler_chain(uprobe, regs);
1504 if (can_skip_sstep(uprobe, regs))
1505 goto out;
1506
1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) {
1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1510 return;
1511 }
1512
1513restart:
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519out:
1520 put_uprobe(uprobe);
1521}
1522
1523/*
1524 * Perform required fix-ups and disable singlestep.
1525 * Allow pending signals to take effect.
1526 */
1527static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1528{
1529 struct uprobe *uprobe;
1530
1531 uprobe = utask->active_uprobe;
1532 if (utask->state == UTASK_SSTEP_ACK)
1533 arch_uprobe_post_xol(&uprobe->arch, regs);
1534 else if (utask->state == UTASK_SSTEP_TRAPPED)
1535 arch_uprobe_abort_xol(&uprobe->arch, regs);
1536 else
1537 WARN_ON_ONCE(1);
1538
1539 put_uprobe(uprobe);
1540 utask->active_uprobe = NULL;
1541 utask->state = UTASK_RUNNING;
1542 xol_free_insn_slot(current);
1543
1544 spin_lock_irq(&current->sighand->siglock);
1545 recalc_sigpending(); /* see uprobe_deny_signal() */
1546 spin_unlock_irq(&current->sighand->siglock);
1547}
1548
1549/*
1550 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1551 * allows the thread to return from interrupt. After that handle_swbp()
1552 * sets utask->active_uprobe.
1553 *
1554 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1555 * and allows the thread to return from interrupt.
1556 *
1557 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1558 * uprobe_notify_resume().
1559 */
1560void uprobe_notify_resume(struct pt_regs *regs)
1561{
1562 struct uprobe_task *utask;
1563
1564 clear_thread_flag(TIF_UPROBE);
1565
1566 utask = current->utask;
1567 if (utask && utask->active_uprobe)
1568 handle_singlestep(utask, regs);
1569 else
1570 handle_swbp(regs);
1571}
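
The architecture side of that hand-off is small. Roughly, as a condensed paraphrase of the x86-style glue (not code from this patch), the return-to-user path checks the flag and calls back in:

    /* Sketch of an arch return-to-user hook; details and signature vary per arch. */
    void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
    {
            if (thread_info_flags & _TIF_UPROBE)
                    uprobe_notify_resume(regs);   /* handle_swbp() / handle_singlestep() */

            /* ... signal delivery, TIF_NOTIFY_RESUME work, and so on ... */
    }
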
1572
1573/*
 1574 * uprobe_pre_sstep_notifier gets called from interrupt context as part of the
 1575 * notifier mechanism. Set the TIF_UPROBE flag and indicate a breakpoint hit.
1576 */
1577int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1578{
1579 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
1580 return 0;
1581
1582 set_thread_flag(TIF_UPROBE);
1583 return 1;
1584}
1585
1586/*
 1587 * uprobe_post_sstep_notifier gets called in interrupt context as part of the
 1588 * notifier mechanism. Set the TIF_UPROBE flag and indicate completion of the singlestep.
1589 */
1590int uprobe_post_sstep_notifier(struct pt_regs *regs)
1591{
1592 struct uprobe_task *utask = current->utask;
1593
1594 if (!current->mm || !utask || !utask->active_uprobe)
1595 /* task is currently not uprobed */
1596 return 0;
1597
1598 utask->state = UTASK_SSTEP_ACK;
1599 set_thread_flag(TIF_UPROBE);
1600 return 1;
1601}
1602
1603static struct notifier_block uprobe_exception_nb = {
1604 .notifier_call = arch_uprobe_exception_notify,
1605 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
1606};
1607
1608static int __init init_uprobes(void)
1609{
1610 int i;
1611
1612 for (i = 0; i < UPROBES_HASH_SZ; i++) {
1613 mutex_init(&uprobes_mutex[i]);
1614 mutex_init(&uprobes_mmap_mutex[i]);
1615 }
1616
1617 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM;
1619
1620 return register_die_notifier(&uprobe_exception_nb);
1621}
1622module_init(init_uprobes);
1623
1624static void __exit exit_uprobes(void)
1625{
1626}
1627module_exit(exit_uprobes);
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df2193721..9e316ae4984 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,8 +51,6 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
55#include <linux/shm.h>
56 54
57#include <asm/uaccess.h> 55#include <asm/uaccess.h>
58#include <asm/unistd.h> 56#include <asm/unistd.h>
@@ -123,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
123 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
124 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
125 */ 123 */
126 sig->utime += tsk->utime; 124 sig->utime = cputime_add(sig->utime, tsk->utime);
127 sig->stime += tsk->stime; 125 sig->stime = cputime_add(sig->stime, tsk->stime);
128 sig->gtime += tsk->gtime; 126 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
129 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
130 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
131 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
@@ -310,6 +308,43 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
310 } 308 }
311} 309}
312 310
311/**
312 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
313 *
314 * If a kernel thread is launched as a result of a system call, or if
315 * it ever exits, it should generally reparent itself to kthreadd so it
316 * isn't in the way of other processes and is correctly cleaned up on exit.
317 *
318 * The various task state such as scheduling policy and priority may have
319 * been inherited from a user process, so we reset them to sane values here.
320 *
321 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
322 */
323static void reparent_to_kthreadd(void)
324{
325 write_lock_irq(&tasklist_lock);
326
327 ptrace_unlink(current);
328 /* Reparent to init */
329 current->real_parent = current->parent = kthreadd_task;
330 list_move_tail(&current->sibling, &current->real_parent->children);
331
332 /* Set the exit signal to SIGCHLD so we signal init on exit */
333 current->exit_signal = SIGCHLD;
334
335 if (task_nice(current) < 0)
336 set_user_nice(current, 0);
337 /* cpus_allowed? */
338 /* rt_priority? */
339 /* signals? */
340 memcpy(current->signal->rlim, init_task.signal->rlim,
341 sizeof(current->signal->rlim));
342
343 atomic_inc(&init_cred.usage);
344 commit_creds(&init_cred);
345 write_unlock_irq(&tasklist_lock);
346}
347
313void __set_special_pids(struct pid *pid) 348void __set_special_pids(struct pid *pid)
314{ 349{
315 struct task_struct *curr = current->group_leader; 350 struct task_struct *curr = current->group_leader;
@@ -321,6 +356,13 @@ void __set_special_pids(struct pid *pid)
321 change_pid(curr, PIDTYPE_PGID, pid); 356 change_pid(curr, PIDTYPE_PGID, pid);
322} 357}
323 358
359static void set_special_pids(struct pid *pid)
360{
361 write_lock_irq(&tasklist_lock);
362 __set_special_pids(pid);
363 write_unlock_irq(&tasklist_lock);
364}
365
324/* 366/*
325 * Let kernel threads use this to say that they allow a certain signal. 367 * Let kernel threads use this to say that they allow a certain signal.
326 * Must not be used if kthread was cloned with CLONE_SIGHAND. 368 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -360,6 +402,149 @@ int disallow_signal(int sig)
360 402
361EXPORT_SYMBOL(disallow_signal); 403EXPORT_SYMBOL(disallow_signal);
362 404
405/*
406 * Put all the gunge required to become a kernel thread without
407 * attached user resources in one place where it belongs.
408 */
409
410void daemonize(const char *name, ...)
411{
412 va_list args;
413 sigset_t blocked;
414
415 va_start(args, name);
416 vsnprintf(current->comm, sizeof(current->comm), name, args);
417 va_end(args);
418
419 /*
420 * If we were started as result of loading a module, close all of the
421 * user space pages. We don't need them, and if we didn't close them
422 * they would be locked into memory.
423 */
424 exit_mm(current);
425 /*
426 * We don't want to have TIF_FREEZE set if the system-wide hibernation
427 * or suspend transition begins right now.
428 */
429 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
430
431 if (current->nsproxy != &init_nsproxy) {
432 get_nsproxy(&init_nsproxy);
433 switch_task_namespaces(current, &init_nsproxy);
434 }
435 set_special_pids(&init_struct_pid);
436 proc_clear_tty(current);
437
438 /* Block and flush all signals */
439 sigfillset(&blocked);
440 sigprocmask(SIG_BLOCK, &blocked, NULL);
441 flush_signals(current);
442
443 /* Become as one with the init task */
444
445 daemonize_fs_struct();
446 exit_files(current);
447 current->files = init_task.files;
448 atomic_inc(&current->files->count);
449
450 reparent_to_kthreadd();
451}
452
453EXPORT_SYMBOL(daemonize);
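
For reference, the calling pattern daemonize() serves looked roughly like this before kthread_run() made it obsolete; the worker below is invented for illustration and assumes the usual allow_signal()/schedule_timeout_interruptible() helpers:

    #include <linux/sched.h>
    #include <linux/signal.h>

    static int demo_worker(void *unused)
    {
            daemonize("demo-worker");      /* shed user-space state, reparent to kthreadd */
            allow_signal(SIGTERM);

            while (!signal_pending(current)) {
                    /* ... periodic work ... */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    /* typically started with:
     *     kernel_thread(demo_worker, NULL, CLONE_FS | CLONE_FILES);
     */
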
454
455static void close_files(struct files_struct * files)
456{
457 int i, j;
458 struct fdtable *fdt;
459
460 j = 0;
461
462 /*
463 * It is safe to dereference the fd table without RCU or
464 * ->file_lock because this is the last reference to the
465 * files structure. But use RCU to shut RCU-lockdep up.
466 */
467 rcu_read_lock();
468 fdt = files_fdtable(files);
469 rcu_read_unlock();
470 for (;;) {
471 unsigned long set;
472 i = j * __NFDBITS;
473 if (i >= fdt->max_fds)
474 break;
475 set = fdt->open_fds->fds_bits[j++];
476 while (set) {
477 if (set & 1) {
478 struct file * file = xchg(&fdt->fd[i], NULL);
479 if (file) {
480 filp_close(file, files);
481 cond_resched();
482 }
483 }
484 i++;
485 set >>= 1;
486 }
487 }
488}
489
490struct files_struct *get_files_struct(struct task_struct *task)
491{
492 struct files_struct *files;
493
494 task_lock(task);
495 files = task->files;
496 if (files)
497 atomic_inc(&files->count);
498 task_unlock(task);
499
500 return files;
501}
502
503void put_files_struct(struct files_struct *files)
504{
505 struct fdtable *fdt;
506
507 if (atomic_dec_and_test(&files->count)) {
508 close_files(files);
509 /*
510 * Free the fd and fdset arrays if we expanded them.
511 * If the fdtable was embedded, pass files for freeing
512 * at the end of the RCU grace period. Otherwise,
513 * you can free files immediately.
514 */
515 rcu_read_lock();
516 fdt = files_fdtable(files);
517 if (fdt != &files->fdtab)
518 kmem_cache_free(files_cachep, files);
519 free_fdtable(fdt);
520 rcu_read_unlock();
521 }
522}
523
524void reset_files_struct(struct files_struct *files)
525{
526 struct task_struct *tsk = current;
527 struct files_struct *old;
528
529 old = tsk->files;
530 task_lock(tsk);
531 tsk->files = files;
532 task_unlock(tsk);
533 put_files_struct(old);
534}
535
536void exit_files(struct task_struct *tsk)
537{
538 struct files_struct * files = tsk->files;
539
540 if (files) {
541 task_lock(tsk);
542 tsk->files = NULL;
543 task_unlock(tsk);
544 put_files_struct(files);
545 }
546}
547
363#ifdef CONFIG_MM_OWNER 548#ifdef CONFIG_MM_OWNER
364/* 549/*
365 * A task is exiting. If it owned this mm, find a new owner for the mm. 550 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -456,7 +641,6 @@ static void exit_mm(struct task_struct * tsk)
456 mm_release(tsk, mm); 641 mm_release(tsk, mm);
457 if (!mm) 642 if (!mm)
458 return; 643 return;
459 sync_mm_rss(mm);
460 /* 644 /*
461 * Serialize with any possible pending coredump. 645 * Serialize with any possible pending coredump.
462 * We must hold mmap_sem around checking core_state 646 * We must hold mmap_sem around checking core_state
@@ -495,17 +679,21 @@ static void exit_mm(struct task_struct * tsk)
495 tsk->mm = NULL; 679 tsk->mm = NULL;
496 up_read(&mm->mmap_sem); 680 up_read(&mm->mmap_sem);
497 enter_lazy_tlb(mm, current); 681 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
685 atomic_dec(&mm->oom_disable_count);
498 task_unlock(tsk); 686 task_unlock(tsk);
499 mm_update_next_owner(mm); 687 mm_update_next_owner(mm);
500 mmput(mm); 688 mmput(mm);
501} 689}
502 690
503/* 691/*
504 * When we die, we re-parent all our children, and try to: 692 * When we die, we re-parent all our children.
505 * 1. give them to another thread in our thread group, if such a member exists 693 * Try to give them to another thread in our thread
506 * 2. give it to the first ancestor process which prctl'd itself as a 694 * group, and if no such member exists, give it to
507 * child_subreaper for its children (like a service manager) 695 * the child reaper process (ie "init") in our pid
508 * 3. give it to the init process (PID 1) in our pid namespace 696 * space.
509 */ 697 */
510static struct task_struct *find_new_reaper(struct task_struct *father) 698static struct task_struct *find_new_reaper(struct task_struct *father)
511 __releases(&tasklist_lock) 699 __releases(&tasklist_lock)
@@ -525,37 +713,17 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
525 713
526 if (unlikely(pid_ns->child_reaper == father)) { 714 if (unlikely(pid_ns->child_reaper == father)) {
527 write_unlock_irq(&tasklist_lock); 715 write_unlock_irq(&tasklist_lock);
528 if (unlikely(pid_ns == &init_pid_ns)) { 716 if (unlikely(pid_ns == &init_pid_ns))
529 panic("Attempted to kill init! exitcode=0x%08x\n", 717 panic("Attempted to kill init!");
530 father->signal->group_exit_code ?:
531 father->exit_code);
532 }
533 718
534 zap_pid_ns_processes(pid_ns); 719 zap_pid_ns_processes(pid_ns);
535 write_lock_irq(&tasklist_lock); 720 write_lock_irq(&tasklist_lock);
536 } else if (father->signal->has_child_subreaper) {
537 struct task_struct *reaper;
538
539 /* 721 /*
540 * Find the first ancestor marked as child_subreaper. 722 * We can not clear ->child_reaper or leave it alone.
541 * Note that the code below checks same_thread_group(reaper, 723 * There may be stealth EXIT_DEAD tasks on ->children,
542 * pid_ns->child_reaper). This is what we need to DTRT in a 724 * forget_original_parent() must move them somewhere.
543 * PID namespace. However we still need the check above, see
544 * http://marc.info/?l=linux-kernel&m=131385460420380
545 */ 725 */
546 for (reaper = father->real_parent; 726 pid_ns->child_reaper = init_pid_ns.child_reaper;
547 reaper != &init_task;
548 reaper = reaper->real_parent) {
549 if (same_thread_group(reaper, pid_ns->child_reaper))
550 break;
551 if (!reaper->signal->is_child_subreaper)
552 continue;
553 thread = reaper;
554 do {
555 if (!(thread->flags & PF_EXITING))
556 return reaper;
557 } while_each_thread(reaper, thread);
558 }
559 } 727 }
560 728
561 return pid_ns->child_reaper; 729 return pid_ns->child_reaper;
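The removed branch above implemented the child_subreaper walk set up by prctl(PR_SET_CHILD_SUBREAPER); the replacement falls straight back to the pid namespace's child reaper. The runnable sketch below shows the userspace-visible effect of the subreaper mechanism on kernels that have it (3.4 and later): an orphaned grandchild is reparented to the nearest subreaper instead of init.

/* Demonstrates the reparenting behaviour discussed in find_new_reaper():
 * with PR_SET_CHILD_SUBREAPER set (Linux >= 3.4), an orphaned grandchild
 * is reparented to this process instead of to the pid-1 reaper. */
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36
#endif

int main(void)
{
	pid_t child;

	/* Mark ourselves as a subreaper for all descendants. */
	if (prctl(PR_SET_CHILD_SUBREAPER, 1) != 0)
		perror("prctl");	/* older kernel: grandchild goes to init */

	child = fork();
	if (child == 0) {
		/* Child: create a grandchild, then exit to orphan it. */
		if (fork() == 0) {
			sleep(1);	/* give the kernel time to reparent us */
			printf("grandchild: new parent is pid %d\n", getppid());
			_exit(0);
		}
		_exit(0);
	}

	waitpid(child, NULL, 0);	/* reap the middle process */
	sleep(2);			/* let the grandchild report and exit */

	/* The orphaned grandchild is now our child; reap it too. */
	while (waitpid(-1, NULL, 0) > 0)
		;
	return 0;
}
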
@@ -653,6 +821,25 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
653 if (group_dead) 821 if (group_dead)
654 kill_orphaned_pgrp(tsk->group_leader, NULL); 822 kill_orphaned_pgrp(tsk->group_leader, NULL);
655 823
824 /* Let father know we died
825 *
826 * Thread signals are configurable, but you aren't going to use
827 * that to send signals to arbitrary processes.
828 * That stops right now.
829 *
830 * If the parent exec id doesn't match the exec id we saved
831 * when we started then we know the parent has changed security
832 * domain.
833 *
834 * If our self_exec id doesn't match our parent_exec_id then
835 * we have changed execution domain as these two values started
836 * the same after a fork.
837 */
838 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
840 tsk->self_exec_id != tsk->parent_exec_id))
841 tsk->exit_signal = SIGCHLD;
842
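The block added above forces exit_signal back to SIGCHLD when the parent's exec id no longer matches, because whatever signal a child requested at clone() time is otherwise delivered to the parent verbatim. The sketch below shows that knob from userspace, creating a child with SIGUSR1 as its termination signal; it only illustrates what exit_signal controls, not the exec-id check itself.

/* Shows what tsk->exit_signal controls: clone() lets the child pick the
 * signal its parent receives on exit (SIGUSR1 here instead of SIGCHLD).
 * This only illustrates the knob, not the parent_exec_id check above. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void on_usr1(int sig)
{
	(void)sig;
	got_usr1 = 1;
}

static int child_fn(void *arg)
{
	(void)arg;
	return 0;		/* exit immediately; parent gets SIGUSR1 */
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid;

	signal(SIGUSR1, on_usr1);

	/* The termination signal is the low byte of the flags argument. */
	pid = clone(child_fn, stack + 64 * 1024, SIGUSR1, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* Children with a non-SIGCHLD exit signal need __WALL to be waited. */
	waitpid(pid, NULL, __WALL);
	printf("parent: got SIGUSR1 = %d\n", (int)got_usr1);
	free(stack);
	return 0;
}
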
656 if (unlikely(tsk->ptrace)) { 843 if (unlikely(tsk->ptrace)) {
657 int sig = thread_group_leader(tsk) && 844 int sig = thread_group_leader(tsk) &&
658 thread_group_empty(tsk) && 845 thread_group_empty(tsk) &&
@@ -692,9 +879,9 @@ static void check_stack_usage(void)
692 879
693 spin_lock(&low_water_lock); 880 spin_lock(&low_water_lock);
694 if (free < lowest_to_date) { 881 if (free < lowest_to_date) {
695 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 882 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
696 "%lu bytes left\n", 883 "left\n",
697 current->comm, task_pid_nr(current), free); 884 current->comm, free);
698 lowest_to_date = free; 885 lowest_to_date = free;
699 } 886 }
700 spin_unlock(&low_water_lock); 887 spin_unlock(&low_water_lock);
@@ -703,7 +890,7 @@ static void check_stack_usage(void)
703static inline void check_stack_usage(void) {} 890static inline void check_stack_usage(void) {}
704#endif 891#endif
705 892
706void do_exit(long code) 893NORET_TYPE void do_exit(long code)
707{ 894{
708 struct task_struct *tsk = current; 895 struct task_struct *tsk = current;
709 int group_dead; 896 int group_dead;
@@ -751,6 +938,8 @@ void do_exit(long code)
751 schedule(); 938 schedule();
752 } 939 }
753 940
941 exit_irq_thread();
942
754 exit_signals(tsk); /* sets PF_EXITING */ 943 exit_signals(tsk); /* sets PF_EXITING */
755 /* 944 /*
756 * tsk->flags are checked in the futex code to protect against 945 * tsk->flags are checked in the futex code to protect against
@@ -767,7 +956,7 @@ void do_exit(long code)
767 acct_update_integrals(tsk); 956 acct_update_integrals(tsk);
768 /* sync mm's RSS info before statistics gathering */ 957 /* sync mm's RSS info before statistics gathering */
769 if (tsk->mm) 958 if (tsk->mm)
770 sync_mm_rss(tsk->mm); 959 sync_mm_rss(tsk, tsk->mm);
771 group_dead = atomic_dec_and_test(&tsk->signal->live); 960 group_dead = atomic_dec_and_test(&tsk->signal->live);
772 if (group_dead) { 961 if (group_dead) {
773 hrtimer_cancel(&tsk->signal->real_timer); 962 hrtimer_cancel(&tsk->signal->real_timer);
@@ -778,7 +967,8 @@ void do_exit(long code)
778 acct_collect(code, group_dead); 967 acct_collect(code, group_dead);
779 if (group_dead) 968 if (group_dead)
780 tty_audit_exit(); 969 tty_audit_exit();
781 audit_free(tsk); 970 if (unlikely(tsk->audit_context))
971 audit_free(tsk);
782 972
783 tsk->exit_code = code; 973 tsk->exit_code = code;
784 taskstats_exit(tsk, group_dead); 974 taskstats_exit(tsk, group_dead);
@@ -793,7 +983,6 @@ void do_exit(long code)
793 exit_shm(tsk); 983 exit_shm(tsk);
794 exit_files(tsk); 984 exit_files(tsk);
795 exit_fs(tsk); 985 exit_fs(tsk);
796 exit_task_work(tsk);
797 check_stack_usage(); 986 check_stack_usage();
798 exit_thread(); 987 exit_thread();
799 988
@@ -847,34 +1036,12 @@ void do_exit(long code)
847 if (tsk->splice_pipe) 1036 if (tsk->splice_pipe)
848 __free_pipe_info(tsk->splice_pipe); 1037 __free_pipe_info(tsk->splice_pipe);
849 1038
850 if (tsk->task_frag.page)
851 put_page(tsk->task_frag.page);
852
853 validate_creds_for_do_exit(tsk); 1039 validate_creds_for_do_exit(tsk);
854 1040
855 preempt_disable(); 1041 preempt_disable();
856 if (tsk->nr_dirtied)
857 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
858 exit_rcu(); 1042 exit_rcu();
859
860 /*
861 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
862 * when the following two conditions become true.
863 * - There is a race condition on mmap_sem (it is acquired by
864 * exit_mm()), and
865 * SMI occurs before setting TASK_RUNNING.
866 * (or the hypervisor of a virtual machine switches to another guest)
867 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
868 *
869 * To avoid it, we have to wait for releasing tsk->pi_lock which
870 * is held by try_to_wake_up()
871 */
872 smp_mb();
873 raw_spin_unlock_wait(&tsk->pi_lock);
874
875 /* causes final put_task_struct in finish_task_switch(). */ 1043 /* causes final put_task_struct in finish_task_switch(). */
876 tsk->state = TASK_DEAD; 1044 tsk->state = TASK_DEAD;
877 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
878 schedule(); 1045 schedule();
879 BUG(); 1046 BUG();
880 /* Avoid "noreturn function does return". */ 1047 /* Avoid "noreturn function does return". */
@@ -884,7 +1051,7 @@ void do_exit(long code)
884 1051
885EXPORT_SYMBOL_GPL(do_exit); 1052EXPORT_SYMBOL_GPL(do_exit);
886 1053
887void complete_and_exit(struct completion *comp, long code) 1054NORET_TYPE void complete_and_exit(struct completion *comp, long code)
888{ 1055{
889 if (comp) 1056 if (comp)
890 complete(comp); 1057 complete(comp);
@@ -903,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
903 * Take down every thread in the group. This is called by fatal signals 1070 * Take down every thread in the group. This is called by fatal signals
904 * as well as by sys_exit_group (below). 1071 * as well as by sys_exit_group (below).
905 */ 1072 */
906void 1073NORET_TYPE void
907do_group_exit(int exit_code) 1074do_group_exit(int exit_code)
908{ 1075{
909 struct signal_struct *sig = current->signal; 1076 struct signal_struct *sig = current->signal;
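As the comment above says, do_group_exit() takes down every thread in the group, which is exactly what a single thread calling the exit_group system call does from userspace. A minimal sketch follows; build it with -pthread, run it and check the shell's exit status: it is 7, and the main thread never gets past its first couple of iterations.

/* One thread calling exit_group() terminates the whole thread group,
 * which is what do_group_exit() implements. Build with -pthread and
 * run "./a.out; echo $?" to see exit status 7. */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *killer(void *arg)
{
	(void)arg;
	sleep(1);
	syscall(SYS_exit_group, 7);	/* ends every thread, status 7 */
	return NULL;			/* never reached */
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, killer, NULL);
	for (;;) {
		printf("main thread still running\n");
		sleep(1);
	}
	/* never reached: the whole group exits with status 7 */
}
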
@@ -1024,7 +1191,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1024 unsigned long state; 1191 unsigned long state;
1025 int retval, status, traced; 1192 int retval, status, traced;
1026 pid_t pid = task_pid_vnr(p); 1193 pid_t pid = task_pid_vnr(p);
1027 uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1194 uid_t uid = __task_cred(p)->uid;
1028 struct siginfo __user *infop; 1195 struct siginfo __user *infop;
1029 1196
1030 if (!likely(wo->wo_flags & WEXITED)) 1197 if (!likely(wo->wo_flags & WEXITED))
@@ -1082,17 +1249,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1082 * as other threads in the parent group can be right 1249 * as other threads in the parent group can be right
1083 * here reaping other children at the same time. 1250 * here reaping other children at the same time.
1084 * 1251 *
1085 * We use thread_group_cputime_adjusted() to get times for the thread 1252 * We use thread_group_times() to get times for the thread
1086 * group, which consolidates times for all threads in the 1253 * group, which consolidates times for all threads in the
1087 * group including the group leader. 1254 * group including the group leader.
1088 */ 1255 */
1089 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1256 thread_group_times(p, &tgutime, &tgstime);
1090 spin_lock_irq(&p->real_parent->sighand->siglock); 1257 spin_lock_irq(&p->real_parent->sighand->siglock);
1091 psig = p->real_parent->signal; 1258 psig = p->real_parent->signal;
1092 sig = p->signal; 1259 sig = p->signal;
1093 psig->cutime += tgutime + sig->cutime; 1260 psig->cutime =
1094 psig->cstime += tgstime + sig->cstime; 1261 cputime_add(psig->cutime,
1095 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1262 cputime_add(tgutime,
1263 sig->cutime));
1264 psig->cstime =
1265 cputime_add(psig->cstime,
1266 cputime_add(tgstime,
1267 sig->cstime));
1268 psig->cgtime =
1269 cputime_add(psig->cgtime,
1270 cputime_add(p->gtime,
1271 cputime_add(sig->gtime,
1272 sig->cgtime)));
1096 psig->cmin_flt += 1273 psig->cmin_flt +=
1097 p->min_flt + sig->min_flt + sig->cmin_flt; 1274 p->min_flt + sig->min_flt + sig->cmin_flt;
1098 psig->cmaj_flt += 1275 psig->cmaj_flt +=
@@ -1237,7 +1414,7 @@ static int wait_task_stopped(struct wait_opts *wo,
1237 if (!unlikely(wo->wo_flags & WNOWAIT)) 1414 if (!unlikely(wo->wo_flags & WNOWAIT))
1238 *p_code = 0; 1415 *p_code = 0;
1239 1416
1240 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1417 uid = task_uid(p);
1241unlock_sig: 1418unlock_sig:
1242 spin_unlock_irq(&p->sighand->siglock); 1419 spin_unlock_irq(&p->sighand->siglock);
1243 if (!exit_code) 1420 if (!exit_code)
@@ -1310,7 +1487,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1310 } 1487 }
1311 if (!unlikely(wo->wo_flags & WNOWAIT)) 1488 if (!unlikely(wo->wo_flags & WNOWAIT))
1312 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1489 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1313 uid = from_kuid_munged(current_user_ns(), task_uid(p)); 1490 uid = task_uid(p);
1314 spin_unlock_irq(&p->sighand->siglock); 1491 spin_unlock_irq(&p->sighand->siglock);
1315 1492
1316 pid = task_pid_vnr(p); 1493 pid = task_pid_vnr(p);
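The wait_task_zombie() hunk above folds the dead child's user and system time into the parent's signal struct (psig->cutime/cstime, via the older opaque cputime_add() API on the right-hand column). Userspace observes that accumulation through getrusage(RUSAGE_CHILDREN) after reaping the child; a small fork-and-reap demo, purely illustrative:

/* The cutime/cstime accumulation done in wait_task_zombie() is what
 * getrusage(RUSAGE_CHILDREN) reports after the child has been reaped.
 * The child burns a little CPU; the parent reaps it and prints the
 * accumulated user time. */
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct rusage ru;
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: spin for a while to accumulate user time. */
		volatile unsigned long x = 0;
		for (unsigned long i = 0; i < 200000000UL; i++)
			x += i;
		_exit(0);
	}

	waitpid(pid, NULL, 0);			/* reap: times are folded in */
	getrusage(RUSAGE_CHILDREN, &ru);	/* read the accumulated values */
	printf("children user time: %ld.%06ld s\n",
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
	return 0;
}
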
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf7..5339705b824 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,16 +35,10 @@ DEFINE_MUTEX(text_mutex);
35extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
36extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
37 37
38/* Cleared by build time tools if the table is already sorted. */
39u32 __initdata main_extable_sort_needed = 1;
40
41/* Sort the kernel's built-in exception table */ 38/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 39void __init sort_main_extable(void)
43{ 40{
44 if (main_extable_sort_needed) 41 sort_extable(__start___ex_table, __stop___ex_table);
45 sort_extable(__start___ex_table, __stop___ex_table);
46 else
47 pr_notice("__ex_table already sorted, skipping sort\n");
48} 42}
49 43
50/* Given an address, look for it in the exception tables. */ 44/* Given an address, look for it in the exception tables. */
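sort_main_extable() orders the built-in exception table once at boot so that later faulting-address lookups can binary-search it; the removed lines merely let a build-time tool skip the sort when the table is already ordered. The userspace sketch below mirrors the sort-then-search idea with qsort()/bsearch() on a small table of {fault address, fixup} pairs; the entry layout is invented for the demo.

/* Userspace analogue of sort_main_extable() plus the fixup lookup:
 * sort a table of {fault address, fixup} entries once, then
 * binary-search it. The entry layout here is invented. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ex_entry {
	uintptr_t insn;		/* faulting instruction address */
	uintptr_t fixup;	/* where to resume */
};

static int cmp_entry(const void *a, const void *b)
{
	const struct ex_entry *ea = a, *eb = b;

	return (ea->insn > eb->insn) - (ea->insn < eb->insn);
}

static int cmp_key(const void *key, const void *elt)
{
	uintptr_t addr = *(const uintptr_t *)key;
	const struct ex_entry *e = elt;

	return (addr > e->insn) - (addr < e->insn);
}

int main(void)
{
	struct ex_entry table[] = {
		{ 0x4000, 0x5000 }, { 0x1000, 0x2000 }, { 0x3000, 0x4500 },
	};
	size_t n = sizeof(table) / sizeof(table[0]);
	uintptr_t fault = 0x3000;
	struct ex_entry *hit;

	/* sort_main_extable() step: order by faulting address. */
	qsort(table, n, sizeof(table[0]), cmp_entry);

	/* search_exception_tables() step: binary search by address. */
	hit = bsearch(&fault, table, n, sizeof(table[0]), cmp_key);
	if (hit)
		printf("fault at %#lx -> fixup %#lx\n",
		       (unsigned long)fault, (unsigned long)hit->fixup);
	return 0;
}
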
diff --git a/kernel/fork.c b/kernel/fork.c
index 65ca6d27f24..f65fa0627c0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,7 +34,6 @@
34#include <linux/cgroup.h> 34#include <linux/cgroup.h>
35#include <linux/security.h> 35#include <linux/security.h>
36#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
37#include <linux/seccomp.h>
38#include <linux/swap.h> 37#include <linux/swap.h>
39#include <linux/syscalls.h> 38#include <linux/syscalls.h>
40#include <linux/jiffies.h> 39#include <linux/jiffies.h>
@@ -48,7 +47,6 @@
48#include <linux/audit.h> 47#include <linux/audit.h>
49#include <linux/memcontrol.h> 48#include <linux/memcontrol.h>
50#include <linux/ftrace.h> 49#include <linux/ftrace.h>
51#include <linux/proc_fs.h>
52#include <linux/profile.h> 50#include <linux/profile.h>
53#include <linux/rmap.h> 51#include <linux/rmap.h>
54#include <linux/ksm.h> 52#include <linux/ksm.h>
@@ -68,8 +66,6 @@
68#include <linux/user-return-notifier.h> 66#include <linux/user-return-notifier.h>
69#include <linux/oom.h> 67#include <linux/oom.h>
70#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
71#include <linux/signalfd.h>
72#include <linux/uprobes.h>
73 69
74#include <asm/pgtable.h> 70#include <asm/pgtable.h>
75#include <asm/pgalloc.h> 71#include <asm/pgalloc.h>
@@ -80,9 +76,6 @@
80 76
81#include <trace/events/sched.h> 77#include <trace/events/sched.h>
82 78
83#define CREATE_TRACE_POINTS
84#include <trace/events/task.h>
85
86/* 79/*
87 * Protected counters by write_lock_irq(&tasklist_lock) 80 * Protected counters by write_lock_irq(&tasklist_lock)
88 */ 81 */
@@ -114,69 +107,32 @@ int nr_processes(void)
114 return total; 107 return total;
115} 108}
116 109
117void __weak arch_release_task_struct(struct task_struct *tsk) 110#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
118{ 111# define alloc_task_struct_node(node) \
119} 112 kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
120 113# define free_task_struct(tsk) \
121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 114 kmem_cache_free(task_struct_cachep, (tsk))
122static struct kmem_cache *task_struct_cachep; 115static struct kmem_cache *task_struct_cachep;
123
124static inline struct task_struct *alloc_task_struct_node(int node)
125{
126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
127}
128
129static inline void free_task_struct(struct task_struct *tsk)
130{
131 kmem_cache_free(task_struct_cachep, tsk);
132}
133#endif 116#endif
134 117
135void __weak arch_release_thread_info(struct thread_info *ti) 118#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
136{
137}
138
139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
140
141/*
142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
143 * kmemcache based allocator.
144 */
145# if THREAD_SIZE >= PAGE_SIZE
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 119static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 120 int node)
148{ 121{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 122#ifdef CONFIG_DEBUG_STACK_USAGE
150 THREAD_SIZE_ORDER); 123 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
124#else
125 gfp_t mask = GFP_KERNEL;
126#endif
127 struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
151 128
152 return page ? page_address(page) : NULL; 129 return page ? page_address(page) : NULL;
153} 130}
154 131
155static inline void free_thread_info(struct thread_info *ti) 132static inline void free_thread_info(struct thread_info *ti)
156{ 133{
157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 134 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158}
159# else
160static struct kmem_cache *thread_info_cache;
161
162static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
163 int node)
164{
165 return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
166}
167
168static void free_thread_info(struct thread_info *ti)
169{
170 kmem_cache_free(thread_info_cache, ti);
171} 135}
172
173void thread_info_cache_init(void)
174{
175 thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
176 THREAD_SIZE, 0, NULL);
177 BUG_ON(thread_info_cache == NULL);
178}
179# endif
180#endif 136#endif
181 137
182/* SLAB cache for signal_struct structures (tsk->signal) */ 138/* SLAB cache for signal_struct structures (tsk->signal) */
@@ -197,6 +153,9 @@ struct kmem_cache *vm_area_cachep;
197/* SLAB cache for mm_struct structures (tsk->mm) */ 153/* SLAB cache for mm_struct structures (tsk->mm) */
198static struct kmem_cache *mm_cachep; 154static struct kmem_cache *mm_cachep;
199 155
156/* Notifier list called when a task struct is freed */
157static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
158
200static void account_kernel_stack(struct thread_info *ti, int account) 159static void account_kernel_stack(struct thread_info *ti, int account)
201{ 160{
202 struct zone *zone = page_zone(virt_to_page(ti)); 161 struct zone *zone = page_zone(virt_to_page(ti));
@@ -206,13 +165,11 @@ static void account_kernel_stack(struct thread_info *ti, int account)
206 165
207void free_task(struct task_struct *tsk) 166void free_task(struct task_struct *tsk)
208{ 167{
168 prop_local_destroy_single(&tsk->dirties);
209 account_kernel_stack(tsk->stack, -1); 169 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
211 free_thread_info(tsk->stack); 170 free_thread_info(tsk->stack);
212 rt_mutex_debug_task_free(tsk); 171 rt_mutex_debug_task_free(tsk);
213 ftrace_graph_exit_task(tsk); 172 ftrace_graph_exit_task(tsk);
214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
216 free_task_struct(tsk); 173 free_task_struct(tsk);
217} 174}
218EXPORT_SYMBOL(free_task); 175EXPORT_SYMBOL(free_task);
@@ -230,27 +187,45 @@ static inline void put_signal_struct(struct signal_struct *sig)
230 free_signal_struct(sig); 187 free_signal_struct(sig);
231} 188}
232 189
190int task_free_register(struct notifier_block *n)
191{
192 return atomic_notifier_chain_register(&task_free_notifier, n);
193}
194EXPORT_SYMBOL(task_free_register);
195
196int task_free_unregister(struct notifier_block *n)
197{
198 return atomic_notifier_chain_unregister(&task_free_notifier, n);
199}
200EXPORT_SYMBOL(task_free_unregister);
201
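The lines added to fork.c introduce a task_free_notifier chain so that other code (the Android/Tegra trees rely on hooks like this) can be called back from __put_task_struct() via task_free_register(). The sketch below shows the general notifier-chain shape (register, unregister, call) in plain single-threaded C; the names are invented, and the kernel's atomic notifier chain additionally handles concurrency, which this demo does not.

/* A tiny, single-threaded sketch of the notifier-chain pattern that
 * task_free_register()/task_free_unregister() expose: callbacks live
 * on a list and all of them run when the chain is called. */
#include <stdio.h>

struct notifier_block_demo {
	int (*call)(struct notifier_block_demo *nb, void *data);
	struct notifier_block_demo *next;
};

static struct notifier_block_demo *chain_head;

static void chain_register(struct notifier_block_demo *nb)
{
	nb->next = chain_head;
	chain_head = nb;
}

static void chain_unregister(struct notifier_block_demo *nb)
{
	struct notifier_block_demo **p = &chain_head;

	while (*p && *p != nb)
		p = &(*p)->next;
	if (*p)
		*p = nb->next;
}

static void chain_call(void *data)
{
	struct notifier_block_demo *nb;

	for (nb = chain_head; nb; nb = nb->next)
		nb->call(nb, data);
}

static int on_task_free(struct notifier_block_demo *nb, void *data)
{
	(void)nb;
	printf("task %p is being freed\n", data);
	return 0;
}

int main(void)
{
	struct notifier_block_demo nb = { .call = on_task_free };
	int fake_task;

	chain_register(&nb);
	chain_call(&fake_task);		/* fires on_task_free */
	chain_unregister(&nb);
	chain_call(&fake_task);		/* nothing registered any more */
	return 0;
}
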
233void __put_task_struct(struct task_struct *tsk) 202void __put_task_struct(struct task_struct *tsk)
234{ 203{
235 WARN_ON(!tsk->exit_state); 204 WARN_ON(!tsk->exit_state);
236 WARN_ON(atomic_read(&tsk->usage)); 205 WARN_ON(atomic_read(&tsk->usage));
237 WARN_ON(tsk == current); 206 WARN_ON(tsk == current);
238 207
239 security_task_free(tsk);
240 exit_creds(tsk); 208 exit_creds(tsk);
241 delayacct_tsk_free(tsk); 209 delayacct_tsk_free(tsk);
242 put_signal_struct(tsk->signal); 210 put_signal_struct(tsk->signal);
243 211
212 atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
244 if (!profile_handoff_task(tsk)) 213 if (!profile_handoff_task(tsk))
245 free_task(tsk); 214 free_task(tsk);
246} 215}
247EXPORT_SYMBOL_GPL(__put_task_struct); 216EXPORT_SYMBOL_GPL(__put_task_struct);
248 217
249void __init __weak arch_task_cache_init(void) { } 218/*
219 * macro override instead of weak attribute alias, to workaround
220 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
221 */
222#ifndef arch_task_cache_init
223#define arch_task_cache_init()
224#endif
250 225
251void __init fork_init(unsigned long mempages) 226void __init fork_init(unsigned long mempages)
252{ 227{
253#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 228#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
254#ifndef ARCH_MIN_TASKALIGN 229#ifndef ARCH_MIN_TASKALIGN
255#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES 230#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
256#endif 231#endif
@@ -297,20 +272,28 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
297 int node = tsk_fork_get_node(orig); 272 int node = tsk_fork_get_node(orig);
298 int err; 273 int err;
299 274
275 prepare_to_copy(orig);
276
300 tsk = alloc_task_struct_node(node); 277 tsk = alloc_task_struct_node(node);
301 if (!tsk) 278 if (!tsk)
302 return NULL; 279 return NULL;
303 280
304 ti = alloc_thread_info_node(tsk, node); 281 ti = alloc_thread_info_node(tsk, node);
305 if (!ti) 282 if (!ti) {
306 goto free_tsk; 283 free_task_struct(tsk);
284 return NULL;
285 }
307 286
308 err = arch_dup_task_struct(tsk, orig); 287 err = arch_dup_task_struct(tsk, orig);
309 if (err) 288 if (err)
310 goto free_ti; 289 goto out;
311 290
312 tsk->stack = ti; 291 tsk->stack = ti;
313 292
293 err = prop_local_init_single(&tsk->dirties);
294 if (err)
295 goto out;
296
314 setup_thread_stack(tsk, orig); 297 setup_thread_stack(tsk, orig);
315 clear_user_return_notifier(tsk); 298 clear_user_return_notifier(tsk);
316 clear_tsk_need_resched(tsk); 299 clear_tsk_need_resched(tsk);
@@ -330,15 +313,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330 tsk->btrace_seq = 0; 313 tsk->btrace_seq = 0;
331#endif 314#endif
332 tsk->splice_pipe = NULL; 315 tsk->splice_pipe = NULL;
333 tsk->task_frag.page = NULL;
334 316
335 account_kernel_stack(ti, 1); 317 account_kernel_stack(ti, 1);
336 318
337 return tsk; 319 return tsk;
338 320
339free_ti: 321out:
340 free_thread_info(ti); 322 free_thread_info(ti);
341free_tsk:
342 free_task_struct(tsk); 323 free_task_struct(tsk);
343 return NULL; 324 return NULL;
344} 325}
@@ -352,10 +333,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
352 unsigned long charge; 333 unsigned long charge;
353 struct mempolicy *pol; 334 struct mempolicy *pol;
354 335
355 uprobe_start_dup_mmap();
356 down_write(&oldmm->mmap_sem); 336 down_write(&oldmm->mmap_sem);
357 flush_cache_dup_mm(oldmm); 337 flush_cache_dup_mm(oldmm);
358 uprobe_dup_mmap(oldmm, mm);
359 /* 338 /*
360 * Not linked in yet - no deadlock potential: 339 * Not linked in yet - no deadlock potential:
361 */ 340 */
@@ -384,15 +363,16 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
384 struct file *file; 363 struct file *file;
385 364
386 if (mpnt->vm_flags & VM_DONTCOPY) { 365 if (mpnt->vm_flags & VM_DONTCOPY) {
366 long pages = vma_pages(mpnt);
367 mm->total_vm -= pages;
387 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 368 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
388 -vma_pages(mpnt)); 369 -pages);
389 continue; 370 continue;
390 } 371 }
391 charge = 0; 372 charge = 0;
392 if (mpnt->vm_flags & VM_ACCOUNT) { 373 if (mpnt->vm_flags & VM_ACCOUNT) {
393 unsigned long len = vma_pages(mpnt); 374 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
394 375 if (security_vm_enough_memory(len))
395 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
396 goto fail_nomem; 376 goto fail_nomem;
397 charge = len; 377 charge = len;
398 } 378 }
@@ -424,12 +404,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
424 mapping->i_mmap_writable++; 404 mapping->i_mmap_writable++;
425 flush_dcache_mmap_lock(mapping); 405 flush_dcache_mmap_lock(mapping);
426 /* insert tmp into the share list, just after mpnt */ 406 /* insert tmp into the share list, just after mpnt */
427 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 407 vma_prio_tree_add(tmp, mpnt);
428 vma_nonlinear_insert(tmp,
429 &mapping->i_mmap_nonlinear);
430 else
431 vma_interval_tree_insert_after(tmp, mpnt,
432 &mapping->i_mmap);
433 flush_dcache_mmap_unlock(mapping); 408 flush_dcache_mmap_unlock(mapping);
434 mutex_unlock(&mapping->i_mmap_mutex); 409 mutex_unlock(&mapping->i_mmap_mutex);
435 } 410 }
@@ -470,7 +445,6 @@ out:
470 up_write(&mm->mmap_sem); 445 up_write(&mm->mmap_sem);
471 flush_tlb_mm(oldmm); 446 flush_tlb_mm(oldmm);
472 up_write(&oldmm->mmap_sem); 447 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
474 return retval; 448 return retval;
475fail_nomem_anon_vma_fork: 449fail_nomem_anon_vma_fork:
476 mpol_put(pol); 450 mpol_put(pol);
@@ -543,6 +517,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
543 mm->cached_hole_size = ~0UL; 517 mm->cached_hole_size = ~0UL;
544 mm_init_aio(mm); 518 mm_init_aio(mm);
545 mm_init_owner(mm, p); 519 mm_init_owner(mm, p);
520 atomic_set(&mm->oom_disable_count, 0);
546 521
547 if (likely(!mm_alloc_pgd(mm))) { 522 if (likely(!mm_alloc_pgd(mm))) {
548 mm->def_flags = 0; 523 mm->def_flags = 0;
@@ -554,23 +529,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
554 return NULL; 529 return NULL;
555} 530}
556 531
557static void check_mm(struct mm_struct *mm)
558{
559 int i;
560
561 for (i = 0; i < NR_MM_COUNTERS; i++) {
562 long x = atomic_long_read(&mm->rss_stat.count[i]);
563
564 if (unlikely(x))
565 printk(KERN_ALERT "BUG: Bad rss-counter state "
566 "mm:%p idx:%d val:%ld\n", mm, i, x);
567 }
568
569#ifdef CONFIG_TRANSPARENT_HUGEPAGE
570 VM_BUG_ON(mm->pmd_huge_pte);
571#endif
572}
573
574/* 532/*
575 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
576 */ 534 */
@@ -598,7 +556,9 @@ void __mmdrop(struct mm_struct *mm)
598 mm_free_pgd(mm); 556 mm_free_pgd(mm);
599 destroy_context(mm); 557 destroy_context(mm);
600 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
601 check_mm(mm); 559#ifdef CONFIG_TRANSPARENT_HUGEPAGE
560 VM_BUG_ON(mm->pmd_huge_pte);
561#endif
602 free_mm(mm); 562 free_mm(mm);
603} 563}
604EXPORT_SYMBOL_GPL(__mmdrop); 564EXPORT_SYMBOL_GPL(__mmdrop);
@@ -611,7 +571,6 @@ void mmput(struct mm_struct *mm)
611 might_sleep(); 571 might_sleep();
612 572
613 if (atomic_dec_and_test(&mm->mm_users)) { 573 if (atomic_dec_and_test(&mm->mm_users)) {
614 uprobe_clear_state(mm);
615 exit_aio(mm); 574 exit_aio(mm);
616 ksm_exit(mm); 575 ksm_exit(mm);
617 khugepaged_exit(mm); /* must run before exit_mmap */ 576 khugepaged_exit(mm); /* must run before exit_mmap */
@@ -622,6 +581,7 @@ void mmput(struct mm_struct *mm)
622 list_del(&mm->mmlist); 581 list_del(&mm->mmlist);
623 spin_unlock(&mmlist_lock); 582 spin_unlock(&mmlist_lock);
624 } 583 }
584 put_swap_token(mm);
625 if (mm->binfmt) 585 if (mm->binfmt)
626 module_put(mm->binfmt->module); 586 module_put(mm->binfmt->module);
627 mmdrop(mm); 587 mmdrop(mm);
@@ -629,6 +589,26 @@ void mmput(struct mm_struct *mm)
629} 589}
630EXPORT_SYMBOL_GPL(mmput); 590EXPORT_SYMBOL_GPL(mmput);
631 591
592/*
593 * We added or removed a vma mapping the executable. The vmas are only mapped
594 * during exec and are not mapped with the mmap system call.
595 * Callers must hold down_write() on the mm's mmap_sem for these
596 */
597void added_exe_file_vma(struct mm_struct *mm)
598{
599 mm->num_exe_file_vmas++;
600}
601
602void removed_exe_file_vma(struct mm_struct *mm)
603{
604 mm->num_exe_file_vmas--;
605 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
606 fput(mm->exe_file);
607 mm->exe_file = NULL;
608 }
609
610}
611
632void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 612void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
633{ 613{
634 if (new_exe_file) 614 if (new_exe_file)
@@ -636,13 +616,15 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
636 if (mm->exe_file) 616 if (mm->exe_file)
637 fput(mm->exe_file); 617 fput(mm->exe_file);
638 mm->exe_file = new_exe_file; 618 mm->exe_file = new_exe_file;
619 mm->num_exe_file_vmas = 0;
639} 620}
640 621
641struct file *get_mm_exe_file(struct mm_struct *mm) 622struct file *get_mm_exe_file(struct mm_struct *mm)
642{ 623{
643 struct file *exe_file; 624 struct file *exe_file;
644 625
645 /* We need mmap_sem to protect against races with removal of exe_file */ 626 /* We need mmap_sem to protect against races with removal of
627 * VM_EXECUTABLE vmas */
646 down_read(&mm->mmap_sem); 628 down_read(&mm->mmap_sem);
647 exe_file = mm->exe_file; 629 exe_file = mm->exe_file;
648 if (exe_file) 630 if (exe_file)
@@ -684,58 +666,6 @@ struct mm_struct *get_task_mm(struct task_struct *task)
684} 666}
685EXPORT_SYMBOL_GPL(get_task_mm); 667EXPORT_SYMBOL_GPL(get_task_mm);
686 668
687struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
688{
689 struct mm_struct *mm;
690 int err;
691
692 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
693 if (err)
694 return ERR_PTR(err);
695
696 mm = get_task_mm(task);
697 if (mm && mm != current->mm &&
698 !ptrace_may_access(task, mode)) {
699 mmput(mm);
700 mm = ERR_PTR(-EACCES);
701 }
702 mutex_unlock(&task->signal->cred_guard_mutex);
703
704 return mm;
705}
706
707static void complete_vfork_done(struct task_struct *tsk)
708{
709 struct completion *vfork;
710
711 task_lock(tsk);
712 vfork = tsk->vfork_done;
713 if (likely(vfork)) {
714 tsk->vfork_done = NULL;
715 complete(vfork);
716 }
717 task_unlock(tsk);
718}
719
720static int wait_for_vfork_done(struct task_struct *child,
721 struct completion *vfork)
722{
723 int killed;
724
725 freezer_do_not_count();
726 killed = wait_for_completion_killable(vfork);
727 freezer_count();
728
729 if (killed) {
730 task_lock(child);
731 child->vfork_done = NULL;
732 task_unlock(child);
733 }
734
735 put_task_struct(child);
736 return killed;
737}
738
739/* Please note the differences between mmput and mm_release. 669/* Please note the differences between mmput and mm_release.
740 * mmput is called whenever we stop holding onto a mm_struct, 670 * mmput is called whenever we stop holding onto a mm_struct,
741 * error success whatever. 671 * error success whatever.
@@ -751,6 +681,8 @@ static int wait_for_vfork_done(struct task_struct *child,
751 */ 681 */
752void mm_release(struct task_struct *tsk, struct mm_struct *mm) 682void mm_release(struct task_struct *tsk, struct mm_struct *mm)
753{ 683{
684 struct completion *vfork_done = tsk->vfork_done;
685
754 /* Get rid of any futexes when releasing the mm */ 686 /* Get rid of any futexes when releasing the mm */
755#ifdef CONFIG_FUTEX 687#ifdef CONFIG_FUTEX
756 if (unlikely(tsk->robust_list)) { 688 if (unlikely(tsk->robust_list)) {
@@ -767,17 +699,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
767 exit_pi_state_list(tsk); 699 exit_pi_state_list(tsk);
768#endif 700#endif
769 701
770 uprobe_free_utask(tsk);
771
772 /* Get rid of any cached register state */ 702 /* Get rid of any cached register state */
773 deactivate_mm(tsk, mm); 703 deactivate_mm(tsk, mm);
774 704
705 /* notify parent sleeping on vfork() */
706 if (vfork_done) {
707 tsk->vfork_done = NULL;
708 complete(vfork_done);
709 }
710
775 /* 711 /*
776 * If we're exiting normally, clear a user-space tid field if 712 * If we're exiting normally, clear a user-space tid field if
777 * requested. We leave this alone when dying by signal, to leave 713 * requested. We leave this alone when dying by signal, to leave
778 * the value intact in a core dump, and to save the unnecessary 714 * the value intact in a core dump, and to save the unnecessary
779 * trouble, say, a killed vfork parent shouldn't touch this mm. 715 * trouble otherwise. Userland only wants this done for a sys_exit.
780 * Userland only wants this done for a sys_exit.
781 */ 716 */
782 if (tsk->clear_child_tid) { 717 if (tsk->clear_child_tid) {
783 if (!(tsk->flags & PF_SIGNALED) && 718 if (!(tsk->flags & PF_SIGNALED) &&
@@ -792,13 +727,6 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
792 } 727 }
793 tsk->clear_child_tid = NULL; 728 tsk->clear_child_tid = NULL;
794 } 729 }
795
796 /*
797 * All done, finally we can wake up parent and return this mm to him.
798 * Also kthread_stop() uses this completion for synchronization.
799 */
800 if (tsk->vfork_done)
801 complete_vfork_done(tsk);
802} 730}
803 731
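mm_release() is also where clear_child_tid is honoured: on a normal exit the kernel zeroes the user-supplied TID word and issues a futex wake, which is the primitive pthread_join() is built on (the clearing is only observable, and only done, when the address space is shared). Below is a raw-syscall sketch of that contract using CLONE_VM | CLONE_CHILD_CLEARTID and FUTEX_WAIT directly; treat it as illustrative, not production code.

/* The clear_child_tid contract handled in mm_release(): the child is
 * created with CLONE_CHILD_CLEARTID pointing at `ctid`; when it exits,
 * the kernel stores 0 there and futex-wakes any waiter. CLONE_VM is
 * needed so the parent sees the cleared word. Raw syscalls, demo only. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t ctid;	/* kernel writes the child TID here, then clears it */

static int child_fn(void *arg)
{
	(void)arg;
	sleep(1);
	return 0;	/* on exit the kernel zeroes ctid and wakes waiters */
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid;

	pid = clone(child_fn, stack + 64 * 1024,
		    CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD,
		    NULL, NULL, NULL, &ctid);
	if (pid < 0) {
		perror("clone");
		return 1;
	}

	/* Wait until the kernel clears ctid on child exit (futex wake). */
	while (ctid != 0)
		syscall(SYS_futex, &ctid, FUTEX_WAIT, ctid, NULL, NULL, 0);

	printf("child %d exited, ctid cleared by the kernel\n", (int)pid);
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}
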
804/* 732/*
@@ -820,12 +748,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
820 memcpy(mm, oldmm, sizeof(*mm)); 748 memcpy(mm, oldmm, sizeof(*mm));
821 mm_init_cpumask(mm); 749 mm_init_cpumask(mm);
822 750
751 /* Initializing for Swap token stuff */
752 mm->token_priority = 0;
753 mm->last_interval = 0;
754
823#ifdef CONFIG_TRANSPARENT_HUGEPAGE 755#ifdef CONFIG_TRANSPARENT_HUGEPAGE
824 mm->pmd_huge_pte = NULL; 756 mm->pmd_huge_pte = NULL;
825#endif 757#endif
826#ifdef CONFIG_NUMA_BALANCING 758
827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
829 if (!mm_init(mm, tsk)) 759 if (!mm_init(mm, tsk))
830 goto fail_nomem; 760 goto fail_nomem;
831 761
@@ -899,6 +829,12 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
899 goto fail_nomem; 829 goto fail_nomem;
900 830
901good_mm: 831good_mm:
832 /* Initializing for Swap token stuff */
833 mm->token_priority = 0;
834 mm->last_interval = 0;
835 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
836 atomic_inc(&mm->oom_disable_count);
837
902 tsk->mm = mm; 838 tsk->mm = mm;
903 tsk->active_mm = mm; 839 tsk->active_mm = mm;
904 return 0; 840 return 0;
@@ -958,7 +894,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
958{ 894{
959#ifdef CONFIG_BLOCK 895#ifdef CONFIG_BLOCK
960 struct io_context *ioc = current->io_context; 896 struct io_context *ioc = current->io_context;
961 struct io_context *new_ioc;
962 897
963 if (!ioc) 898 if (!ioc)
964 return 0; 899 return 0;
@@ -966,15 +901,15 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
966 * Share io context with parent, if CLONE_IO is set 901 * Share io context with parent, if CLONE_IO is set
967 */ 902 */
968 if (clone_flags & CLONE_IO) { 903 if (clone_flags & CLONE_IO) {
969 ioc_task_link(ioc); 904 tsk->io_context = ioc_task_link(ioc);
970 tsk->io_context = ioc; 905 if (unlikely(!tsk->io_context))
906 return -ENOMEM;
971 } else if (ioprio_valid(ioc->ioprio)) { 907 } else if (ioprio_valid(ioc->ioprio)) {
972 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); 908 tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
973 if (unlikely(!new_ioc)) 909 if (unlikely(!tsk->io_context))
974 return -ENOMEM; 910 return -ENOMEM;
975 911
976 new_ioc->ioprio = ioc->ioprio; 912 tsk->io_context->ioprio = ioc->ioprio;
977 put_io_context(new_ioc);
978 } 913 }
979#endif 914#endif
980 return 0; 915 return 0;
@@ -999,10 +934,8 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
999 934
1000void __cleanup_sighand(struct sighand_struct *sighand) 935void __cleanup_sighand(struct sighand_struct *sighand)
1001{ 936{
1002 if (atomic_dec_and_test(&sighand->count)) { 937 if (atomic_dec_and_test(&sighand->count))
1003 signalfd_cleanup(sighand);
1004 kmem_cache_free(sighand_cachep, sighand); 938 kmem_cache_free(sighand_cachep, sighand);
1005 }
1006} 939}
1007 940
1008 941
@@ -1044,6 +977,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1044 atomic_set(&sig->live, 1); 977 atomic_set(&sig->live, 1);
1045 atomic_set(&sig->sigcnt, 1); 978 atomic_set(&sig->sigcnt, 1);
1046 init_waitqueue_head(&sig->wait_chldexit); 979 init_waitqueue_head(&sig->wait_chldexit);
980 if (clone_flags & CLONE_NEWPID)
981 sig->flags |= SIGNAL_UNKILLABLE;
1047 sig->curr_target = tsk; 982 sig->curr_target = tsk;
1048 init_sigpending(&sig->shared_pending); 983 init_sigpending(&sig->shared_pending);
1049 INIT_LIST_HEAD(&sig->posix_timers); 984 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1061,15 +996,13 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1061 sched_autogroup_fork(sig); 996 sched_autogroup_fork(sig);
1062 997
1063#ifdef CONFIG_CGROUPS 998#ifdef CONFIG_CGROUPS
1064 init_rwsem(&sig->group_rwsem); 999 init_rwsem(&sig->threadgroup_fork_lock);
1065#endif 1000#endif
1066 1001
1002 sig->oom_adj = current->signal->oom_adj;
1067 sig->oom_score_adj = current->signal->oom_score_adj; 1003 sig->oom_score_adj = current->signal->oom_score_adj;
1068 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1004 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1069 1005
1070 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1071 current->signal->is_child_subreaper;
1072
1073 mutex_init(&sig->cred_guard_mutex); 1006 mutex_init(&sig->cred_guard_mutex);
1074 1007
1075 return 0; 1008 return 0;
@@ -1081,7 +1014,9 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1081 1014
1082 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1015 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1083 new_flags |= PF_FORKNOEXEC; 1016 new_flags |= PF_FORKNOEXEC;
1017 new_flags |= PF_STARTING;
1084 p->flags = new_flags; 1018 p->flags = new_flags;
1019 clear_freeze_flag(p);
1085} 1020}
1086 1021
1087SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1022SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1112,8 +1047,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1112 */ 1047 */
1113static void posix_cpu_timers_init(struct task_struct *tsk) 1048static void posix_cpu_timers_init(struct task_struct *tsk)
1114{ 1049{
1115 tsk->cputime_expires.prof_exp = 0; 1050 tsk->cputime_expires.prof_exp = cputime_zero;
1116 tsk->cputime_expires.virt_exp = 0; 1051 tsk->cputime_expires.virt_exp = cputime_zero;
1117 tsk->cputime_expires.sched_exp = 0; 1052 tsk->cputime_expires.sched_exp = 0;
1118 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1053 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1119 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1054 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1130,6 +1065,7 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1130 */ 1065 */
1131static struct task_struct *copy_process(unsigned long clone_flags, 1066static struct task_struct *copy_process(unsigned long clone_flags,
1132 unsigned long stack_start, 1067 unsigned long stack_start,
1068 struct pt_regs *regs,
1133 unsigned long stack_size, 1069 unsigned long stack_size,
1134 int __user *child_tidptr, 1070 int __user *child_tidptr,
1135 struct pid *pid, 1071 struct pid *pid,
@@ -1137,6 +1073,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1137{ 1073{
1138 int retval; 1074 int retval;
1139 struct task_struct *p; 1075 struct task_struct *p;
1076 int cgroup_callbacks_done = 0;
1140 1077
1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1078 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1142 return ERR_PTR(-EINVAL); 1079 return ERR_PTR(-EINVAL);
@@ -1166,14 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1166 current->signal->flags & SIGNAL_UNKILLABLE) 1103 current->signal->flags & SIGNAL_UNKILLABLE)
1167 return ERR_PTR(-EINVAL); 1104 return ERR_PTR(-EINVAL);
1168 1105
1169 /*
1170 * If the new process will be in a different pid namespace
1171 * don't allow the creation of threads.
1172 */
1173 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1174 (task_active_pid_ns(current) != current->nsproxy->pid_ns))
1175 return ERR_PTR(-EINVAL);
1176
1177 retval = security_task_create(clone_flags); 1106 retval = security_task_create(clone_flags);
1178 if (retval) 1107 if (retval)
1179 goto fork_out; 1108 goto fork_out;
@@ -1184,7 +1113,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1184 goto fork_out; 1113 goto fork_out;
1185 1114
1186 ftrace_graph_init_task(p); 1115 ftrace_graph_init_task(p);
1187 get_seccomp_filter(p);
1188 1116
1189 rt_mutex_init_task(p); 1117 rt_mutex_init_task(p);
1190 1118
@@ -1228,10 +1156,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1228 1156
1229 init_sigpending(&p->pending); 1157 init_sigpending(&p->pending);
1230 1158
1231 p->utime = p->stime = p->gtime = 0; 1159 p->utime = cputime_zero;
1232 p->utimescaled = p->stimescaled = 0; 1160 p->stime = cputime_zero;
1161 p->gtime = cputime_zero;
1162 p->utimescaled = cputime_zero;
1163 p->stimescaled = cputime_zero;
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1164#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1165 p->prev_utime = cputime_zero;
1166 p->prev_stime = cputime_zero;
1235#endif 1167#endif
1236#if defined(SPLIT_RSS_COUNTING) 1168#if defined(SPLIT_RSS_COUNTING)
1237 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1169 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1250,7 +1182,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1250 p->io_context = NULL; 1182 p->io_context = NULL;
1251 p->audit_context = NULL; 1183 p->audit_context = NULL;
1252 if (clone_flags & CLONE_THREAD) 1184 if (clone_flags & CLONE_THREAD)
1253 threadgroup_change_begin(current); 1185 threadgroup_fork_read_lock(current);
1254 cgroup_fork(p); 1186 cgroup_fork(p);
1255#ifdef CONFIG_NUMA 1187#ifdef CONFIG_NUMA
1256 p->mempolicy = mpol_dup(p->mempolicy); 1188 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1264,11 +1196,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1264#ifdef CONFIG_CPUSETS 1196#ifdef CONFIG_CPUSETS
1265 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1197 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1266 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1198 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1267 seqcount_init(&p->mems_allowed_seq);
1268#endif 1199#endif
1269#ifdef CONFIG_TRACE_IRQFLAGS 1200#ifdef CONFIG_TRACE_IRQFLAGS
1270 p->irq_events = 0; 1201 p->irq_events = 0;
1202#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1203 p->hardirqs_enabled = 1;
1204#else
1271 p->hardirqs_enabled = 0; 1205 p->hardirqs_enabled = 0;
1206#endif
1272 p->hardirq_enable_ip = 0; 1207 p->hardirq_enable_ip = 0;
1273 p->hardirq_enable_event = 0; 1208 p->hardirq_enable_event = 0;
1274 p->hardirq_disable_ip = _THIS_IP_; 1209 p->hardirq_disable_ip = _THIS_IP_;
@@ -1290,7 +1225,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290#ifdef CONFIG_DEBUG_MUTEXES 1225#ifdef CONFIG_DEBUG_MUTEXES
1291 p->blocked_on = NULL; /* not blocked yet */ 1226 p->blocked_on = NULL; /* not blocked yet */
1292#endif 1227#endif
1293#ifdef CONFIG_MEMCG 1228#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1294 p->memcg_batch.do_batch = 0; 1229 p->memcg_batch.do_batch = 0;
1295 p->memcg_batch.memcg = NULL; 1230 p->memcg_batch.memcg = NULL;
1296#endif 1231#endif
@@ -1329,7 +1264,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1329 retval = copy_io(clone_flags, p); 1264 retval = copy_io(clone_flags, p);
1330 if (retval) 1265 if (retval)
1331 goto bad_fork_cleanup_namespaces; 1266 goto bad_fork_cleanup_namespaces;
1332 retval = copy_thread(clone_flags, stack_start, stack_size, p); 1267 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1333 if (retval) 1268 if (retval)
1334 goto bad_fork_cleanup_io; 1269 goto bad_fork_cleanup_io;
1335 1270
@@ -1361,7 +1296,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1361 INIT_LIST_HEAD(&p->pi_state_list); 1296 INIT_LIST_HEAD(&p->pi_state_list);
1362 p->pi_state_cache = NULL; 1297 p->pi_state_cache = NULL;
1363#endif 1298#endif
1364 uprobe_copy_process(p);
1365 /* 1299 /*
1366 * sigaltstack should be cleared when sharing the same VM 1300 * sigaltstack should be cleared when sharing the same VM
1367 */ 1301 */
@@ -1380,27 +1314,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1380 clear_all_latency_tracing(p); 1314 clear_all_latency_tracing(p);
1381 1315
1382 /* ok, now we should be set up.. */ 1316 /* ok, now we should be set up.. */
1383 if (clone_flags & CLONE_THREAD) 1317 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1384 p->exit_signal = -1;
1385 else if (clone_flags & CLONE_PARENT)
1386 p->exit_signal = current->group_leader->exit_signal;
1387 else
1388 p->exit_signal = (clone_flags & CSIGNAL);
1389
1390 p->pdeath_signal = 0; 1318 p->pdeath_signal = 0;
1391 p->exit_state = 0; 1319 p->exit_state = 0;
1392 1320
1393 p->nr_dirtied = 0;
1394 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1395 p->dirty_paused_when = 0;
1396
1397 /* 1321 /*
1398 * Ok, make it visible to the rest of the system. 1322 * Ok, make it visible to the rest of the system.
1399 * We dont wake it up yet. 1323 * We dont wake it up yet.
1400 */ 1324 */
1401 p->group_leader = p; 1325 p->group_leader = p;
1402 INIT_LIST_HEAD(&p->thread_group); 1326 INIT_LIST_HEAD(&p->thread_group);
1403 p->task_works = NULL; 1327
1328 /* Now that the task is set up, run cgroup callbacks if
1329 * necessary. We need to run them before the task is visible
1330 * on the tasklist. */
1331 cgroup_fork_callbacks(p);
1332 cgroup_callbacks_done = 1;
1404 1333
1405 /* Need tasklist lock for parent etc handling! */ 1334 /* Need tasklist lock for parent etc handling! */
1406 write_lock_irq(&tasklist_lock); 1335 write_lock_irq(&tasklist_lock);
@@ -1444,10 +1373,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1444 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1373 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1445 1374
1446 if (thread_group_leader(p)) { 1375 if (thread_group_leader(p)) {
1447 if (is_child_reaper(pid)) { 1376 if (is_child_reaper(pid))
1448 ns_of_pid(pid)->child_reaper = p; 1377 p->nsproxy->pid_ns->child_reaper = p;
1449 p->signal->flags |= SIGNAL_UNKILLABLE;
1450 }
1451 1378
1452 p->signal->leader_pid = pid; 1379 p->signal->leader_pid = pid;
1453 p->signal->tty = tty_kref_get(current->signal->tty); 1380 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1467,11 +1394,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1467 proc_fork_connector(p); 1394 proc_fork_connector(p);
1468 cgroup_post_fork(p); 1395 cgroup_post_fork(p);
1469 if (clone_flags & CLONE_THREAD) 1396 if (clone_flags & CLONE_THREAD)
1470 threadgroup_change_end(current); 1397 threadgroup_fork_read_unlock(current);
1471 perf_event_fork(p); 1398 perf_event_fork(p);
1472
1473 trace_task_newtask(p, clone_flags);
1474
1475 return p; 1399 return p;
1476 1400
1477bad_fork_free_pid: 1401bad_fork_free_pid:
@@ -1483,8 +1407,13 @@ bad_fork_cleanup_io:
1483bad_fork_cleanup_namespaces: 1407bad_fork_cleanup_namespaces:
1484 exit_task_namespaces(p); 1408 exit_task_namespaces(p);
1485bad_fork_cleanup_mm: 1409bad_fork_cleanup_mm:
1486 if (p->mm) 1410 if (p->mm) {
1411 task_lock(p);
1412 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1413 atomic_dec(&p->mm->oom_disable_count);
1414 task_unlock(p);
1487 mmput(p->mm); 1415 mmput(p->mm);
1416 }
1488bad_fork_cleanup_signal: 1417bad_fork_cleanup_signal:
1489 if (!(clone_flags & CLONE_THREAD)) 1418 if (!(clone_flags & CLONE_THREAD))
1490 free_signal_struct(p->signal); 1419 free_signal_struct(p->signal);
@@ -1505,8 +1434,8 @@ bad_fork_cleanup_policy:
1505bad_fork_cleanup_cgroup: 1434bad_fork_cleanup_cgroup:
1506#endif 1435#endif
1507 if (clone_flags & CLONE_THREAD) 1436 if (clone_flags & CLONE_THREAD)
1508 threadgroup_change_end(current); 1437 threadgroup_fork_read_unlock(current);
1509 cgroup_exit(p, 0); 1438 cgroup_exit(p, cgroup_callbacks_done);
1510 delayacct_tsk_free(p); 1439 delayacct_tsk_free(p);
1511 module_put(task_thread_info(p)->exec_domain->module); 1440 module_put(task_thread_info(p)->exec_domain->module);
1512bad_fork_cleanup_count: 1441bad_fork_cleanup_count:
@@ -1518,6 +1447,12 @@ fork_out:
1518 return ERR_PTR(retval); 1447 return ERR_PTR(retval);
1519} 1448}
1520 1449
1450noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1451{
1452 memset(regs, 0, sizeof(struct pt_regs));
1453 return regs;
1454}
1455
1521static inline void init_idle_pids(struct pid_link *links) 1456static inline void init_idle_pids(struct pid_link *links)
1522{ 1457{
1523 enum pid_type type; 1458 enum pid_type type;
@@ -1531,7 +1466,10 @@ static inline void init_idle_pids(struct pid_link *links)
1531struct task_struct * __cpuinit fork_idle(int cpu) 1466struct task_struct * __cpuinit fork_idle(int cpu)
1532{ 1467{
1533 struct task_struct *task; 1468 struct task_struct *task;
1534 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); 1469 struct pt_regs regs;
1470
1471 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1472 &init_struct_pid, 0);
1535 if (!IS_ERR(task)) { 1473 if (!IS_ERR(task)) {
1536 init_idle_pids(task->pids); 1474 init_idle_pids(task->pids);
1537 init_idle(task, cpu); 1475 init_idle(task, cpu);
@@ -1548,6 +1486,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1548 */ 1486 */
1549long do_fork(unsigned long clone_flags, 1487long do_fork(unsigned long clone_flags,
1550 unsigned long stack_start, 1488 unsigned long stack_start,
1489 struct pt_regs *regs,
1551 unsigned long stack_size, 1490 unsigned long stack_size,
1552 int __user *parent_tidptr, 1491 int __user *parent_tidptr,
1553 int __user *child_tidptr) 1492 int __user *child_tidptr)
@@ -1560,9 +1499,15 @@ long do_fork(unsigned long clone_flags,
1560 * Do some preliminary argument and permissions checking before we 1499 * Do some preliminary argument and permissions checking before we
1561 * actually start allocating stuff 1500 * actually start allocating stuff
1562 */ 1501 */
1563 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { 1502 if (clone_flags & CLONE_NEWUSER) {
1564 if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) 1503 if (clone_flags & CLONE_THREAD)
1565 return -EINVAL; 1504 return -EINVAL;
1505 /* hopefully this check will go away when userns support is
1506 * complete
1507 */
1508 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1509 !capable(CAP_SETGID))
1510 return -EPERM;
1566 } 1511 }
1567 1512
1568 /* 1513 /*
@@ -1571,7 +1516,7 @@ long do_fork(unsigned long clone_flags,
1571 * requested, no event is reported; otherwise, report if the event 1516 * requested, no event is reported; otherwise, report if the event
1572 * for the type of forking is enabled. 1517 * for the type of forking is enabled.
1573 */ 1518 */
1574 if (!(clone_flags & CLONE_UNTRACED)) { 1519 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1575 if (clone_flags & CLONE_VFORK) 1520 if (clone_flags & CLONE_VFORK)
1576 trace = PTRACE_EVENT_VFORK; 1521 trace = PTRACE_EVENT_VFORK;
1577 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1522 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1583,7 +1528,7 @@ long do_fork(unsigned long clone_flags,
1583 trace = 0; 1528 trace = 0;
1584 } 1529 }
1585 1530
1586 p = copy_process(clone_flags, stack_start, stack_size, 1531 p = copy_process(clone_flags, stack_start, regs, stack_size,
1587 child_tidptr, NULL, trace); 1532 child_tidptr, NULL, trace);
1588 /* 1533 /*
1589 * Do this prior waking up the new thread - the thread pointer 1534 * Do this prior waking up the new thread - the thread pointer
@@ -1602,9 +1547,18 @@ long do_fork(unsigned long clone_flags,
1602 if (clone_flags & CLONE_VFORK) { 1547 if (clone_flags & CLONE_VFORK) {
1603 p->vfork_done = &vfork; 1548 p->vfork_done = &vfork;
1604 init_completion(&vfork); 1549 init_completion(&vfork);
1605 get_task_struct(p);
1606 } 1550 }
1607 1551
1552 audit_finish_fork(p);
1553
1554 /*
1555 * We set PF_STARTING at creation in case tracing wants to
1556 * use this to distinguish a fully live task from one that
1557 * hasn't finished SIGSTOP raising yet. Now we clear it
1558 * and set the child going.
1559 */
1560 p->flags &= ~PF_STARTING;
1561
1608 wake_up_new_task(p); 1562 wake_up_new_task(p);
1609 1563
1610 /* forking complete and child started to run, tell ptracer */ 1564 /* forking complete and child started to run, tell ptracer */
@@ -1612,8 +1566,10 @@ long do_fork(unsigned long clone_flags,
1612 ptrace_event(trace, nr); 1566 ptrace_event(trace, nr);
1613 1567
1614 if (clone_flags & CLONE_VFORK) { 1568 if (clone_flags & CLONE_VFORK) {
1615 if (!wait_for_vfork_done(p, &vfork)) 1569 freezer_do_not_count();
1616 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); 1570 wait_for_completion(&vfork);
1571 freezer_count();
1572 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1617 } 1573 }
1618 } else { 1574 } else {
1619 nr = PTR_ERR(p); 1575 nr = PTR_ERR(p);
@@ -1621,58 +1577,6 @@ long do_fork(unsigned long clone_flags,
1621 return nr; 1577 return nr;
1622} 1578}
1623 1579
1624/*
1625 * Create a kernel thread.
1626 */
1627pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1628{
1629 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1630 (unsigned long)arg, NULL, NULL);
1631}
1632
1633#ifdef __ARCH_WANT_SYS_FORK
1634SYSCALL_DEFINE0(fork)
1635{
1636#ifdef CONFIG_MMU
1637 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1638#else
1639 /* can not support in nommu mode */
1640 return(-EINVAL);
1641#endif
1642}
1643#endif
1644
1645#ifdef __ARCH_WANT_SYS_VFORK
1646SYSCALL_DEFINE0(vfork)
1647{
1648 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1649 0, NULL, NULL);
1650}
1651#endif
1652
1653#ifdef __ARCH_WANT_SYS_CLONE
1654#ifdef CONFIG_CLONE_BACKWARDS
1655SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1656 int __user *, parent_tidptr,
1657 int, tls_val,
1658 int __user *, child_tidptr)
1659#elif defined(CONFIG_CLONE_BACKWARDS2)
1660SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1661 int __user *, parent_tidptr,
1662 int __user *, child_tidptr,
1663 int, tls_val)
1664#else
1665SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1666 int __user *, parent_tidptr,
1667 int __user *, child_tidptr,
1668 int, tls_val)
1669#endif
1670{
1671 return do_fork(clone_flags, newsp, 0,
1672 parent_tidptr, child_tidptr);
1673}
1674#endif
1675
1676#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1580#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1677#define ARCH_MIN_MMSTRUCT_ALIGN 0 1581#define ARCH_MIN_MMSTRUCT_ALIGN 0
1678#endif 1582#endif
@@ -1722,8 +1626,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
1722{ 1626{
1723 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1627 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1724 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1628 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1725 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| 1629 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1726 CLONE_NEWUSER|CLONE_NEWPID))
1727 return -EINVAL; 1630 return -EINVAL;
1728 /* 1631 /*
1729 * Not implemented, but pretend it works if there is nothing to 1632 * Not implemented, but pretend it works if there is nothing to
@@ -1790,40 +1693,19 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1790{ 1693{
1791 struct fs_struct *fs, *new_fs = NULL; 1694 struct fs_struct *fs, *new_fs = NULL;
1792 struct files_struct *fd, *new_fd = NULL; 1695 struct files_struct *fd, *new_fd = NULL;
1793 struct cred *new_cred = NULL;
1794 struct nsproxy *new_nsproxy = NULL; 1696 struct nsproxy *new_nsproxy = NULL;
1795 int do_sysvsem = 0; 1697 int do_sysvsem = 0;
1796 int err; 1698 int err;
1797 1699
1798 /* 1700 err = check_unshare_flags(unshare_flags);
1799 * If unsharing a user namespace must also unshare the thread. 1701 if (err)
1800 */ 1702 goto bad_unshare_out;
1801 if (unshare_flags & CLONE_NEWUSER) 1703
1802 unshare_flags |= CLONE_THREAD;
1803 /*
1804 * If unsharing a pid namespace must also unshare the thread.
1805 */
1806 if (unshare_flags & CLONE_NEWPID)
1807 unshare_flags |= CLONE_THREAD;
1808 /*
1809 * If unsharing a thread from a thread group, must also unshare vm.
1810 */
1811 if (unshare_flags & CLONE_THREAD)
1812 unshare_flags |= CLONE_VM;
1813 /*
1814 * If unsharing vm, must also unshare signal handlers.
1815 */
1816 if (unshare_flags & CLONE_VM)
1817 unshare_flags |= CLONE_SIGHAND;
1818 /* 1704 /*
1819 * If unsharing namespace, must also unshare filesystem information. 1705 * If unsharing namespace, must also unshare filesystem information.
1820 */ 1706 */
1821 if (unshare_flags & CLONE_NEWNS) 1707 if (unshare_flags & CLONE_NEWNS)
1822 unshare_flags |= CLONE_FS; 1708 unshare_flags |= CLONE_FS;
1823
1824 err = check_unshare_flags(unshare_flags);
1825 if (err)
1826 goto bad_unshare_out;
1827 /* 1709 /*
1828 * CLONE_NEWIPC must also detach from the undolist: after switching 1710 * CLONE_NEWIPC must also detach from the undolist: after switching
1829 * to a new ipc namespace, the semaphore arrays from the old 1711 * to a new ipc namespace, the semaphore arrays from the old
@@ -1837,15 +1719,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1837 err = unshare_fd(unshare_flags, &new_fd); 1719 err = unshare_fd(unshare_flags, &new_fd);
1838 if (err) 1720 if (err)
1839 goto bad_unshare_cleanup_fs; 1721 goto bad_unshare_cleanup_fs;
1840 err = unshare_userns(unshare_flags, &new_cred); 1722 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1841 if (err) 1723 if (err)
1842 goto bad_unshare_cleanup_fd; 1724 goto bad_unshare_cleanup_fd;
1843 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1844 new_cred, new_fs);
1845 if (err)
1846 goto bad_unshare_cleanup_cred;
1847 1725
1848 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { 1726 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
1849 if (do_sysvsem) { 1727 if (do_sysvsem) {
1850 /* 1728 /*
1851 * CLONE_SYSVSEM is equivalent to sys_exit(). 1729 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1878,20 +1756,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1878 } 1756 }
1879 1757
1880 task_unlock(current); 1758 task_unlock(current);
1881
1882 if (new_cred) {
1883 /* Install the new user namespace */
1884 commit_creds(new_cred);
1885 new_cred = NULL;
1886 }
1887 } 1759 }
1888 1760
1889 if (new_nsproxy) 1761 if (new_nsproxy)
1890 put_nsproxy(new_nsproxy); 1762 put_nsproxy(new_nsproxy);
1891 1763
1892bad_unshare_cleanup_cred:
1893 if (new_cred)
1894 put_cred(new_cred);
1895bad_unshare_cleanup_fd: 1764bad_unshare_cleanup_fd:
1896 if (new_fd) 1765 if (new_fd)
1897 put_files_struct(new_fd); 1766 put_files_struct(new_fd);
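sys_unshare() above silently widens CLONE_NEWNS to CLONE_FS before doing any work. A hypothetical userspace counterpart (not from the patch; needs CAP_SYS_ADMIN, and the tmpfs mount on /mnt is only a demo) that exercises exactly that path:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
        /* CLONE_NEWNS implies CLONE_FS inside sys_unshare(), per the hunk above */
        if (unshare(CLONE_NEWNS) < 0) {
                perror("unshare(CLONE_NEWNS)");
                return 1;
        }

        /* Many distros mount / as shared; make our copy private so changes stay local */
        mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);

        /* This mount is now visible only inside this process's mount namespace */
        if (mount("none", "/mnt", "tmpfs", 0, NULL) < 0)
                perror("mount");

        return 0;
}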
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efb..7b01de98bb6 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,159 +6,161 @@
6 6
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/suspend.h> 8#include <linux/suspend.h>
9#include <linux/export.h> 9#include <linux/module.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
13 12
14/* total number of freezing conditions in effect */ 13/*
15atomic_t system_freezing_cnt = ATOMIC_INIT(0); 14 * freezing is complete, mark current process as frozen
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
33 */ 15 */
34bool freezing_slow_path(struct task_struct *p) 16static inline void frozen_process(void)
35{ 17{
36 if (p->flags & PF_NOFREEZE) 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
37 return false; 19 current->flags |= PF_FROZEN;
38 20 smp_wmb();
39 if (pm_nosig_freezing || cgroup_freezing(p)) 21 }
40 return true; 22 clear_freeze_flag(current);
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
46} 23}
47EXPORT_SYMBOL(freezing_slow_path);
48 24
49/* Refrigerator is place where frozen processes are stored :-). */ 25/* Refrigerator is place where frozen processes are stored :-). */
50bool __refrigerator(bool check_kthr_stop) 26void refrigerator(void)
51{ 27{
52 /* Hmm, should we be allowed to suspend when there are realtime 28 /* Hmm, should we be allowed to suspend when there are realtime
53 processes around? */ 29 processes around? */
54 bool was_frozen = false; 30 long save;
55 long save = current->state; 31
56 32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
57 pr_debug("%s entered refrigerator\n", current->comm); 41 pr_debug("%s entered refrigerator\n", current->comm);
58 42
59 for (;;) { 43 spin_lock_irq(&current->sighand->siglock);
60 set_current_state(TASK_UNINTERRUPTIBLE); 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
61 46
62 spin_lock_irq(&freezer_lock); 47 /* prevent accounting of that task to load */
63 current->flags |= PF_FROZEN; 48 current->flags |= PF_FREEZING;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68 49
69 if (!(current->flags & PF_FROZEN)) 50 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current))
70 break; 53 break;
71 was_frozen = true;
72 schedule(); 54 schedule();
73 } 55 }
74 56
75 pr_debug("%s left refrigerator\n", current->comm); 57 /* Remove the accounting blocker */
76 58 current->flags &= ~PF_FREEZING;
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83 59
84 return was_frozen; 60 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save);
85} 62}
86EXPORT_SYMBOL(__refrigerator); 63EXPORT_SYMBOL(refrigerator);
87 64
88static void fake_signal_wake_up(struct task_struct *p) 65static void fake_signal_wake_up(struct task_struct *p)
89{ 66{
90 unsigned long flags; 67 unsigned long flags;
91 68
92 if (lock_task_sighand(p, &flags)) { 69 spin_lock_irqsave(&p->sighand->siglock, flags);
93 signal_wake_up(p, 0); 70 signal_wake_up(p, 0);
94 unlock_task_sighand(p, &flags); 71 spin_unlock_irqrestore(&p->sighand->siglock, flags);
95 }
96} 72}
97 73
98/** 74/**
99 * freeze_task - send a freeze request to given task 75 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 76 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the
78 * PF_FREEZER_NOSIG flag unset
79 * Return value: 'false', if @sig_only is set and the task has
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
101 * 81 *
102 * If @p is freezing, the freeze request is sent either by sending a fake 82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel 83 * either sending a fake signal to it or waking it up, depending on whether
104 * thread). 84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
105 * 85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
106 * RETURNS: 86 * TIF_FREEZE flag will not be set.
107 * %false, if @p is not freezing or already frozen; %true, otherwise
108 */ 87 */
109bool freeze_task(struct task_struct *p) 88bool freeze_task(struct task_struct *p, bool sig_only)
110{ 89{
111 unsigned long flags; 90 /*
112 91 * We first check if the task is freezing and next if it has already
113 spin_lock_irqsave(&freezer_lock, flags); 92 * been frozen to avoid the race with frozen_process() which first marks
114 if (!freezing(p) || frozen(p)) { 93 * the task as frozen and next clears its TIF_FREEZE.
115 spin_unlock_irqrestore(&freezer_lock, flags); 94 */
116 return false; 95 if (!freezing(p)) {
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
117 } 104 }
118 105
119 if (!(p->flags & PF_KTHREAD)) 106 if (should_send_signal(p)) {
120 fake_signal_wake_up(p); 107 fake_signal_wake_up(p);
121 else 108 /*
109 * fake_signal_wake_up() goes through p's scheduler
110 * lock and guarantees that TASK_STOPPED/TRACED ->
111 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks().
113 */
114 } else if (sig_only) {
115 return false;
116 } else {
122 wake_up_state(p, TASK_INTERRUPTIBLE); 117 wake_up_state(p, TASK_INTERRUPTIBLE);
118 }
123 119
124 spin_unlock_irqrestore(&freezer_lock, flags);
125 return true; 120 return true;
126} 121}
127 122
128void __thaw_task(struct task_struct *p) 123void cancel_freezing(struct task_struct *p)
129{ 124{
130 unsigned long flags; 125 unsigned long flags;
131 126
132 /* 127 if (freezing(p)) {
133 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to 128 pr_debug(" clean up: %s\n", p->comm);
134 * be visible to @p as waking up implies wmb. Waking up inside 129 clear_freeze_flag(p);
135 * freezer_lock also prevents wakeups from leaking outside 130 spin_lock_irqsave(&p->sighand->siglock, flags);
136 * refrigerator. 131 recalc_sigpending_and_wake(p);
137 */ 132 spin_unlock_irqrestore(&p->sighand->siglock, flags);
138 spin_lock_irqsave(&freezer_lock, flags); 133 }
139 if (frozen(p))
140 wake_up_process(p);
141 spin_unlock_irqrestore(&freezer_lock, flags);
142} 134}
143 135
144/** 136static int __thaw_process(struct task_struct *p)
145 * set_freezable - make %current freezable 137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144}
145
146/*
147 * Wake up a frozen process
146 * 148 *
147 * Mark %current freezable and enter refrigerator if necessary. 149 * task_lock() is needed to prevent the race with refrigerator() which may
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
148 */ 154 */
149bool set_freezable(void) 155int thaw_process(struct task_struct *p)
150{ 156{
151 might_sleep(); 157 task_lock(p);
152 158 if (__thaw_process(p) == 1) {
153 /* 159 task_unlock(p);
154 * Modify flags while holding freezer_lock. This ensures the 160 wake_up_process(p);
155 * freezer notices that we aren't frozen yet or the freezing 161 return 1;
156 * condition is visible to try_to_freeze() below. 162 }
157 */ 163 task_unlock(p);
158 spin_lock_irq(&freezer_lock); 164 return 0;
159 current->flags &= ~PF_NOFREEZE;
160 spin_unlock_irq(&freezer_lock);
161
162 return try_to_freeze();
163} 165}
164EXPORT_SYMBOL(set_freezable); 166EXPORT_SYMBOL(thaw_process);
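The refrigerator above is where a freezable kernel thread parks itself via try_to_freeze(). A hedged sketch of the usual caller pattern (a made-up demo module, not from this patch; worker_fn and freezer-demo are invented names), which works against either side of this diff since both export set_freezable()/try_to_freeze():

#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
        set_freezable();                        /* drop PF_NOFREEZE for this kthread */

        while (!kthread_should_stop()) {
                try_to_freeze();                /* enters the refrigerator while freezing() */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}

static int __init freezer_demo_init(void)
{
        worker = kthread_run(worker_fn, NULL, "freezer-demo");
        return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit freezer_demo_exit(void)
{
        kthread_stop(worker);
}

module_init(freezer_demo_init);
module_exit(freezer_demo_exit);
MODULE_LICENSE("GPL");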
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca00..e6160fa842e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,11 +55,10 @@
55#include <linux/pagemap.h> 55#include <linux/pagemap.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/signal.h> 57#include <linux/signal.h>
58#include <linux/export.h> 58#include <linux/module.h>
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h>
63 62
64#include <asm/futex.h> 63#include <asm/futex.h>
65 64
@@ -716,7 +715,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 715 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 716 struct task_struct *task, int set_waiters)
718{ 717{
719 int lock_taken, ret, force_take = 0; 718 int lock_taken, ret, ownerdied = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 719 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 720
722retry: 721retry:
@@ -755,15 +754,17 @@ retry:
755 newval = curval | FUTEX_WAITERS; 754 newval = curval | FUTEX_WAITERS;
756 755
757 /* 756 /*
758 * Should we force take the futex? See below. 757 * There are two cases, where a futex might have no owner (the
758 * owner TID is 0): OWNER_DIED. We take over the futex in this
759 * case. We also do an unconditional take over, when the owner
760 * of the futex died.
761 *
762 * This is safe as we are protected by the hash bucket lock !
759 */ 763 */
760 if (unlikely(force_take)) { 764 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
761 /* 765 /* Keep the OWNER_DIED bit */
762 * Keep the OWNER_DIED and the WAITERS bit and set the
763 * new TID value.
764 */
765 newval = (curval & ~FUTEX_TID_MASK) | vpid; 766 newval = (curval & ~FUTEX_TID_MASK) | vpid;
766 force_take = 0; 767 ownerdied = 0;
767 lock_taken = 1; 768 lock_taken = 1;
768 } 769 }
769 770
@@ -773,7 +774,7 @@ retry:
773 goto retry; 774 goto retry;
774 775
775 /* 776 /*
776 * We took the lock due to forced take over. 777 * We took the lock due to owner died take over.
777 */ 778 */
778 if (unlikely(lock_taken)) 779 if (unlikely(lock_taken))
779 return 1; 780 return 1;
@@ -788,25 +789,20 @@ retry:
788 switch (ret) { 789 switch (ret) {
789 case -ESRCH: 790 case -ESRCH:
790 /* 791 /*
791 * We failed to find an owner for this 792 * No owner found for this futex. Check if the
792 * futex. So we have no pi_state to block 793 * OWNER_DIED bit is set to figure out whether
793 * on. This can happen in two cases: 794 * this is a robust futex or not.
794 *
795 * 1) The owner died
796 * 2) A stale FUTEX_WAITERS bit
797 *
798 * Re-read the futex value.
799 */ 795 */
800 if (get_futex_value_locked(&curval, uaddr)) 796 if (get_futex_value_locked(&curval, uaddr))
801 return -EFAULT; 797 return -EFAULT;
802 798
803 /* 799 /*
804 * If the owner died or we have a stale 800 * We simply start over in case of a robust
805 * WAITERS bit the owner TID in the user space 801 * futex. The code above will take the futex
806 * futex is 0. 802 * and return happy.
807 */ 803 */
808 if (!(curval & FUTEX_TID_MASK)) { 804 if (curval & FUTEX_OWNER_DIED) {
809 force_take = 1; 805 ownerdied = 1;
810 goto retry; 806 goto retry;
811 } 807 }
812 default: 808 default:
@@ -843,9 +839,6 @@ static void wake_futex(struct futex_q *q)
843{ 839{
844 struct task_struct *p = q->task; 840 struct task_struct *p = q->task;
845 841
846 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
847 return;
848
849 /* 842 /*
850 * We set q->lock_ptr = NULL _before_ we wake up the task. If 843 * We set q->lock_ptr = NULL _before_ we wake up the task. If
851 * a non-futex wake up happens on another CPU then the task 844 * a non-futex wake up happens on another CPU then the task
@@ -873,7 +866,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
873{ 866{
874 struct task_struct *new_owner; 867 struct task_struct *new_owner;
875 struct futex_pi_state *pi_state = this->pi_state; 868 struct futex_pi_state *pi_state = this->pi_state;
876 u32 uninitialized_var(curval), newval; 869 u32 curval, newval;
877 870
878 if (!pi_state) 871 if (!pi_state)
879 return -EINVAL; 872 return -EINVAL;
@@ -935,7 +928,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
935 928
936static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 929static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
937{ 930{
938 u32 uninitialized_var(oldval); 931 u32 oldval;
939 932
940 /* 933 /*
941 * There is no waiter, so we unlock the futex. The owner died 934 * There is no waiter, so we unlock the futex. The owner died
@@ -1081,10 +1074,6 @@ retry_private:
1081 1074
1082 plist_for_each_entry_safe(this, next, head, list) { 1075 plist_for_each_entry_safe(this, next, head, list) {
1083 if (match_futex (&this->key, &key1)) { 1076 if (match_futex (&this->key, &key1)) {
1084 if (this->pi_state || this->rt_waiter) {
1085 ret = -EINVAL;
1086 goto out_unlock;
1087 }
1088 wake_futex(this); 1077 wake_futex(this);
1089 if (++ret >= nr_wake) 1078 if (++ret >= nr_wake)
1090 break; 1079 break;
@@ -1097,10 +1086,6 @@ retry_private:
1097 op_ret = 0; 1086 op_ret = 0;
1098 plist_for_each_entry_safe(this, next, head, list) { 1087 plist_for_each_entry_safe(this, next, head, list) {
1099 if (match_futex (&this->key, &key2)) { 1088 if (match_futex (&this->key, &key2)) {
1100 if (this->pi_state || this->rt_waiter) {
1101 ret = -EINVAL;
1102 goto out_unlock;
1103 }
1104 wake_futex(this); 1089 wake_futex(this);
1105 if (++op_ret >= nr_wake2) 1090 if (++op_ret >= nr_wake2)
1106 break; 1091 break;
@@ -1109,7 +1094,6 @@ retry_private:
1109 ret += op_ret; 1094 ret += op_ret;
1110 } 1095 }
1111 1096
1112out_unlock:
1113 double_unlock_hb(hb1, hb2); 1097 double_unlock_hb(hb1, hb2);
1114out_put_keys: 1098out_put_keys:
1115 put_futex_key(&key2); 1099 put_futex_key(&key2);
@@ -1399,13 +1383,9 @@ retry_private:
1399 /* 1383 /*
1400 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1384 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1401 * be paired with each other and no other futex ops. 1385 * be paired with each other and no other futex ops.
1402 *
1403 * We should never be requeueing a futex_q with a pi_state,
1404 * which is awaiting a futex_unlock_pi().
1405 */ 1386 */
1406 if ((requeue_pi && !this->rt_waiter) || 1387 if ((requeue_pi && !this->rt_waiter) ||
1407 (!requeue_pi && this->rt_waiter) || 1388 (!requeue_pi && this->rt_waiter)) {
1408 this->pi_state) {
1409 ret = -EINVAL; 1389 ret = -EINVAL;
1410 break; 1390 break;
1411 } 1391 }
@@ -1608,7 +1588,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1608 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1588 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1609 struct futex_pi_state *pi_state = q->pi_state; 1589 struct futex_pi_state *pi_state = q->pi_state;
1610 struct task_struct *oldowner = pi_state->owner; 1590 struct task_struct *oldowner = pi_state->owner;
1611 u32 uval, uninitialized_var(curval), newval; 1591 u32 uval, curval, newval;
1612 int ret; 1592 int ret;
1613 1593
1614 /* Owner died? */ 1594 /* Owner died? */
@@ -1825,7 +1805,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1825 * 1805 *
1826 * Returns: 1806 * Returns:
1827 * 0 - uaddr contains val and hb has been locked 1807 * 0 - uaddr contains val and hb has been locked
1828 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1808 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1829 */ 1809 */
1830static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1810static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1831 struct futex_q *q, struct futex_hash_bucket **hb) 1811 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2250,11 +2230,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2250 * @uaddr2: the pi futex we will take prior to returning to user-space 2230 * @uaddr2: the pi futex we will take prior to returning to user-space
2251 * 2231 *
2252 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2232 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2253 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake 2233 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2254 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to 2234 * complete the acquisition of the rt_mutex prior to returning to userspace.
2255 * userspace. This ensures the rt_mutex maintains an owner when it has waiters; 2235 * This ensures the rt_mutex maintains an owner when it has waiters; without
2256 * without one, the pi logic would not know which task to boost/deboost, if 2236 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2257 * there was a need to. 2237 * need to.
2258 * 2238 *
2259 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2239 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2260 * via the following: 2240 * via the following:
@@ -2291,9 +2271,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2291 struct futex_q q = futex_q_init; 2271 struct futex_q q = futex_q_init;
2292 int res, ret; 2272 int res, ret;
2293 2273
2294 if (uaddr == uaddr2)
2295 return -EINVAL;
2296
2297 if (!bitset) 2274 if (!bitset)
2298 return -EINVAL; 2275 return -EINVAL;
2299 2276
@@ -2365,7 +2342,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2365 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2342 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2366 * the pi_state. 2343 * the pi_state.
2367 */ 2344 */
2368 WARN_ON(!q.pi_state); 2345 WARN_ON(!&q.pi_state);
2369 pi_mutex = &q.pi_state->pi_mutex; 2346 pi_mutex = &q.pi_state->pi_mutex;
2370 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2347 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2371 debug_rt_mutex_free_waiter(&rt_waiter); 2348 debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2392,7 +2369,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2392 * fault, unlock the rt_mutex and return the fault to userspace. 2369 * fault, unlock the rt_mutex and return the fault to userspace.
2393 */ 2370 */
2394 if (ret == -EFAULT) { 2371 if (ret == -EFAULT) {
2395 if (pi_mutex && rt_mutex_owner(pi_mutex) == current) 2372 if (rt_mutex_owner(pi_mutex) == current)
2396 rt_mutex_unlock(pi_mutex); 2373 rt_mutex_unlock(pi_mutex);
2397 } else if (ret == -EINTR) { 2374 } else if (ret == -EINTR) {
2398 /* 2375 /*
@@ -2466,31 +2443,40 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2466{ 2443{
2467 struct robust_list_head __user *head; 2444 struct robust_list_head __user *head;
2468 unsigned long ret; 2445 unsigned long ret;
2469 struct task_struct *p; 2446 const struct cred *cred = current_cred(), *pcred;
2470 2447
2471 if (!futex_cmpxchg_enabled) 2448 if (!futex_cmpxchg_enabled)
2472 return -ENOSYS; 2449 return -ENOSYS;
2473 2450
2474 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2475
2476 rcu_read_lock();
2477
2478 ret = -ESRCH;
2479 if (!pid) 2451 if (!pid)
2480 p = current; 2452 head = current->robust_list;
2481 else { 2453 else {
2454 struct task_struct *p;
2455
2456 ret = -ESRCH;
2457 rcu_read_lock();
2482 p = find_task_by_vpid(pid); 2458 p = find_task_by_vpid(pid);
2483 if (!p) 2459 if (!p)
2484 goto err_unlock; 2460 goto err_unlock;
2461 ret = -EPERM;
2462 pcred = __task_cred(p);
2463 /* If victim is in different user_ns, then uids are not
2464 comparable, so we must have CAP_SYS_PTRACE */
2465 if (cred->user->user_ns != pcred->user->user_ns) {
2466 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2467 goto err_unlock;
2468 goto ok;
2469 }
2470 /* If victim is in same user_ns, then uids are comparable */
2471 if (cred->euid != pcred->euid &&
2472 cred->euid != pcred->uid &&
2473 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2474 goto err_unlock;
2475ok:
2476 head = p->robust_list;
2477 rcu_read_unlock();
2485 } 2478 }
2486 2479
2487 ret = -EPERM;
2488 if (!ptrace_may_access(p, PTRACE_MODE_READ))
2489 goto err_unlock;
2490
2491 head = p->robust_list;
2492 rcu_read_unlock();
2493
2494 if (put_user(sizeof(*head), len_ptr)) 2480 if (put_user(sizeof(*head), len_ptr))
2495 return -EFAULT; 2481 return -EFAULT;
2496 return put_user(head, head_ptr); 2482 return put_user(head, head_ptr);
@@ -2507,7 +2493,7 @@ err_unlock:
2507 */ 2493 */
2508int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2494int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2509{ 2495{
2510 u32 uval, uninitialized_var(nval), mval; 2496 u32 uval, nval, mval;
2511 2497
2512retry: 2498retry:
2513 if (get_user(uval, uaddr)) 2499 if (get_user(uval, uaddr))
@@ -2642,7 +2628,7 @@ void exit_robust_list(struct task_struct *curr)
2642long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2643 u32 __user *uaddr2, u32 val2, u32 val3) 2629 u32 __user *uaddr2, u32 val2, u32 val3)
2644{ 2630{
2645 int cmd = op & FUTEX_CMD_MASK; 2631 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2646 unsigned int flags = 0; 2632 unsigned int flags = 0;
2647 2633
2648 if (!(op & FUTEX_PRIVATE_FLAG)) 2634 if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2655,44 +2641,49 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2655 } 2641 }
2656 2642
2657 switch (cmd) { 2643 switch (cmd) {
2658 case FUTEX_LOCK_PI:
2659 case FUTEX_UNLOCK_PI:
2660 case FUTEX_TRYLOCK_PI:
2661 case FUTEX_WAIT_REQUEUE_PI:
2662 case FUTEX_CMP_REQUEUE_PI:
2663 if (!futex_cmpxchg_enabled)
2664 return -ENOSYS;
2665 }
2666
2667 switch (cmd) {
2668 case FUTEX_WAIT: 2644 case FUTEX_WAIT:
2669 val3 = FUTEX_BITSET_MATCH_ANY; 2645 val3 = FUTEX_BITSET_MATCH_ANY;
2670 case FUTEX_WAIT_BITSET: 2646 case FUTEX_WAIT_BITSET:
2671 return futex_wait(uaddr, flags, val, timeout, val3); 2647 ret = futex_wait(uaddr, flags, val, timeout, val3);
2648 break;
2672 case FUTEX_WAKE: 2649 case FUTEX_WAKE:
2673 val3 = FUTEX_BITSET_MATCH_ANY; 2650 val3 = FUTEX_BITSET_MATCH_ANY;
2674 case FUTEX_WAKE_BITSET: 2651 case FUTEX_WAKE_BITSET:
2675 return futex_wake(uaddr, flags, val, val3); 2652 ret = futex_wake(uaddr, flags, val, val3);
2653 break;
2676 case FUTEX_REQUEUE: 2654 case FUTEX_REQUEUE:
2677 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2655 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656 break;
2678 case FUTEX_CMP_REQUEUE: 2657 case FUTEX_CMP_REQUEUE:
2679 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2658 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659 break;
2680 case FUTEX_WAKE_OP: 2660 case FUTEX_WAKE_OP:
2681 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2661 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662 break;
2682 case FUTEX_LOCK_PI: 2663 case FUTEX_LOCK_PI:
2683 return futex_lock_pi(uaddr, flags, val, timeout, 0); 2664 if (futex_cmpxchg_enabled)
2665 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666 break;
2684 case FUTEX_UNLOCK_PI: 2667 case FUTEX_UNLOCK_PI:
2685 return futex_unlock_pi(uaddr, flags); 2668 if (futex_cmpxchg_enabled)
2669 ret = futex_unlock_pi(uaddr, flags);
2670 break;
2686 case FUTEX_TRYLOCK_PI: 2671 case FUTEX_TRYLOCK_PI:
2687 return futex_lock_pi(uaddr, flags, 0, timeout, 1); 2672 if (futex_cmpxchg_enabled)
2673 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674 break;
2688 case FUTEX_WAIT_REQUEUE_PI: 2675 case FUTEX_WAIT_REQUEUE_PI:
2689 val3 = FUTEX_BITSET_MATCH_ANY; 2676 val3 = FUTEX_BITSET_MATCH_ANY;
2690 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2677 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2691 uaddr2); 2678 uaddr2);
2679 break;
2692 case FUTEX_CMP_REQUEUE_PI: 2680 case FUTEX_CMP_REQUEUE_PI:
2693 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2681 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682 break;
2683 default:
2684 ret = -ENOSYS;
2694 } 2685 }
2695 return -ENOSYS; 2686 return ret;
2696} 2687}
2697 2688
2698 2689
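For reference, the futex_wait()/futex_wake() paths dispatched by do_futex() above are reached from userspace through the raw futex syscall. A minimal hypothetical wait/wake handshake (demo names invented, build with gcc -pthread):

#define _GNU_SOURCE
#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int futex_word;                          /* 0 = not ready, 1 = ready */

static long futex(int *uaddr, int op, int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
        /* FUTEX_WAIT sleeps only while the word still holds the expected value 0 */
        while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == 0)
                futex(&futex_word, FUTEX_WAIT, 0);
        puts("waiter woken");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        sleep(1);
        __atomic_store_n(&futex_word, 1, __ATOMIC_RELEASE);
        futex(&futex_word, FUTEX_WAKE, 1);      /* lands in futex_wake() above */
        pthread_join(t, NULL);
        return 0;
}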
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005f..5f9e689dc8f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,7 +10,6 @@
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h>
14 13
15#include <asm/uaccess.h> 14#include <asm/uaccess.h>
16 15
@@ -137,31 +136,40 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
137{ 136{
138 struct compat_robust_list_head __user *head; 137 struct compat_robust_list_head __user *head;
139 unsigned long ret; 138 unsigned long ret;
140 struct task_struct *p; 139 const struct cred *cred = current_cred(), *pcred;
141 140
142 if (!futex_cmpxchg_enabled) 141 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 142 return -ENOSYS;
144 143
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock();
148
149 ret = -ESRCH;
150 if (!pid) 144 if (!pid)
151 p = current; 145 head = current->compat_robust_list;
152 else { 146 else {
147 struct task_struct *p;
148
149 ret = -ESRCH;
150 rcu_read_lock();
153 p = find_task_by_vpid(pid); 151 p = find_task_by_vpid(pid);
154 if (!p) 152 if (!p)
155 goto err_unlock; 153 goto err_unlock;
154 ret = -EPERM;
155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
164 if (cred->euid != pcred->euid &&
165 cred->euid != pcred->uid &&
166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
167 goto err_unlock;
168ok:
169 head = p->compat_robust_list;
170 rcu_read_unlock();
156 } 171 }
157 172
158 ret = -EPERM;
159 if (!ptrace_may_access(p, PTRACE_MODE_READ))
160 goto err_unlock;
161
162 head = p->compat_robust_list;
163 rcu_read_unlock();
164
165 if (put_user(sizeof(*head), len_ptr)) 173 if (put_user(sizeof(*head), len_ptr))
166 return -EFAULT; 174 return -EFAULT;
167 return put_user(ptr_to_compat(head), head_ptr); 175 return put_user(ptr_to_compat(head), head_ptr);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc..824b741925b 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
@@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49config GCOV_CTORS
50 string
51 depends on CONSTRUCTORS
52 default ".init_array" if ARM && AEABI
53 default ".ctors"
54
49endmenu 55endmenu
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb426003..d753d1152b7 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter)
297} 297}
298 298
299/* Mapping of logical record number to actual file content. */ 299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0 300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1 301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2 302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3 303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4 304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5 305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6 306#define RECORD_FUNCTION_CHECK_LINE 6
307#define RECORD_COUNT_TAG 7 307#define RECORD_FUNCTION_CHECK_CFG 7
308#define RECORD_COUNT_LEN 8 308#define RECORD_FUNCTION_NAME_LEN 8
309#define RECORD_COUNT 9 309#define RECORD_FUNCTION_NAME 9
310#define RECORD_COUNT_TAG 10
311#define RECORD_COUNT_LEN 11
312#define RECORD_COUNT 12
313
314/* Return length of string encoded in GCOV format. */
315static size_t
316sizeof_str(const char *str)
317{
318 size_t len;
319 len = (str) ? strlen(str) : 0;
320 if (len == 0)
321 return 1;
322 return 1 + ((len + 4) >> 2);
323}
310 324
311/** 325/**
312 * gcov_iter_next - advance file iterator to next logical record 326 * gcov_iter_next - advance file iterator to next logical record
@@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
323 case RECORD_FUNCTON_TAG_LEN: 337 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT: 338 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG: 339 case RECORD_COUNT_TAG:
340 case RECORD_FUNCTION_CHECK_LINE:
341 case RECORD_FUNCTION_CHECK_CFG:
342 case RECORD_FUNCTION_NAME_LEN:
326 /* Advance to next record */ 343 /* Advance to next record */
327 iter->record++; 344 iter->record++;
328 break; 345 break;
@@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter)
332 /* fall through */ 349 /* fall through */
333 case RECORD_COUNT_LEN: 350 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) { 351 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9; 352 iter->record = 12;
336 break; 353 break;
337 } 354 }
338 /* Advance to next counter type */ 355 /* Advance to next counter type */
@@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
340 iter->count = 0; 357 iter->count = 0;
341 iter->type++; 358 iter->type++;
342 /* fall through */ 359 /* fall through */
343 case RECORD_FUNCTION_CHECK: 360 case RECORD_FUNCTION_NAME:
344 if (iter->type < iter->num_types) { 361 if (iter->type < iter->num_types) {
345 iter->record = 7; 362 iter->record = 10;
346 break; 363 break;
347 } 364 }
348 /* Advance to next function */ 365 /* Advance to next function */
@@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
395 data[1] = (v >> 32); 412 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data)); 413 return seq_write(seq, data, sizeof(data));
397} 414}
415/**
416 * seq_write_gcov_str - write string in gcov format to seq_file
417 * @seq: seq_file handle
418 * @str: string to be stored
419 *
420 * Number format defined by gcc: numbers are recorded in the 32 bit
421 * unsigned binary form of the endianness of the machine generating the
422 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
423 * first.
424 */
425static int seq_write_gcov_str(struct seq_file *seq, const char *str)
426{
427 if (str) {
428 size_t len;
429 int str_off;
430 u32 data;
431 len = strlen(str);
432 for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) {
433 memcpy(&data, (str + str_off * 4), 4);
434 seq_write(seq, &data, sizeof(data));
435 }
436 data = 0;
437 memcpy(&data, (str + str_off * 4), (len - str_off * 4));
438 return seq_write(seq, &data, sizeof(data));
439 } else {
440 return 0;
441 }
442}
398 443
399/** 444/**
400 * gcov_iter_write - write data for current pos to seq_file 445 * gcov_iter_write - write data for current pos to seq_file
@@ -421,13 +466,24 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); 466 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break; 467 break;
423 case RECORD_FUNCTON_TAG_LEN: 468 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2); 469 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH +
470 (sizeof_str(get_func(iter)->name)));
425 break; 471 break;
426 case RECORD_FUNCTION_IDENT: 472 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident); 473 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break; 474 break;
429 case RECORD_FUNCTION_CHECK: 475 case RECORD_FUNCTION_CHECK_LINE:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); 476 rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum);
477 break;
478 case RECORD_FUNCTION_CHECK_CFG:
479 rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum);
480 break;
481 case RECORD_FUNCTION_NAME_LEN:
482 rc = seq_write_gcov_u32(seq,
483 (sizeof_str(get_func(iter)->name) - 1));
484 break;
485 case RECORD_FUNCTION_NAME:
486 rc = seq_write_gcov_str(seq, get_func(iter)->name);
431 break; 487 break;
432 case RECORD_COUNT_TAG: 488 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq, 489 rc = seq_write_gcov_u32(seq,
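sizeof_str() added above counts 32-bit words: one length word plus the string padded with at least one NUL up to a 4-byte boundary, which is what seq_write_gcov_str() then emits. A standalone check of that arithmetic (hypothetical, outside the kernel):

#include <stdio.h>
#include <string.h>

static size_t sizeof_str(const char *str)
{
        size_t len = str ? strlen(str) : 0;

        if (len == 0)
                return 1;                       /* just the length word */
        return 1 + ((len + 4) >> 2);            /* length word + NUL-padded payload words */
}

int main(void)
{
        /* ""     -> 1 word  (length only)
         * "abc"  -> 2 words (length + "abc\0")
         * "abcd" -> 3 words (length + "abcd" + a full pad word of NULs) */
        printf("%zu %zu %zu\n", sizeof_str(""), sizeof_str("abc"), sizeof_str("abcd"));
        return 0;
}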
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a..040c6980df0 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,9 +21,10 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5 24#define GCOV_COUNTERS 10
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_FUNCTION_LENGTH 3
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 28#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \ 29#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) 30 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
@@ -34,10 +35,38 @@ typedef long gcov_type;
34typedef long long gcov_type; 35typedef long long gcov_type;
35#endif 36#endif
36 37
38/*
39 * Source module info. The data structure is used in both runtime and
40 * profile-use phase.
41 */
42struct gcov_module_info {
43 unsigned int ident;
44/*
45 * This is overloaded to mean two things:
46 * (1) means FDO/LIPO in instrumented binary.
47 * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use.
48 */
49 unsigned int is_primary;
50 unsigned int is_exported;
51 unsigned int lang;
52 char *da_filename;
53 char *source_filename;
54 unsigned int num_quote_paths;
55 unsigned int num_bracket_paths;
56 unsigned int num_cpp_defines;
57 unsigned int num_cpp_includes;
58 unsigned int num_cl_args;
59 char *string_array[1];
60};
61
62
37/** 63/**
38 * struct gcov_fn_info - profiling meta data per function 64 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier 65 * @ident: object file-unique function identifier
40 * @checksum: function checksum 66 * @lineno_checksum: function lineno checksum
67 * @cfg_checksum: function cfg checksum
68 * @dc_offset: direct call offset
69 * @name: function name
41 * @n_ctrs: number of values per counter type belonging to this function 70 * @n_ctrs: number of values per counter type belonging to this function
42 * 71 *
43 * This data is generated by gcc during compilation and doesn't change 72 * This data is generated by gcc during compilation and doesn't change
@@ -45,7 +74,10 @@ typedef long long gcov_type;
45 */ 74 */
46struct gcov_fn_info { 75struct gcov_fn_info {
47 unsigned int ident; 76 unsigned int ident;
48 unsigned int checksum; 77 unsigned int lineno_checksum;
78 unsigned int cfg_checksum;
79 unsigned int dc_offset;
80 const char *name;
49 unsigned int n_ctrs[0]; 81 unsigned int n_ctrs[0];
50}; 82};
51 83
@@ -67,9 +99,11 @@ struct gcov_ctr_info {
67/** 99/**
68 * struct gcov_info - profiling data per object file 100 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation 101 * @version: gcov version magic indicating the gcc version used for compilation
102 * @modinfo: additional module information
70 * @next: list head for a singly-linked list 103 * @next: list head for a singly-linked list
71 * @stamp: time stamp 104 * @stamp: time stamp
72 * @filename: name of the associated gcov data file 105 * @filename: name of the associated gcov data file
106 * @eof_pos: end position of profile data
73 * @n_functions: number of instrumented functions 107 * @n_functions: number of instrumented functions
74 * @functions: function data 108 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active 109 * @ctr_mask: mask specifying which counter types are active
@@ -80,9 +114,11 @@ struct gcov_ctr_info {
80 */ 114 */
81struct gcov_info { 115struct gcov_info {
82 unsigned int version; 116 unsigned int version;
117 struct gcov_module_info *mod_info;
83 struct gcov_info *next; 118 struct gcov_info *next;
84 unsigned int stamp; 119 unsigned int stamp;
85 const char *filename; 120 const char *filename;
121 unsigned int eof_pos;
86 unsigned int n_functions; 122 unsigned int n_functions;
87 const struct gcov_fn_info *functions; 123 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask; 124 unsigned int ctr_mask;
diff --git a/kernel/groups.c b/kernel/groups.c
index 6b2588dd04f..1cc476d52dd 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
2 * Supplementary group IDs 2 * Supplementary group IDs
3 */ 3 */
4#include <linux/cred.h> 4#include <linux/cred.h>
5#include <linux/export.h> 5#include <linux/module.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize)
31 group_info->blocks[0] = group_info->small_block; 31 group_info->blocks[0] = group_info->small_block;
32 else { 32 else {
33 for (i = 0; i < nblocks; i++) { 33 for (i = 0; i < nblocks; i++) {
34 kgid_t *b; 34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER); 35 b = (void *)__get_free_page(GFP_USER);
36 if (!b) 36 if (!b)
37 goto out_undo_partial_alloc; 37 goto out_undo_partial_alloc;
@@ -66,15 +66,18 @@ EXPORT_SYMBOL(groups_free);
66static int groups_to_user(gid_t __user *grouplist, 66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info) 67 const struct group_info *group_info)
68{ 68{
69 struct user_namespace *user_ns = current_user_ns();
70 int i; 69 int i;
71 unsigned int count = group_info->ngroups; 70 unsigned int count = group_info->ngroups;
72 71
73 for (i = 0; i < count; i++) { 72 for (i = 0; i < group_info->nblocks; i++) {
74 gid_t gid; 73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
75 gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); 74 unsigned int len = cp_count * sizeof(*grouplist);
76 if (put_user(gid, grouplist+i)) 75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT; 77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
78 } 81 }
79 return 0; 82 return 0;
80} 83}
@@ -83,21 +86,18 @@ static int groups_to_user(gid_t __user *grouplist,
83static int groups_from_user(struct group_info *group_info, 86static int groups_from_user(struct group_info *group_info,
84 gid_t __user *grouplist) 87 gid_t __user *grouplist)
85{ 88{
86 struct user_namespace *user_ns = current_user_ns();
87 int i; 89 int i;
88 unsigned int count = group_info->ngroups; 90 unsigned int count = group_info->ngroups;
89 91
90 for (i = 0; i < count; i++) { 92 for (i = 0; i < group_info->nblocks; i++) {
91 gid_t gid; 93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
92 kgid_t kgid; 94 unsigned int len = cp_count * sizeof(*grouplist);
93 if (get_user(gid, grouplist+i))
94 return -EFAULT;
95 95
96 kgid = make_kgid(user_ns, gid); 96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 if (!gid_valid(kgid)) 97 return -EFAULT;
98 return -EINVAL;
99 98
100 GROUP_AT(group_info, i) = kgid; 99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 } 101 }
102 return 0; 102 return 0;
103} 103}
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info)
117 for (base = 0; base < max; base++) { 117 for (base = 0; base < max; base++) {
118 int left = base; 118 int left = base;
119 int right = left + stride; 119 int right = left + stride;
120 kgid_t tmp = GROUP_AT(group_info, right); 120 gid_t tmp = GROUP_AT(group_info, right);
121 121
122 while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { 122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) = 123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left); 124 GROUP_AT(group_info, left);
125 right = left; 125 right = left;
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info)
132} 132}
133 133
134/* a simple bsearch */ 134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, kgid_t grp) 135int groups_search(const struct group_info *group_info, gid_t grp)
136{ 136{
137 unsigned int left, right; 137 unsigned int left, right;
138 138
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 if (gid_gt(grp, GROUP_AT(group_info, mid))) 146 if (grp > GROUP_AT(group_info, mid))
147 left = mid + 1; 147 left = mid + 1;
148 else if (gid_lt(grp, GROUP_AT(group_info, mid))) 148 else if (grp < GROUP_AT(group_info, mid))
149 right = mid; 149 right = mid;
150 else 150 else
151 return 1; 151 return 1;
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
256/* 256/*
257 * Check whether we're fsgid/egid or in the supplemental group.. 257 * Check whether we're fsgid/egid or in the supplemental group..
258 */ 258 */
259int in_group_p(kgid_t grp) 259int in_group_p(gid_t grp)
260{ 260{
261 const struct cred *cred = current_cred(); 261 const struct cred *cred = current_cred();
262 int retval = 1; 262 int retval = 1;
263 263
264 if (!gid_eq(grp, cred->fsgid)) 264 if (grp != cred->fsgid)
265 retval = groups_search(cred->group_info, grp); 265 retval = groups_search(cred->group_info, grp);
266 return retval; 266 return retval;
267} 267}
268 268
269EXPORT_SYMBOL(in_group_p); 269EXPORT_SYMBOL(in_group_p);
270 270
271int in_egroup_p(kgid_t grp) 271int in_egroup_p(gid_t grp)
272{ 272{
273 const struct cred *cred = current_cred(); 273 const struct cred *cred = current_cred();
274 int retval = 1; 274 int retval = 1;
275 275
276 if (!gid_eq(grp, cred->egid)) 276 if (grp != cred->egid)
277 retval = groups_search(cred->group_info, grp); 277 retval = groups_search(cred->group_info, grp);
278 return retval; 278 return retval;
279} 279}
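groups_to_user()/groups_from_user() above copy the supplementary list block by block, NGROUPS_PER_BLOCK gids at a time, rather than one gid per put_user(). The userspace side that triggers that copy-out is plain getgroups(2); a small hypothetical example:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        int n = getgroups(0, NULL);             /* first ask only for the count */
        if (n < 0) {
                perror("getgroups");
                return 1;
        }

        gid_t *list = calloc(n ? n : 1, sizeof(*list));
        if (!list)
                return 1;

        if (getgroups(n, list) < 0) {           /* kernel fills this via groups_to_user() */
                perror("getgroups");
                free(list);
                return 1;
        }
        for (int i = 0; i < n; i++)
                printf("supplementary gid: %u\n", (unsigned)list[i]);
        free(list);
        return 0;
}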
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b..2043c08d36c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/export.h> 35#include <linux/module.h>
36#include <linux/percpu.h> 36#include <linux/percpu.h>
37#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
@@ -657,14 +657,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
668/* 660/*
669 * Retrigger next event is called after clock was set 661 * Retrigger next event is called after clock was set
670 * 662 *
@@ -673,12 +665,22 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
673static void retrigger_next_event(void *arg) 665static void retrigger_next_event(void *arg)
674{ 666{
675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
676 669
677 if (!hrtimer_hres_active()) 670 if (!hrtimer_hres_active())
678 return; 671 return;
679 672
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
680 raw_spin_lock(&base->lock); 678 raw_spin_lock(&base->lock);
681 hrtimer_update_base(base); 679 base->clock_base[HRTIMER_BASE_REALTIME].offset =
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
682 hrtimer_force_reprogram(base, 0); 684 hrtimer_force_reprogram(base, 0);
683 raw_spin_unlock(&base->lock); 685 raw_spin_unlock(&base->lock);
684} 686}
@@ -708,25 +710,13 @@ static int hrtimer_switch_to_hres(void)
708 base->clock_base[i].resolution = KTIME_HIGH_RES; 710 base->clock_base[i].resolution = KTIME_HIGH_RES;
709 711
710 tick_setup_sched_timer(); 712 tick_setup_sched_timer();
713
711 /* "Retrigger" the interrupt to get things going */ 714 /* "Retrigger" the interrupt to get things going */
712 retrigger_next_event(NULL); 715 retrigger_next_event(NULL);
713 local_irq_restore(flags); 716 local_irq_restore(flags);
714 return 1; 717 return 1;
715} 718}
716 719
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
730#else 720#else
731 721
732static inline int hrtimer_hres_active(void) { return 0; } 722static inline int hrtimer_hres_active(void) { return 0; }
@@ -1260,10 +1250,11 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1260 cpu_base->nr_events++; 1250 cpu_base->nr_events++;
1261 dev->next_event.tv64 = KTIME_MAX; 1251 dev->next_event.tv64 = KTIME_MAX;
1262 1252
1263 raw_spin_lock(&cpu_base->lock); 1253 entry_time = now = ktime_get();
1264 entry_time = now = hrtimer_update_base(cpu_base);
1265retry: 1254retry:
1266 expires_next.tv64 = KTIME_MAX; 1255 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1267 /* 1258 /*
1268 * We set expires_next to KTIME_MAX here with cpu_base->lock 1259 * We set expires_next to KTIME_MAX here with cpu_base->lock
1269 * held to prevent that a timer is enqueued in our queue via 1260 * held to prevent that a timer is enqueued in our queue via
@@ -1339,12 +1330,8 @@ retry:
1339 * We need to prevent that we loop forever in the hrtimer 1330 * We need to prevent that we loop forever in the hrtimer
1340 * interrupt routine. We give it 3 attempts to avoid 1331 * interrupt routine. We give it 3 attempts to avoid
1341 * overreacting on some spurious event. 1332 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1345 */ 1333 */
1346 raw_spin_lock(&cpu_base->lock); 1334 now = ktime_get();
1347 now = hrtimer_update_base(cpu_base);
1348 cpu_base->nr_retries++; 1335 cpu_base->nr_retries++;
1349 if (++retries < 3) 1336 if (++retries < 3)
1350 goto retry; 1337 goto retry;
@@ -1356,7 +1343,6 @@ retry:
1356 */ 1343 */
1357 cpu_base->nr_hangs++; 1344 cpu_base->nr_hangs++;
1358 cpu_base->hang_detected = 1; 1345 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1360 delta = ktime_sub(now, entry_time); 1346 delta = ktime_sub(now, entry_time);
1361 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1347 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1362 cpu_base->max_hang_time = delta; 1348 cpu_base->max_hang_time = delta;
@@ -1409,13 +1395,6 @@ void hrtimer_peek_ahead_timers(void)
1409 1395
1410static void run_hrtimer_softirq(struct softirq_action *h) 1396static void run_hrtimer_softirq(struct softirq_action *h)
1411{ 1397{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1419 hrtimer_peek_ahead_timers(); 1398 hrtimer_peek_ahead_timers();
1420} 1399}
1421 1400
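For context, retrigger_next_event() above refreshes the CLOCK_REALTIME/CLOCK_BOOTTIME offsets that every armed hrtimer is expired against. A hedged sketch of a typical consumer of that machinery, a made-up module (demo_timer, demo_fn are invented names) arming a periodic CLOCK_MONOTONIC hrtimer:

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/module.h>

#define DEMO_PERIOD_NS  (100 * NSEC_PER_MSEC)

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_fn(struct hrtimer *t)
{
        /* push the expiry forward by one period and keep the timer running */
        hrtimer_forward_now(t, ktime_set(0, DEMO_PERIOD_NS));
        return HRTIMER_RESTART;
}

static int __init hrtimer_demo_init(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_fn;
        hrtimer_start(&demo_timer, ktime_set(0, DEMO_PERIOD_NS), HRTIMER_MODE_REL);
        return 0;
}

static void __exit hrtimer_demo_exit(void)
{
        hrtimer_cancel(&demo_timer);
}

module_init(hrtimer_demo_init);
module_exit(hrtimer_demo_exit);
MODULE_LICENSE("GPL");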
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9..e972276f12f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/module.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19/* 19/*
@@ -108,10 +108,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
108 108
109 touch_nmi_watchdog(); 109 touch_nmi_watchdog();
110 110
111 if (sysctl_hung_task_panic) { 111 if (sysctl_hung_task_panic)
112 trigger_all_cpu_backtrace();
113 panic("hung_task: blocked tasks"); 112 panic("hung_task: blocked tasks");
114 }
115} 113}
116 114
117/* 115/*
@@ -121,20 +119,15 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
121 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
122 * to exit the grace period. For classic RCU, a reschedule is required. 120 * to exit the grace period. For classic RCU, a reschedule is required.
123 */ 121 */
124static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) 122static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
125{ 123{
126 bool can_cont;
127
128 get_task_struct(g); 124 get_task_struct(g);
129 get_task_struct(t); 125 get_task_struct(t);
130 rcu_read_unlock(); 126 rcu_read_unlock();
131 cond_resched(); 127 cond_resched();
132 rcu_read_lock(); 128 rcu_read_lock();
133 can_cont = pid_alive(g) && pid_alive(t);
134 put_task_struct(t); 129 put_task_struct(t);
135 put_task_struct(g); 130 put_task_struct(g);
136
137 return can_cont;
138} 131}
139 132
140/* 133/*
@@ -161,7 +154,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
161 goto unlock; 154 goto unlock;
162 if (!--batch_count) { 155 if (!--batch_count) {
163 batch_count = HUNG_TASK_BATCHING; 156 batch_count = HUNG_TASK_BATCHING;
164 if (!rcu_lock_break(g, t)) 157 rcu_lock_break(g, t);
158 /* Exit if t or g was unhashed during refresh. */
159 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
165 goto unlock; 160 goto unlock;
166 } 161 }
167 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 162 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1a758bc972..5a38bf4de64 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
13# Options selectable by the architecture code 13# Options selectable by the architecture code
14 14
15# Make sparse irq Kconfig switch below available 15# Make sparse irq Kconfig switch below available
16config MAY_HAVE_SPARSE_IRQ 16config HAVE_SPARSE_IRQ
17 bool 17 bool
18 18
19# Enable the generic irq autoprobe mechanism 19# Enable the generic irq autoprobe mechanism
@@ -56,22 +56,13 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "irq_domain_mapping".
66
67 If you don't know what this means you don't need it.
68
69# Support forced irq threading 59# Support forced irq threading
70config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
71 bool 61 bool
72 62
73config SPARSE_IRQ 63config SPARSE_IRQ
74 bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ 64 bool "Support sparse irq numbering"
65 depends on HAVE_SPARSE_IRQ
75 ---help--- 66 ---help---
76 67
77 Sparse irq numbering is useful for distro kernels that want 68 Sparse irq numbering is useful for distro kernels that want
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 0119b9d467a..342d8f44e40 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
 			if (desc->irq_data.chip->irq_set_type)
 				desc->irq_data.chip->irq_set_type(&desc->irq_data,
 							 IRQ_TYPE_PROBE);
-			irq_startup(desc, false);
+			irq_startup(desc);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && irq_settings_can_probe(desc)) {
 			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-			if (irq_startup(desc, false))
+			if (irq_startup(desc))
 				desc->istate |= IRQS_PENDING;
 		}
 		raw_spin_unlock_irq(&desc->lock);
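
The two hunks above only track the irq_startup() signature change; the driver-facing autoprobe API is untouched. A hedged sketch of how a legacy driver uses it (the device trigger step and the delay are placeholders):

	unsigned long mask;
	int irq;

	mask = probe_irq_on();
	/* ... program the device so it raises its interrupt here ... */
	udelay(100);			/* give the line time to fire */
	irq = probe_irq_off(mask);	/* >0: the probed irq, 0: none, <0: several fired */
	if (irq <= 0)
		pr_warn("IRQ autoprobe failed\n");
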
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30..dc5114b4c16 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,8 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18 18
19#include <trace/events/irq.h>
20
21#include "internals.h" 19#include "internals.h"
22 20
23/** 21/**
@@ -28,7 +26,7 @@
28int irq_set_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
29{ 27{
30 unsigned long flags; 28 unsigned long flags;
31 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
32 30
33 if (!desc) 31 if (!desc)
34 return -EINVAL; 32 return -EINVAL;
@@ -56,14 +54,15 @@ EXPORT_SYMBOL(irq_set_chip);
56int irq_set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
57{ 55{
58 unsigned long flags; 56 unsigned long flags;
59 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
60 int ret = 0; 58 int ret = 0;
61 59
62 if (!desc) 60 if (!desc)
63 return -EINVAL; 61 return -EINVAL;
64 62
65 type &= IRQ_TYPE_SENSE_MASK; 63 type &= IRQ_TYPE_SENSE_MASK;
66 ret = __irq_set_trigger(desc, irq, type); 64 if (type != IRQ_TYPE_NONE)
65 ret = __irq_set_trigger(desc, irq, type);
67 irq_put_desc_busunlock(desc, flags); 66 irq_put_desc_busunlock(desc, flags);
68 return ret; 67 return ret;
69} 68}
@@ -79,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
79int irq_set_handler_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
80{ 79{
81 unsigned long flags; 80 unsigned long flags;
82 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
83 82
84 if (!desc) 83 if (!desc)
85 return -EINVAL; 84 return -EINVAL;
@@ -99,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
100{ 99{
101 unsigned long flags; 100 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
103 102
104 if (!desc) 103 if (!desc)
105 return -EINVAL; 104 return -EINVAL;
@@ -120,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
120int irq_set_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
121{ 120{
122 unsigned long flags; 121 unsigned long flags;
123 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
124 123
125 if (!desc) 124 if (!desc)
126 return -EINVAL; 125 return -EINVAL;
@@ -158,22 +157,19 @@ static void irq_state_set_masked(struct irq_desc *desc)
158 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
159} 158}
160 159
161int irq_startup(struct irq_desc *desc, bool resend) 160int irq_startup(struct irq_desc *desc)
162{ 161{
163 int ret = 0;
164
165 irq_state_clr_disabled(desc); 162 irq_state_clr_disabled(desc);
166 desc->depth = 0; 163 desc->depth = 0;
167 164
168 if (desc->irq_data.chip->irq_startup) { 165 if (desc->irq_data.chip->irq_startup) {
169 ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
170 irq_state_clr_masked(desc); 167 irq_state_clr_masked(desc);
171 } else { 168 return ret;
172 irq_enable(desc);
173 } 169 }
174 if (resend) 170
175 check_irq_resend(desc, desc->irq_data.irq); 171 irq_enable(desc);
176 return ret; 172 return 0;
177} 173}
178 174
179void irq_shutdown(struct irq_desc *desc) 175void irq_shutdown(struct irq_desc *desc)
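
The hunk above changes how the core invokes a chip's ->irq_startup() hook and whether a pending interrupt is resent afterwards; the hook itself is still supplied by irq controller drivers in the usual way. A minimal assumed example, with the hardware register accesses stubbed out:

	static void my_chip_mask(struct irq_data *d)   { /* mask d->hwirq in hardware */ }
	static void my_chip_unmask(struct irq_data *d) { /* unmask d->hwirq in hardware */ }

	static unsigned int my_chip_startup(struct irq_data *d)
	{
		my_chip_unmask(d);
		return 0;	/* non-zero is treated as "already pending" by autoprobe */
	}

	static struct irq_chip my_chip = {
		.name		= "MYCHIP",
		.irq_startup	= my_chip_startup,
		.irq_mask	= my_chip_mask,
		.irq_unmask	= my_chip_unmask,
	};
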
@@ -208,24 +204,6 @@ void irq_disable(struct irq_desc *desc)
208 } 204 }
209} 205}
210 206
211void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
212{
213 if (desc->irq_data.chip->irq_enable)
214 desc->irq_data.chip->irq_enable(&desc->irq_data);
215 else
216 desc->irq_data.chip->irq_unmask(&desc->irq_data);
217 cpumask_set_cpu(cpu, desc->percpu_enabled);
218}
219
220void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
221{
222 if (desc->irq_data.chip->irq_disable)
223 desc->irq_data.chip->irq_disable(&desc->irq_data);
224 else
225 desc->irq_data.chip->irq_mask(&desc->irq_data);
226 cpumask_clear_cpu(cpu, desc->percpu_enabled);
227}
228
229static inline void mask_ack_irq(struct irq_desc *desc) 207static inline void mask_ack_irq(struct irq_desc *desc)
230{ 208{
231 if (desc->irq_data.chip->irq_mask_ack) 209 if (desc->irq_data.chip->irq_mask_ack)
@@ -272,14 +250,11 @@ void handle_nested_irq(unsigned int irq)
272 250
273 raw_spin_lock_irq(&desc->lock); 251 raw_spin_lock_irq(&desc->lock);
274 252
275 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
276 kstat_incr_irqs_this_cpu(irq, desc); 253 kstat_incr_irqs_this_cpu(irq, desc);
277 254
278 action = desc->action; 255 action = desc->action;
279 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { 256 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
280 desc->istate |= IRQS_PENDING;
281 goto out_unlock; 257 goto out_unlock;
282 }
283 258
284 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 259 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
285 raw_spin_unlock_irq(&desc->lock); 260 raw_spin_unlock_irq(&desc->lock);
@@ -327,10 +302,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
327 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 302 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
328 kstat_incr_irqs_this_cpu(irq, desc); 303 kstat_incr_irqs_this_cpu(irq, desc);
329 304
330 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { 305 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
331 desc->istate |= IRQS_PENDING;
332 goto out_unlock; 306 goto out_unlock;
333 }
334 307
335 handle_irq_event(desc); 308 handle_irq_event(desc);
336 309
@@ -339,24 +312,6 @@ out_unlock:
339} 312}
340EXPORT_SYMBOL_GPL(handle_simple_irq); 313EXPORT_SYMBOL_GPL(handle_simple_irq);
341 314
342/*
343 * Called unconditionally from handle_level_irq() and only for oneshot
344 * interrupts from handle_fasteoi_irq()
345 */
346static void cond_unmask_irq(struct irq_desc *desc)
347{
348 /*
349 * We need to unmask in the following cases:
350 * - Standard level irq (IRQF_ONESHOT is not set)
351 * - Oneshot irq which did not wake the thread (caused by a
352 * spurious interrupt or a primary handler handling it
353 * completely).
354 */
355 if (!irqd_irq_disabled(&desc->irq_data) &&
356 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
357 unmask_irq(desc);
358}
359
360/** 315/**
361 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
362 * @irq: the interrupt number 317 * @irq: the interrupt number
@@ -384,15 +339,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
384 * If its disabled or no action available 339 * If its disabled or no action available
385 * keep it masked and get out of here 340 * keep it masked and get out of here
386 */ 341 */
387 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { 342 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
388 desc->istate |= IRQS_PENDING;
389 goto out_unlock; 343 goto out_unlock;
390 }
391 344
392 handle_irq_event(desc); 345 handle_irq_event(desc);
393 346
394 cond_unmask_irq(desc); 347 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
395 348 unmask_irq(desc);
396out_unlock: 349out_unlock:
397 raw_spin_unlock(&desc->lock); 350 raw_spin_unlock(&desc->lock);
398} 351}
@@ -446,9 +399,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
446 preflow_handler(desc); 399 preflow_handler(desc);
447 handle_irq_event(desc); 400 handle_irq_event(desc);
448 401
449 if (desc->istate & IRQS_ONESHOT)
450 cond_unmask_irq(desc);
451
452out_eoi: 402out_eoi:
453 desc->irq_data.chip->irq_eoi(&desc->irq_data); 403 desc->irq_data.chip->irq_eoi(&desc->irq_data);
454out_unlock: 404out_unlock:
@@ -525,7 +475,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
525out_unlock: 475out_unlock:
526 raw_spin_unlock(&desc->lock); 476 raw_spin_unlock(&desc->lock);
527} 477}
528EXPORT_SYMBOL(handle_edge_irq);
529 478
530#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER 479#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
531/** 480/**
@@ -595,44 +544,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
595 chip->irq_eoi(&desc->irq_data); 544 chip->irq_eoi(&desc->irq_data);
596} 545}
597 546
598/**
599 * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
600 * @irq: the interrupt number
601 * @desc: the interrupt description structure for this irq
602 *
603 * Per CPU interrupts on SMP machines without locking requirements. Same as
604 * handle_percpu_irq() above but with the following extras:
605 *
606 * action->percpu_dev_id is a pointer to percpu variables which
607 * contain the real device id for the cpu on which this handler is
608 * called
609 */
610void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
611{
612 struct irq_chip *chip = irq_desc_get_chip(desc);
613 struct irqaction *action = desc->action;
614 void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
615 irqreturn_t res;
616
617 kstat_incr_irqs_this_cpu(irq, desc);
618
619 if (chip->irq_ack)
620 chip->irq_ack(&desc->irq_data);
621
622 trace_irq_handler_entry(irq, action);
623 res = action->handler(irq, dev_id);
624 trace_irq_handler_exit(irq, action, res);
625
626 if (chip->irq_eoi)
627 chip->irq_eoi(&desc->irq_data);
628}
629
630void 547void
631__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
632 const char *name) 549 const char *name)
633{ 550{
634 unsigned long flags; 551 unsigned long flags;
635 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); 552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
636 553
637 if (!desc) 554 if (!desc)
638 return; 555 return;
@@ -658,7 +575,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
658 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
659 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
660 irq_settings_set_nothread(desc); 577 irq_settings_set_nothread(desc);
661 irq_startup(desc, true); 578 irq_startup(desc);
662 } 579 }
663out: 580out:
664 irq_put_desc_busunlock(desc, flags); 581 irq_put_desc_busunlock(desc, flags);
@@ -672,12 +589,11 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
672 irq_set_chip(irq, chip); 589 irq_set_chip(irq, chip);
673 __irq_set_handler(irq, handle, 0, name); 590 __irq_set_handler(irq, handle, 0, name);
674} 591}
675EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
676 592
677void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
678{ 594{
679 unsigned long flags; 595 unsigned long flags;
680 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
681 597
682 if (!desc) 598 if (!desc)
683 return; 599 return;
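
Most of the chip.c hunks above add or drop EXPORT_SYMBOL()s and the descriptor-check argument; the way a platform driver installs a chip and flow handler is the same in both trees. A short sketch, with placeholder chip and cookie names:

	irq_set_chip_and_handler_name(irq, &my_chip, handle_level_irq, "level");
	irq_set_chip_data(irq, my_chip_regs);		/* read back via irq_data_get_irq_chip_data() */
	irq_set_handler_data(irq, my_handler_cookie);	/* read back via irq_get_handler_data() */
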
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e75e29e4434..97a8bfadc88 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -4,10 +4,10 @@
4 4
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6 6
7#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) 7#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
8#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) 8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9/* FIXME */ 9/* FIXME */
10#define ___PD(f) do { } while (0) 10#define PD(f) do { } while (0)
11 11
12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 12static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
13{ 13{
@@ -23,23 +23,23 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
23 print_symbol("%s\n", (unsigned long)desc->action->handler); 23 print_symbol("%s\n", (unsigned long)desc->action->handler);
24 } 24 }
25 25
26 ___P(IRQ_LEVEL); 26 P(IRQ_LEVEL);
27 ___P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 ___P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 ___P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 ___P(IRQ_NOTHREAD); 30 P(IRQ_NOTHREAD);
31 ___P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
32 32
33 ___PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
34 ___PS(IRQS_REPLAY); 34 PS(IRQS_REPLAY);
35 ___PS(IRQS_WAITING); 35 PS(IRQS_WAITING);
36 ___PS(IRQS_PENDING); 36 PS(IRQS_PENDING);
37 37
38 ___PD(IRQS_INPROGRESS); 38 PD(IRQS_INPROGRESS);
39 ___PD(IRQS_DISABLED); 39 PD(IRQS_DISABLED);
40 ___PD(IRQS_MASKED); 40 PD(IRQS_MASKED);
41} 41}
42 42
43#undef ___P 43#undef P
44#undef ___PS 44#undef PS
45#undef ___PD 45#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 988dc58e884..b5fcd96c710 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/export.h>
10 9
11#include "internals.h" 10#include "internals.h"
12 11
@@ -58,4 +57,3 @@ struct irq_chip dummy_irq_chip = {
58 .irq_mask = noop, 57 .irq_mask = noop,
59 .irq_unmask = noop, 58 .irq_unmask = noop,
60}; 59};
61EXPORT_SYMBOL_GPL(dummy_irq_chip);
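
The dummychip.c change drops the <linux/export.h> include together with the EXPORT_SYMBOL_GPL() of dummy_irq_chip; the two belong together, since the macro is what makes a built-in symbol usable from modules. A generic sketch with a placeholder chip:

	#include <linux/export.h>
	#include <linux/irq.h>

	static void my_noop(struct irq_data *d) { }

	struct irq_chip my_shared_chip = {
		.name		= "MYCHIP-NOOP",
		.irq_mask	= my_noop,
		.irq_unmask	= my_noop,
	};
	EXPORT_SYMBOL_GPL(my_shared_chip);	/* visible to GPL-compatible modules */
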
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f66..e38544dddb1 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,7 +6,6 @@
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h>
10#include <linux/interrupt.h> 9#include <linux/interrupt.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
12#include <linux/syscore_ops.h> 11#include <linux/syscore_ops.h>
@@ -212,7 +211,6 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
212 } 211 }
213 return gc; 212 return gc;
214} 213}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
216 214
217/* 215/*
218 * Separate lockdep class for interrupt chip which can nest irq_desc 216 * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -260,7 +258,6 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
260 } 258 }
261 gc->irq_cnt = i - gc->irq_base; 259 gc->irq_cnt = i - gc->irq_base;
262} 260}
263EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
264 261
265/** 262/**
266 * irq_setup_alt_chip - Switch to alternative chip 263 * irq_setup_alt_chip - Switch to alternative chip
@@ -284,7 +281,6 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
284 } 281 }
285 return -EINVAL; 282 return -EINVAL;
286} 283}
287EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
288 284
289/** 285/**
290 * irq_remove_generic_chip - Remove a chip 286 * irq_remove_generic_chip - Remove a chip
@@ -315,7 +311,6 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
315 irq_modify_status(i, clr, set); 311 irq_modify_status(i, clr, set);
316 } 312 }
317} 313}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
319 314
320#ifdef CONFIG_PM 315#ifdef CONFIG_PM
321static int irq_gc_suspend(void) 316static int irq_gc_suspend(void)
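
The generic-chip.c hunks only strip EXPORT_SYMBOL_GPL()s; the helper API that platform interrupt controllers build on is unchanged. A sketch of typical use, where the register offsets, the 32-line mask and the MYINTC names are assumptions:

	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("MYINTC", 1, irq_base, reg_base,
				    handle_level_irq);
	if (!gc)
		return -ENOMEM;
	ct = gc->chip_types;
	ct->chip.irq_mask   = irq_gc_mask_set_bit;
	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
	ct->chip.irq_ack    = irq_gc_ack_set_bit;
	ct->regs.mask       = MYINTC_MASK_OFFSET;
	ct->regs.ack        = MYINTC_ACK_OFFSET;
	irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_MASK_CACHE,
			       IRQ_NOREQUEST, 0);
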
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 131ca176b49..470d08c82bb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,18 +54,14 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55{ 55{
56 /* 56 /*
57 * In case the thread crashed and was killed we just pretend that 57 * Wake up the handler thread for this action. In case the
58 * we handled the interrupt. The hardirq handler has disabled the 58 * thread crashed and was killed we just pretend that we
59 * device interrupt, so no irq storm is lurking. 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61 if (action->thread->flags & PF_EXITING)
62 return;
63
64 /*
65 * Wake up the handler thread for this action. If the
66 * RUNTHREAD bit is already set, nothing to do. 61 * RUNTHREAD bit is already set, nothing to do.
67 */ 62 */
68 if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
69 return; 65 return;
70 66
71 /* 67 /*
@@ -114,18 +110,6 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
114 * threads_oneshot untouched and runs the thread another time. 110 * threads_oneshot untouched and runs the thread another time.
115 */ 111 */
116 desc->threads_oneshot |= action->thread_mask; 112 desc->threads_oneshot |= action->thread_mask;
117
118 /*
119 * We increment the threads_active counter in case we wake up
120 * the irq thread. The irq thread decrements the counter when
121 * it returns from the handler or in the exit path and wakes
122 * up waiters which are stuck in synchronize_irq() when the
123 * active count becomes zero. synchronize_irq() is serialized
124 * against this code (hard irq handler) via IRQS_INPROGRESS
125 * like the finalize_oneshot() code. See comment above.
126 */
127 atomic_inc(&desc->threads_active);
128
129 wake_up_process(action->thread); 113 wake_up_process(action->thread);
130} 114}
131 115
@@ -133,7 +117,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 118{
135 irqreturn_t retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
136 unsigned int flags = 0, irq = desc->irq_data.irq; 120 unsigned int random = 0, irq = desc->irq_data.irq;
137 121
138 do { 122 do {
139 irqreturn_t res; 123 irqreturn_t res;
@@ -161,7 +145,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 145
162 /* Fall through to add to randomness */ 146 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 147 case IRQ_HANDLED:
164 flags |= action->flags; 148 random |= action->flags;
165 break; 149 break;
166 150
167 default: 151 default:
@@ -172,7 +156,8 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 156 action = action->next;
173 } while (action); 157 } while (action);
174 158
175 add_interrupt_randomness(irq, flags); 159 if (random & IRQF_SAMPLE_RANDOM)
160 add_interrupt_randomness(irq);
176 161
177 if (!noirqdebug) 162 if (!noirqdebug)
178 note_interrupt(irq, desc, retval); 163 note_interrupt(irq, desc, retval);
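
This handle.c hunk restores the older entropy scheme, where add_interrupt_randomness() runs only for actions registered with IRQF_SAMPLE_RANDOM; the newer code samples every interrupt and passes the accumulated action flags instead. Under the older scheme a driver opted in at request time, roughly as below (handler and cookie names are placeholders):

	/* Old-style opt-in; IRQF_SAMPLE_RANDOM was later removed from mainline. */
	ret = request_irq(irq, my_isr, IRQF_SHARED | IRQF_SAMPLE_RANDOM,
			  "mydev", dev);
	if (ret)
		pr_err("mydev: cannot request IRQ %d\n", irq);
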
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 001fa5bab49..6546431447d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,17 +15,19 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern bool noirqdebug; 18extern int noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
23 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed 24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
24 * IRQTF_AFFINITY - irq thread is requested to adjust affinity 25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
25 * IRQTF_FORCED_THREAD - irq action is force threaded 26 * IRQTF_FORCED_THREAD - irq action is force threaded
26 */ 27 */
27enum { 28enum {
28 IRQTF_RUNTHREAD, 29 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
29 IRQTF_WARNED, 31 IRQTF_WARNED,
30 IRQTF_AFFINITY, 32 IRQTF_AFFINITY,
31 IRQTF_FORCED_THREAD, 33 IRQTF_FORCED_THREAD,
@@ -65,12 +67,10 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
65extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
66extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
67 69
68extern int irq_startup(struct irq_desc *desc, bool resend); 70extern int irq_startup(struct irq_desc *desc);
69extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
70extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
71extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
72extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
73extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 74extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 75extern void unmask_irq(struct irq_desc *desc);
76 76
@@ -101,9 +101,6 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
101 101
102extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
103 103
104extern int irq_do_set_affinity(struct irq_data *data,
105 const struct cpumask *dest, bool force);
106
107/* Inline functions for support of irq chips on slow busses */ 104/* Inline functions for support of irq chips on slow busses */
108static inline void chip_bus_lock(struct irq_desc *desc) 105static inline void chip_bus_lock(struct irq_desc *desc)
109{ 106{
@@ -117,21 +114,14 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
117 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
118} 115}
119 116
120#define _IRQ_DESC_CHECK (1 << 0)
121#define _IRQ_DESC_PERCPU (1 << 1)
122
123#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
124#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
125
126struct irq_desc * 117struct irq_desc *
127__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, 118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
128 unsigned int check);
129void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); 119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
130 120
131static inline struct irq_desc * 121static inline struct irq_desc *
132irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) 122irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
133{ 123{
134 return __irq_get_desc_lock(irq, flags, true, check); 124 return __irq_get_desc_lock(irq, flags, true);
135} 125}
136 126
137static inline void 127static inline void
@@ -141,9 +131,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
141} 131}
142 132
143static inline struct irq_desc * 133static inline struct irq_desc *
144irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) 134irq_get_desc_lock(unsigned int irq, unsigned long *flags)
145{ 135{
146 return __irq_get_desc_lock(irq, flags, false, check); 136 return __irq_get_desc_lock(irq, flags, false);
147} 137}
148 138
149static inline void 139static inline void
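
The internals.h hunk removes the per-CPU-devid check argument from the descriptor locking helpers; core-code callers keep the same lock, modify, unlock shape either way. A sketch against the newer three-argument form shown in the left column:

	unsigned long flags;
	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);

	if (!desc)
		return -EINVAL;
	/* ... adjust descriptor state under desc->lock ... */
	irq_put_desc_unlock(desc, flags);
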
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cf..039b889ea05 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
@@ -112,7 +112,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
112{ 112{
113 return radix_tree_lookup(&irq_desc_tree, irq); 113 return radix_tree_lookup(&irq_desc_tree, irq);
114} 114}
115EXPORT_SYMBOL(irq_to_desc);
116 115
117static void delete_irq_desc(unsigned int irq) 116static void delete_irq_desc(unsigned int irq)
118{ 117{
@@ -425,22 +424,11 @@ unsigned int irq_get_next_irq(unsigned int offset)
425} 424}
426 425
427struct irq_desc * 426struct irq_desc *
428__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, 427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
429 unsigned int check)
430{ 428{
431 struct irq_desc *desc = irq_to_desc(irq); 429 struct irq_desc *desc = irq_to_desc(irq);
432 430
433 if (desc) { 431 if (desc) {
434 if (check & _IRQ_DESC_CHECK) {
435 if ((check & _IRQ_DESC_PERCPU) &&
436 !irq_settings_is_per_cpu_devid(desc))
437 return NULL;
438
439 if (!(check & _IRQ_DESC_PERCPU) &&
440 irq_settings_is_per_cpu_devid(desc))
441 return NULL;
442 }
443
444 if (bus) 432 if (bus)
445 chip_bus_lock(desc); 433 chip_bus_lock(desc);
446 raw_spin_lock_irqsave(&desc->lock, *flags); 434 raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -455,25 +443,6 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
455 chip_bus_sync_unlock(desc); 443 chip_bus_sync_unlock(desc);
456} 444}
457 445
458int irq_set_percpu_devid(unsigned int irq)
459{
460 struct irq_desc *desc = irq_to_desc(irq);
461
462 if (!desc)
463 return -EINVAL;
464
465 if (desc->percpu_enabled)
466 return -EINVAL;
467
468 desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
469
470 if (!desc->percpu_enabled)
471 return -ENOMEM;
472
473 irq_set_percpu_devid_flags(irq);
474 return 0;
475}
476
477/** 446/**
478 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 447 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
479 * @irq: irq number to initialize 448 * @irq: irq number to initialize
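
irqdesc.c loses irq_set_percpu_devid() in this revert. On trees that have it, a per-CPU interrupt (an ARM local timer, for instance) is set up roughly as sketched below; the handler and the per-CPU data names are placeholders:

	static DEFINE_PER_CPU(struct my_dev_data, my_dev_data);

	static int my_setup_local_irq(unsigned int irq)
	{
		int err;

		irq_set_percpu_devid(irq);
		err = request_percpu_irq(irq, my_percpu_isr, "my-local-timer",
					 &my_dev_data);
		if (err)
			return err;

		/* each CPU that should receive the interrupt then calls: */
		enable_percpu_irq(irq, IRQ_TYPE_NONE);
		return 0;
	}
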
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c37..b57a3776de4 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,925 +1,184 @@
1#define pr_fmt(fmt) "irq: " fmt
2
3#include <linux/debugfs.h>
4#include <linux/hardirq.h>
5#include <linux/interrupt.h>
6#include <linux/irq.h> 1#include <linux/irq.h>
7#include <linux/irqdesc.h>
8#include <linux/irqdomain.h> 2#include <linux/irqdomain.h>
9#include <linux/module.h> 3#include <linux/module.h>
10#include <linux/mutex.h> 4#include <linux/mutex.h>
11#include <linux/of.h> 5#include <linux/of.h>
12#include <linux/of_address.h> 6#include <linux/of_address.h>
13#include <linux/topology.h>
14#include <linux/seq_file.h>
15#include <linux/slab.h> 7#include <linux/slab.h>
16#include <linux/smp.h>
17#include <linux/fs.h>
18
19#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
20 * ie. legacy 8259, gets irqs 1..15 */
21#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
22#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
23#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
24 8
25static LIST_HEAD(irq_domain_list); 9static LIST_HEAD(irq_domain_list);
26static DEFINE_MUTEX(irq_domain_mutex); 10static DEFINE_MUTEX(irq_domain_mutex);
27 11
28static DEFINE_MUTEX(revmap_trees_mutex);
29static struct irq_domain *irq_default_domain;
30
31/** 12/**
32 * irq_domain_alloc() - Allocate a new irq_domain data structure 13 * irq_domain_add() - Register an irq_domain
33 * @of_node: optional device-tree node of the interrupt controller 14 * @domain: ptr to initialized irq_domain structure
34 * @revmap_type: type of reverse mapping to use
35 * @ops: map/unmap domain callbacks
36 * @host_data: Controller private data pointer
37 * 15 *
38 * Allocates and initialize and irq_domain structure. Caller is expected to 16 * Registers an irq_domain structure. The irq_domain must at a minimum be
39 * register allocated irq_domain with irq_domain_register(). Returns pointer 17 * initialized with an ops structure pointer, and either a ->to_irq hook or
40 * to IRQ domain, or NULL on failure. 18 * a valid irq_base value. Everything else is optional.
41 */ 19 */
42static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 20void irq_domain_add(struct irq_domain *domain)
43 unsigned int revmap_type,
44 const struct irq_domain_ops *ops,
45 void *host_data)
46{
47 struct irq_domain *domain;
48
49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
50 of_node_to_nid(of_node));
51 if (WARN_ON(!domain))
52 return NULL;
53
54 /* Fill structure */
55 domain->revmap_type = revmap_type;
56 domain->ops = ops;
57 domain->host_data = host_data;
58 domain->of_node = of_node_get(of_node);
59
60 return domain;
61}
62
63static void irq_domain_free(struct irq_domain *domain)
64{ 21{
65 of_node_put(domain->of_node); 22 struct irq_data *d;
66 kfree(domain); 23 int hwirq;
67}
68
69static void irq_domain_add(struct irq_domain *domain)
70{
71 mutex_lock(&irq_domain_mutex);
72 list_add(&domain->link, &irq_domain_list);
73 mutex_unlock(&irq_domain_mutex);
74 pr_debug("Allocated domain of type %d @0x%p\n",
75 domain->revmap_type, domain);
76}
77
78/**
79 * irq_domain_remove() - Remove an irq domain.
80 * @domain: domain to remove
81 *
82 * This routine is used to remove an irq domain. The caller must ensure
83 * that all mappings within the domain have been disposed of prior to
84 * use, depending on the revmap type.
85 */
86void irq_domain_remove(struct irq_domain *domain)
87{
88 mutex_lock(&irq_domain_mutex);
89
90 switch (domain->revmap_type) {
91 case IRQ_DOMAIN_MAP_LEGACY:
92 /*
93 * Legacy domains don't manage their own irq_desc
94 * allocations, we expect the caller to handle irq_desc
95 * freeing on their own.
96 */
97 break;
98 case IRQ_DOMAIN_MAP_TREE:
99 /*
100 * radix_tree_delete() takes care of destroying the root
101 * node when all entries are removed. Shout if there are
102 * any mappings left.
103 */
104 WARN_ON(domain->revmap_data.tree.height);
105 break;
106 case IRQ_DOMAIN_MAP_LINEAR:
107 kfree(domain->revmap_data.linear.revmap);
108 domain->revmap_data.linear.size = 0;
109 break;
110 case IRQ_DOMAIN_MAP_NOMAP:
111 break;
112 }
113
114 list_del(&domain->link);
115 24
116 /* 25 /*
117 * If the going away domain is the default one, reset it. 26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
118 */ 29 */
119 if (unlikely(irq_default_domain == domain)) 30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
120 irq_set_default_host(NULL); 31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
121 32 if (!d) {
122 mutex_unlock(&irq_domain_mutex); 33 WARN(1, "error: assigning domain to non existant irq_desc");
123 34 return;
124 pr_debug("Removed domain of type %d @0x%p\n", 35 }
125 domain->revmap_type, domain); 36 if (d->domain) {
126 37 /* things are broken; just report, don't clean up */
127 irq_domain_free(domain); 38 WARN(1, "error: irq_desc already assigned to a domain");
128} 39 return;
129EXPORT_SYMBOL_GPL(irq_domain_remove);
130
131static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
132 irq_hw_number_t hwirq)
133{
134 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
135 int size = domain->revmap_data.legacy.size;
136
137 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
138 return 0;
139 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
140}
141
142/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain
147 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer
149 *
150 * Allocates a legacy irq_domain if irq_base is positive or a linear
151 * domain otherwise. For the legacy domain, IRQ descriptors will also
152 * be allocated.
153 *
154 * This is intended to implement the expected behaviour for most
155 * interrupt controllers which is that a linear mapping should
156 * normally be used unless the system requires a legacy mapping in
157 * order to support supplying interrupt numbers during non-DT
158 * registration of devices.
159 */
160struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
161 unsigned int size,
162 unsigned int first_irq,
163 const struct irq_domain_ops *ops,
164 void *host_data)
165{
166 if (first_irq > 0) {
167 int irq_base;
168
169 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
170 /*
171 * Set the descriptor allocator to search for a
172 * 1-to-1 mapping, such as irq_alloc_desc_at().
173 * Use of_node_to_nid() which is defined to
174 * numa_node_id() on platforms that have no custom
175 * implementation.
176 */
177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node));
179 if (irq_base < 0) {
180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq);
182 irq_base = first_irq;
183 }
184 } else
185 irq_base = first_irq;
186
187 return irq_domain_add_legacy(of_node, size, irq_base, 0,
188 ops, host_data);
189 }
190
191 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data);
193}
194
195/**
196 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
197 * @of_node: pointer to interrupt controller's device tree node.
198 * @size: total number of irqs in legacy mapping
199 * @first_irq: first number of irq block assigned to the domain
200 * @first_hwirq: first hwirq number to use for the translation. Should normally
201 * be '0', but a positive integer can be used if the effective
202 * hwirqs numbering does not begin at zero.
203 * @ops: map/unmap domain callbacks
204 * @host_data: Controller private data pointer
205 *
206 * Note: the map() callback will be called before this function returns
207 * for all legacy interrupts except 0 (which is always the invalid irq for
208 * a legacy controller).
209 */
210struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
211 unsigned int size,
212 unsigned int first_irq,
213 irq_hw_number_t first_hwirq,
214 const struct irq_domain_ops *ops,
215 void *host_data)
216{
217 struct irq_domain *domain;
218 unsigned int i;
219
220 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
221 if (!domain)
222 return NULL;
223
224 domain->revmap_data.legacy.first_irq = first_irq;
225 domain->revmap_data.legacy.first_hwirq = first_hwirq;
226 domain->revmap_data.legacy.size = size;
227
228 mutex_lock(&irq_domain_mutex);
229 /* Verify that all the irqs are available */
230 for (i = 0; i < size; i++) {
231 int irq = first_irq + i;
232 struct irq_data *irq_data = irq_get_irq_data(irq);
233
234 if (WARN_ON(!irq_data || irq_data->domain)) {
235 mutex_unlock(&irq_domain_mutex);
236 irq_domain_free(domain);
237 return NULL;
238 } 40 }
41 d->domain = domain;
42 d->hwirq = hwirq;
239 } 43 }
240 44
241 /* Claim all of the irqs before registering a legacy domain */ 45 mutex_lock(&irq_domain_mutex);
242 for (i = 0; i < size; i++) { 46 list_add(&domain->list, &irq_domain_list);
243 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
244 irq_data->hwirq = first_hwirq + i;
245 irq_data->domain = domain;
246 }
247 mutex_unlock(&irq_domain_mutex); 47 mutex_unlock(&irq_domain_mutex);
248
249 for (i = 0; i < size; i++) {
250 int irq = first_irq + i;
251 int hwirq = first_hwirq + i;
252
253 /* IRQ0 gets ignored */
254 if (!irq)
255 continue;
256
257 /* Legacy flags are left to default at this point,
258 * one can then use irq_create_mapping() to
259 * explicitly change them
260 */
261 if (ops->map)
262 ops->map(domain, irq, hwirq);
263
264 /* Clear norequest flags */
265 irq_clear_status_flags(irq, IRQ_NOREQUEST);
266 }
267
268 irq_domain_add(domain);
269 return domain;
270}
271EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
272
273/**
274 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
275 * @of_node: pointer to interrupt controller's device tree node.
276 * @size: Number of interrupts in the domain.
277 * @ops: map/unmap domain callbacks
278 * @host_data: Controller private data pointer
279 */
280struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
281 unsigned int size,
282 const struct irq_domain_ops *ops,
283 void *host_data)
284{
285 struct irq_domain *domain;
286 unsigned int *revmap;
287
288 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
289 of_node_to_nid(of_node));
290 if (WARN_ON(!revmap))
291 return NULL;
292
293 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
294 if (!domain) {
295 kfree(revmap);
296 return NULL;
297 }
298 domain->revmap_data.linear.size = size;
299 domain->revmap_data.linear.revmap = revmap;
300 irq_domain_add(domain);
301 return domain;
302} 48}
303EXPORT_SYMBOL_GPL(irq_domain_add_linear);
304
305struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
306 unsigned int max_irq,
307 const struct irq_domain_ops *ops,
308 void *host_data)
309{
310 struct irq_domain *domain = irq_domain_alloc(of_node,
311 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
312 if (domain) {
313 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
314 irq_domain_add(domain);
315 }
316 return domain;
317}
318EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
319 49
320/** 50/**
321 * irq_domain_add_tree() 51 * irq_domain_del() - Unregister an irq_domain
322 * @of_node: pointer to interrupt controller's device tree node. 52 * @domain: ptr to registered irq_domain.
323 * @ops: map/unmap domain callbacks
324 *
325 * Note: The radix tree will be allocated later during boot automatically
326 * (the reverse mapping will use the slow path until that happens).
327 */ 53 */
328struct irq_domain *irq_domain_add_tree(struct device_node *of_node, 54void irq_domain_del(struct irq_domain *domain)
329 const struct irq_domain_ops *ops,
330 void *host_data)
331{ 55{
332 struct irq_domain *domain = irq_domain_alloc(of_node, 56 struct irq_data *d;
333 IRQ_DOMAIN_MAP_TREE, ops, host_data); 57 int hwirq;
334 if (domain) {
335 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
336 irq_domain_add(domain);
337 }
338 return domain;
339}
340EXPORT_SYMBOL_GPL(irq_domain_add_tree);
341 58
342/**
343 * irq_find_host() - Locates a domain for a given device node
344 * @node: device-tree node of the interrupt controller
345 */
346struct irq_domain *irq_find_host(struct device_node *node)
347{
348 struct irq_domain *h, *found = NULL;
349 int rc;
350
351 /* We might want to match the legacy controller last since
352 * it might potentially be set to match all interrupts in
353 * the absence of a device node. This isn't a problem so far
354 * yet though...
355 */
356 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
357 list_for_each_entry(h, &irq_domain_list, link) { 60 list_del(&domain->list);
358 if (h->ops->match)
359 rc = h->ops->match(h, node);
360 else
361 rc = (h->of_node != NULL) && (h->of_node == node);
362
363 if (rc) {
364 found = h;
365 break;
366 }
367 }
368 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
369 return found;
370}
371EXPORT_SYMBOL_GPL(irq_find_host);
372
373/**
374 * irq_set_default_host() - Set a "default" irq domain
375 * @domain: default domain pointer
376 *
377 * For convenience, it's possible to set a "default" domain that will be used
378 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
379 * platforms that want to manipulate a few hard coded interrupt numbers that
380 * aren't properly represented in the device-tree.
381 */
382void irq_set_default_host(struct irq_domain *domain)
383{
384 pr_debug("Default domain set to @0x%p\n", domain);
385
386 irq_default_domain = domain;
387}
388EXPORT_SYMBOL_GPL(irq_set_default_host);
389
390static void irq_domain_disassociate_many(struct irq_domain *domain,
391 unsigned int irq_base, int count)
392{
393 /*
394 * disassociate in reverse order;
395 * not strictly necessary, but nice for unwinding
396 */
397 while (count--) {
398 int irq = irq_base + count;
399 struct irq_data *irq_data = irq_get_irq_data(irq);
400 irq_hw_number_t hwirq = irq_data->hwirq;
401
402 if (WARN_ON(!irq_data || irq_data->domain != domain))
403 continue;
404
405 irq_set_status_flags(irq, IRQ_NOREQUEST);
406
407 /* remove chip and handler */
408 irq_set_chip_and_handler(irq, NULL, NULL);
409
410 /* Make sure it's completed */
411 synchronize_irq(irq);
412
413 /* Tell the PIC about it */
414 if (domain->ops->unmap)
415 domain->ops->unmap(domain, irq);
416 smp_mb();
417
418 irq_data->domain = NULL;
419 irq_data->hwirq = 0;
420
421 /* Clear reverse map */
422 switch(domain->revmap_type) {
423 case IRQ_DOMAIN_MAP_LINEAR:
424 if (hwirq < domain->revmap_data.linear.size)
425 domain->revmap_data.linear.revmap[hwirq] = 0;
426 break;
427 case IRQ_DOMAIN_MAP_TREE:
428 mutex_lock(&revmap_trees_mutex);
429 radix_tree_delete(&domain->revmap_data.tree, hwirq);
430 mutex_unlock(&revmap_trees_mutex);
431 break;
432 }
433 }
434}
435
436int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
437 irq_hw_number_t hwirq_base, int count)
438{
439 unsigned int virq = irq_base;
440 irq_hw_number_t hwirq = hwirq_base;
441 int i, ret;
442
443 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
444 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
445
446 for (i = 0; i < count; i++) {
447 struct irq_data *irq_data = irq_get_irq_data(virq + i);
448
449 if (WARN(!irq_data, "error: irq_desc not allocated; "
450 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
451 return -EINVAL;
452 if (WARN(irq_data->domain, "error: irq_desc already associated; "
453 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
454 return -EINVAL;
455 };
456
457 for (i = 0; i < count; i++, virq++, hwirq++) {
458 struct irq_data *irq_data = irq_get_irq_data(virq);
459
460 irq_data->hwirq = hwirq;
461 irq_data->domain = domain;
462 if (domain->ops->map) {
463 ret = domain->ops->map(domain, virq, hwirq);
464 if (ret != 0) {
465 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
466 virq, hwirq, ret);
467 WARN_ON(1);
468 irq_data->domain = NULL;
469 irq_data->hwirq = 0;
470 goto err_unmap;
471 }
472 }
473
474 switch (domain->revmap_type) {
475 case IRQ_DOMAIN_MAP_LINEAR:
476 if (hwirq < domain->revmap_data.linear.size)
477 domain->revmap_data.linear.revmap[hwirq] = virq;
478 break;
479 case IRQ_DOMAIN_MAP_TREE:
480 mutex_lock(&revmap_trees_mutex);
481 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
482 mutex_unlock(&revmap_trees_mutex);
483 break;
484 }
485
486 irq_clear_status_flags(virq, IRQ_NOREQUEST);
487 }
488
489 return 0;
490
491 err_unmap:
492 irq_domain_disassociate_many(domain, irq_base, i);
493 return -EINVAL;
494}
495EXPORT_SYMBOL_GPL(irq_domain_associate_many);
496
497/**
498 * irq_create_direct_mapping() - Allocate an irq for direct mapping
499 * @domain: domain to allocate the irq for or NULL for default domain
500 *
501 * This routine is used for irq controllers which can choose the hardware
502 * interrupt numbers they generate. In such a case it's simplest to use
503 * the linux irq as the hardware interrupt number.
504 */
505unsigned int irq_create_direct_mapping(struct irq_domain *domain)
506{
507 unsigned int virq;
508
509 if (domain == NULL)
510 domain = irq_default_domain;
511
512 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
513 return 0;
514
515 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
516 if (!virq) {
517 pr_debug("create_direct virq allocation failed\n");
518 return 0;
519 }
520 if (virq >= domain->revmap_data.nomap.max_irq) {
521 pr_err("ERROR: no free irqs available below %i maximum\n",
522 domain->revmap_data.nomap.max_irq);
523 irq_free_desc(virq);
524 return 0;
525 }
526 pr_debug("create_direct obtained virq %d\n", virq);
527 62
528 if (irq_domain_associate(domain, virq, virq)) { 63 /* Clear the irq_domain assignments */
529 irq_free_desc(virq); 64 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
530 return 0; 65 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
66 d->domain = NULL;
531 } 67 }
532
533 return virq;
534}
535EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
536
537/**
538 * irq_create_mapping() - Map a hardware interrupt into linux irq space
539 * @domain: domain owning this hardware interrupt or NULL for default domain
540 * @hwirq: hardware irq number in that domain space
541 *
542 * Only one mapping per hardware interrupt is permitted. Returns a linux
543 * irq number.
544 * If the sense/trigger is to be specified, set_irq_type() should be called
545 * on the number returned from that call.
546 */
547unsigned int irq_create_mapping(struct irq_domain *domain,
548 irq_hw_number_t hwirq)
549{
550 unsigned int hint;
551 int virq;
552
553 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
554
555 /* Look for default domain if nececssary */
556 if (domain == NULL)
557 domain = irq_default_domain;
558 if (domain == NULL) {
559 pr_warning("irq_create_mapping called for"
560 " NULL domain, hwirq=%lx\n", hwirq);
561 WARN_ON(1);
562 return 0;
563 }
564 pr_debug("-> using domain @%p\n", domain);
565
566 /* Check if mapping already exists */
567 virq = irq_find_mapping(domain, hwirq);
568 if (virq) {
569 pr_debug("-> existing mapping on virq %d\n", virq);
570 return virq;
571 }
572
573 /* Get a virtual interrupt number */
574 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
575 return irq_domain_legacy_revmap(domain, hwirq);
576
577 /* Allocate a virtual interrupt number */
578 hint = hwirq % nr_irqs;
579 if (hint == 0)
580 hint++;
581 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
582 if (virq <= 0)
583 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
584 if (virq <= 0) {
585 pr_debug("-> virq allocation failed\n");
586 return 0;
587 }
588
589 if (irq_domain_associate(domain, virq, hwirq)) {
590 irq_free_desc(virq);
591 return 0;
592 }
593
594 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
595 hwirq, of_node_full_name(domain->of_node), virq);
596
597 return virq;
598} 68}
599EXPORT_SYMBOL_GPL(irq_create_mapping);
600 69
70#if defined(CONFIG_OF_IRQ)
601/** 71/**
602 * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs 72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
603 * @domain: domain owning the interrupt range
604 * @irq_base: beginning of linux IRQ range
605 * @hwirq_base: beginning of hardware IRQ range
606 * @count: Number of interrupts to map
607 * 73 *
608 * This routine is used for allocating and mapping a range of hardware 74 * Used by the device tree interrupt mapping code to translate a device tree
609 * irqs to linux irqs where the linux irq numbers are at pre-defined 75 * interrupt specifier to a valid linux irq number. Returns either a valid
610 * locations. For use by controllers that already have static mappings 76 * linux IRQ number or 0.
611 * to insert in to the domain.
612 * 77 *
613 * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time 78 * When the caller no longer need the irq number returned by this function it
614 * domain insertion. 79 * should arrange to call irq_dispose_mapping().
615 *
616 * 0 is returned upon success, while any failure to establish a static
617 * mapping is treated as an error.
618 */ 80 */
619int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
620 irq_hw_number_t hwirq_base, int count)
621{
622 int ret;
623
624 ret = irq_alloc_descs(irq_base, irq_base, count,
625 of_node_to_nid(domain->of_node));
626 if (unlikely(ret < 0))
627 return ret;
628
629 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
630 if (unlikely(ret < 0)) {
631 irq_free_descs(irq_base, count);
632 return ret;
633 }
634
635 return 0;
636}
637EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
638
639unsigned int irq_create_of_mapping(struct device_node *controller, 81unsigned int irq_create_of_mapping(struct device_node *controller,
640 const u32 *intspec, unsigned int intsize) 82 const u32 *intspec, unsigned int intsize)
641{ 83{
642 struct irq_domain *domain; 84 struct irq_domain *domain;
643 irq_hw_number_t hwirq; 85 unsigned long hwirq;
644 unsigned int type = IRQ_TYPE_NONE; 86 unsigned int irq, type;
645 unsigned int virq; 87 int rc = -EINVAL;
646 88
647 domain = controller ? irq_find_host(controller) : irq_default_domain; 89 /* Find a domain which can translate the irq spec */
648 if (!domain) { 90 mutex_lock(&irq_domain_mutex);
649#ifdef CONFIG_MIPS 91 list_for_each_entry(domain, &irq_domain_list, list) {
650 /* 92 if (!domain->ops->dt_translate)
651 * Workaround to avoid breaking interrupt controller drivers 93 continue;
652 * that don't yet register an irq_domain. This is temporary 94 rc = domain->ops->dt_translate(domain, controller,
653 * code. ~~~gcl, Feb 24, 2012 95 intspec, intsize, &hwirq, &type);
654 * 96 if (rc == 0)
655 * Scheduled for removal in Linux v3.6. That should be enough 97 break;
656 * time.
657 */
658 if (intsize > 0)
659 return intspec[0];
660#endif
661 pr_warning("no irq domain found for %s !\n",
662 of_node_full_name(controller));
663 return 0;
664 }
665
666 /* If domain has no translation, then we assume interrupt line */
667 if (domain->ops->xlate == NULL)
668 hwirq = intspec[0];
669 else {
670 if (domain->ops->xlate(domain, controller, intspec, intsize,
671 &hwirq, &type))
672 return 0;
673 } 98 }
99 mutex_unlock(&irq_domain_mutex);
674 100
675 /* Create mapping */ 101 if (rc != 0)
676 virq = irq_create_mapping(domain, hwirq);
677 if (!virq)
678 return virq;
679
680 /* Set type if specified and different than the current one */
681 if (type != IRQ_TYPE_NONE &&
682 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
683 irq_set_irq_type(virq, type);
684 return virq;
685}
686EXPORT_SYMBOL_GPL(irq_create_of_mapping);
687
688/**
689 * irq_dispose_mapping() - Unmap an interrupt
690 * @virq: linux irq number of the interrupt to unmap
691 */
692void irq_dispose_mapping(unsigned int virq)
693{
694 struct irq_data *irq_data = irq_get_irq_data(virq);
695 struct irq_domain *domain;
696
697 if (!virq || !irq_data)
698 return;
699
700 domain = irq_data->domain;
701 if (WARN_ON(domain == NULL))
702 return;
703
704 /* Never unmap legacy interrupts */
705 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
706 return;
707
708 irq_domain_disassociate_many(domain, virq, 1);
709 irq_free_desc(virq);
710}
711EXPORT_SYMBOL_GPL(irq_dispose_mapping);
712
713/**
714 * irq_find_mapping() - Find a linux irq from an hw irq number.
715 * @domain: domain owning this hardware interrupt
716 * @hwirq: hardware irq number in that domain space
717 */
718unsigned int irq_find_mapping(struct irq_domain *domain,
719 irq_hw_number_t hwirq)
720{
721 struct irq_data *data;
722
723 /* Look for default domain if nececssary */
724 if (domain == NULL)
725 domain = irq_default_domain;
726 if (domain == NULL)
727 return 0; 102 return 0;
728 103
729 switch (domain->revmap_type) { 104 irq = irq_domain_to_irq(domain, hwirq);
730 case IRQ_DOMAIN_MAP_LEGACY: 105 if (type != IRQ_TYPE_NONE)
731 return irq_domain_legacy_revmap(domain, hwirq); 106 irq_set_irq_type(irq, type);
732 case IRQ_DOMAIN_MAP_LINEAR: 107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
733 return irq_linear_revmap(domain, hwirq); 108 controller->full_name, (int)hwirq, irq, type);
734 case IRQ_DOMAIN_MAP_TREE: 109 return irq;
735 rcu_read_lock();
736 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
737 rcu_read_unlock();
738 if (data)
739 return data->irq;
740 break;
741 case IRQ_DOMAIN_MAP_NOMAP:
742 data = irq_get_irq_data(hwirq);
743 if (data && (data->domain == domain) && (data->hwirq == hwirq))
744 return hwirq;
745 break;
746 }
747
748 return 0;
749} 110}
750EXPORT_SYMBOL_GPL(irq_find_mapping); 111EXPORT_SYMBOL_GPL(irq_create_of_mapping);
751 112
752/** 113/**
753 * irq_linear_revmap() - Find a linux irq from a hw irq number. 114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
754 * @domain: domain owning this hardware interrupt 115 * @irq: linux irq number to be discarded
755 * @hwirq: hardware irq number in that domain space
756 * 116 *
757 * This is a fast path that can be called directly by irq controller code to 117 * Calling this function indicates the caller no longer needs a reference to
758 * save a handful of instructions. 118 * the linux irq number returned by a prior call to irq_create_of_mapping().
759 */ 119 */
760unsigned int irq_linear_revmap(struct irq_domain *domain, 120void irq_dispose_mapping(unsigned int irq)
761 irq_hw_number_t hwirq)
762{ 121{
763 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); 122 /*
764 123 * nothing yet; will be filled when support for dynamic allocation of
765 /* Check revmap bounds; complain if exceeded */ 124 * irq_descs is added to irq_domain
766 if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) 125 */
767 return 0;
768
769 return domain->revmap_data.linear.revmap[hwirq];
770} 126}
771EXPORT_SYMBOL_GPL(irq_linear_revmap); 127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
772 128
773#ifdef CONFIG_IRQ_DOMAIN_DEBUG 129int irq_domain_simple_dt_translate(struct irq_domain *d,
774static int virq_debug_show(struct seq_file *m, void *private) 130 struct device_node *controller,
131 const u32 *intspec, unsigned int intsize,
132 unsigned long *out_hwirq, unsigned int *out_type)
775{ 133{
776 unsigned long flags; 134 if (d->of_node != controller)
777 struct irq_desc *desc; 135 return -EINVAL;
778 const char *p; 136 if (intsize < 1)
779 static const char none[] = "none"; 137 return -EINVAL;
780 void *data;
781 int i;
782
783 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq",
784 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
785 "domain name");
786
787 for (i = 1; i < nr_irqs; i++) {
788 desc = irq_to_desc(i);
789 if (!desc)
790 continue;
791
792 raw_spin_lock_irqsave(&desc->lock, flags);
793
794 if (desc->action && desc->action->handler) {
795 struct irq_chip *chip;
796
797 seq_printf(m, "%5d ", i);
798 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
799
800 chip = irq_desc_get_chip(desc);
801 if (chip && chip->name)
802 p = chip->name;
803 else
804 p = none;
805 seq_printf(m, "%-15s ", p);
806
807 data = irq_desc_get_chip_data(desc);
808 seq_printf(m, data ? "0x%p " : " %p ", data);
809
810 if (desc->irq_data.domain)
811 p = of_node_full_name(desc->irq_data.domain->of_node);
812 else
813 p = none;
814 seq_printf(m, "%s\n", p);
815 }
816
817 raw_spin_unlock_irqrestore(&desc->lock, flags);
818 }
819 138
139 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE;
141 if (intsize > 1)
142 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
820 return 0; 143 return 0;
821} 144}
822 145
823static int virq_debug_open(struct inode *inode, struct file *file) 146struct irq_domain_ops irq_domain_simple_ops = {
824{ 147 .dt_translate = irq_domain_simple_dt_translate,
825 return single_open(file, virq_debug_show, inode->i_private);
826}
827
828static const struct file_operations virq_debug_fops = {
829 .open = virq_debug_open,
830 .read = seq_read,
831 .llseek = seq_lseek,
832 .release = single_release,
833}; 148};
834 149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
835static int __init irq_debugfs_init(void)
836{
837 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
838 NULL, &virq_debug_fops) == NULL)
839 return -ENOMEM;
840
841 return 0;
842}
843__initcall(irq_debugfs_init);
844#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
845 150
846/** 151/**
847 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings 152 * irq_domain_create_simple() - Set up a 'simple' translation range
848 *
849 * Device Tree IRQ specifier translation function which works with one cell
850 * bindings where the cell value maps directly to the hwirq number.
851 */ 153 */
852int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, 154void irq_domain_add_simple(struct device_node *controller, int irq_base)
853 const u32 *intspec, unsigned int intsize,
854 unsigned long *out_hwirq, unsigned int *out_type)
855{ 155{
856 if (WARN_ON(intsize < 1)) 156 struct irq_domain *domain;
857 return -EINVAL;
858 *out_hwirq = intspec[0];
859 *out_type = IRQ_TYPE_NONE;
860 return 0;
861}
862EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
863 157
864/** 158 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
865 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings 159 if (!domain) {
866 * 160 WARN_ON(1);
867 * Device Tree IRQ specifier translation function which works with two cell 161 return;
868 * bindings where the cell values map directly to the hwirq number 162 }
869 * and linux irq flags.
870 */
871int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
872 const u32 *intspec, unsigned int intsize,
873 irq_hw_number_t *out_hwirq, unsigned int *out_type)
874{
875 if (WARN_ON(intsize < 2))
876 return -EINVAL;
877 *out_hwirq = intspec[0];
878 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
879 return 0;
880}
881EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
882 163
883/** 164 domain->irq_base = irq_base;
884 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings 165 domain->of_node = of_node_get(controller);
885 * 166 domain->ops = &irq_domain_simple_ops;
886 * Device Tree IRQ specifier translation function which works with either one 167 irq_domain_add(domain);
887 * or two cell bindings where the cell values map directly to the hwirq number
888 * and linux irq flags.
889 *
890 * Note: don't use this function unless your interrupt controller explicitly
891 * supports both one and two cell bindings. For the majority of controllers
892 * the _onecell() or _twocell() variants above should be used.
893 */
894int irq_domain_xlate_onetwocell(struct irq_domain *d,
895 struct device_node *ctrlr,
896 const u32 *intspec, unsigned int intsize,
897 unsigned long *out_hwirq, unsigned int *out_type)
898{
899 if (WARN_ON(intsize < 1))
900 return -EINVAL;
901 *out_hwirq = intspec[0];
902 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
903 return 0;
904} 168}
905EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 169EXPORT_SYMBOL_GPL(irq_domain_add_simple);
906
907const struct irq_domain_ops irq_domain_simple_ops = {
908 .xlate = irq_domain_xlate_onetwocell,
909};
910EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
911 170
912#ifdef CONFIG_OF_IRQ
913void irq_domain_generate_simple(const struct of_device_id *match, 171void irq_domain_generate_simple(const struct of_device_id *match,
914 u64 phys_base, unsigned int irq_start) 172 u64 phys_base, unsigned int irq_start)
915{ 173{
916 struct device_node *node; 174 struct device_node *node;
917 pr_debug("looking for phys_base=%llx, irq_start=%i\n", 175 pr_info("looking for phys_base=%llx, irq_start=%i\n",
918 (unsigned long long) phys_base, (int) irq_start); 176 (unsigned long long) phys_base, (int) irq_start);
919 node = of_find_matching_node_by_address(NULL, match, phys_base); 177 node = of_find_matching_node_by_address(NULL, match, phys_base);
920 if (node) 178 if (node)
921 irq_domain_add_legacy(node, 32, irq_start, 0, 179 irq_domain_add_simple(node, irq_start);
922 &irq_domain_simple_ops, NULL); 180 else
181 pr_info("no node found\n");
923} 182}
924EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 183EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
925#endif 184#endif /* CONFIG_OF_IRQ */
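
Both sides of the irqdomain.c hunk above implement the same device-tree translation convention: cell 0 of the interrupt specifier is the hardware IRQ number, and an optional cell 1 carries the trigger type masked with IRQ_TYPE_SENSE_MASK. A minimal userspace sketch of that convention (constants mirrored from the kernel headers; this is an illustration, not the kernel implementation):

    #include <stdio.h>

    /* mirrored from include/linux/irq.h; illustration only */
    #define IRQ_TYPE_NONE        0x0
    #define IRQ_TYPE_SENSE_MASK  0xf

    /* one- or two-cell translation, as in irq_domain_xlate_onetwocell() /
     * irq_domain_simple_dt_translate() in the hunk above */
    static int xlate_onetwocell(const unsigned int *intspec, unsigned int intsize,
                                unsigned long *out_hwirq, unsigned int *out_type)
    {
            if (intsize < 1)
                    return -1;              /* the kernel returns -EINVAL */
            *out_hwirq = intspec[0];
            *out_type = (intsize > 1) ? (intspec[1] & IRQ_TYPE_SENSE_MASK)
                                      : IRQ_TYPE_NONE;
            return 0;
    }

    int main(void)
    {
            unsigned int spec[] = { 29, 0x4 };      /* hwirq 29, level-high sense */
            unsigned long hwirq;
            unsigned int type;

            if (!xlate_onetwocell(spec, 2, &hwirq, &type))
                    printf("hwirq=%lu type=0x%x\n", hwirq, type);
            return 0;
    }
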
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa47..d6c4adc2804 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -7,8 +7,6 @@
7 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
8 */ 8 */
9 9
10#define pr_fmt(fmt) "genirq: " fmt
11
12#include <linux/irq.h> 10#include <linux/irq.h>
13#include <linux/kthread.h> 11#include <linux/kthread.h>
14#include <linux/module.h> 12#include <linux/module.h>
@@ -16,7 +14,6 @@
16#include <linux/interrupt.h> 14#include <linux/interrupt.h>
17#include <linux/slab.h> 15#include <linux/slab.h>
18#include <linux/sched.h> 16#include <linux/sched.h>
19#include <linux/task_work.h>
20 17
21#include "internals.h" 18#include "internals.h"
22 19
@@ -142,25 +139,6 @@ static inline void
142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } 139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
143#endif 140#endif
144 141
145int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
146 bool force)
147{
148 struct irq_desc *desc = irq_data_to_desc(data);
149 struct irq_chip *chip = irq_data_get_irq_chip(data);
150 int ret;
151
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160
161 return ret;
162}
163
164int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) 142int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
165{ 143{
166 struct irq_chip *chip = irq_data_get_irq_chip(data); 144 struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -171,7 +149,14 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
171 return -EINVAL; 149 return -EINVAL;
172 150
173 if (irq_can_move_pcntxt(data)) { 151 if (irq_can_move_pcntxt(data)) {
174 ret = irq_do_set_affinity(data, mask, false); 152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
175 } else { 160 } else {
176 irqd_set_move_pending(data); 161 irqd_set_move_pending(data);
177 irq_copy_pending(desc, mask); 162 irq_copy_pending(desc, mask);
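
The affinity hunks above (and the setup_affinity() and irq_move_masked_irq() hunks further down) rely on the same deliberate switch fall-through: IRQ_SET_MASK_OK copies the new mask into irq_data and then falls through to the IRQ_SET_MASK_OK_NOCOPY case, which only propagates the mask to the handler thread. A standalone sketch of that pattern (plain C with hypothetical stand-in types, not kernel code):

    #include <stdio.h>

    enum { IRQ_SET_MASK_OK, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_ERR };

    struct fake_irq_data { unsigned long affinity; };

    static void set_thread_affinity(struct fake_irq_data *d)
    {
            printf("thread affinity follows 0x%lx\n", d->affinity);
    }

    static int do_set_affinity(struct fake_irq_data *d, unsigned long mask,
                               int chip_ret)
    {
            int ret = chip_ret;     /* what chip->irq_set_affinity() reported */

            switch (ret) {
            case IRQ_SET_MASK_OK:
                    d->affinity = mask;     /* core keeps the copy */
                    /* fall through */
            case IRQ_SET_MASK_OK_NOCOPY:    /* chip already stored it */
                    set_thread_affinity(d);
                    ret = 0;
            }
            return ret;
    }

    int main(void)
    {
            struct fake_irq_data d = { 0x1 };

            do_set_affinity(&d, 0xf, IRQ_SET_MASK_OK);
            do_set_affinity(&d, 0x3, IRQ_SET_MASK_OK_NOCOPY);
            return 0;
    }

The fall-through keeps the thread-affinity update in one place regardless of whether the chip or the core owns the cached mask, which is why the newer kernel factored it into irq_do_set_affinity().
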
@@ -210,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
210int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
211{ 196{
212 unsigned long flags; 197 unsigned long flags;
213 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
214 199
215 if (!desc) 200 if (!desc)
216 return -EINVAL; 201 return -EINVAL;
@@ -295,8 +280,9 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
295static int 280static int
296setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 281setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
297{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc);
298 struct cpumask *set = irq_default_affinity; 284 struct cpumask *set = irq_default_affinity;
299 int node = desc->irq_data.node; 285 int ret;
300 286
301 /* Excludes PER_CPU and NO_BALANCE interrupts */ 287 /* Excludes PER_CPU and NO_BALANCE interrupts */
302 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
@@ -315,14 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
315 } 301 }
316 302
317 cpumask_and(mask, cpu_online_mask, set); 303 cpumask_and(mask, cpu_online_mask, set);
318 if (node != NUMA_NO_NODE) { 304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
319 const struct cpumask *nodemask = cpumask_of_node(node); 305 switch (ret) {
320 306 case IRQ_SET_MASK_OK:
321 /* make sure at least one of the cpus in nodemask is online */ 307 cpumask_copy(desc->irq_data.affinity, mask);
322 if (cpumask_intersects(mask, nodemask)) 308 case IRQ_SET_MASK_OK_NOCOPY:
323 cpumask_and(mask, mask, nodemask); 309 irq_set_thread_affinity(desc);
324 } 310 }
325 irq_do_set_affinity(&desc->irq_data, mask, false);
326 return 0; 311 return 0;
327} 312}
328#else 313#else
@@ -371,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
371static int __disable_irq_nosync(unsigned int irq) 356static int __disable_irq_nosync(unsigned int irq)
372{ 357{
373 unsigned long flags; 358 unsigned long flags;
374 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
375 360
376 if (!desc) 361 if (!desc)
377 return -EINVAL; 362 return -EINVAL;
@@ -463,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
463void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
464{ 449{
465 unsigned long flags; 450 unsigned long flags;
466 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
467 452
468 if (!desc) 453 if (!desc)
469 return; 454 return;
@@ -482,9 +467,6 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
482 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
483 int ret = -ENXIO; 468 int ret = -ENXIO;
484 469
485 if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
486 return 0;
487
488 if (desc->irq_data.chip->irq_set_wake) 470 if (desc->irq_data.chip->irq_set_wake)
489 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); 471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
490 472
@@ -506,7 +488,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
506int irq_set_irq_wake(unsigned int irq, unsigned int on) 488int irq_set_irq_wake(unsigned int irq, unsigned int on)
507{ 489{
508 unsigned long flags; 490 unsigned long flags;
509 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
510 int ret = 0; 492 int ret = 0;
511 493
512 if (!desc) 494 if (!desc)
@@ -547,7 +529,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
547int can_request_irq(unsigned int irq, unsigned long irqflags) 529int can_request_irq(unsigned int irq, unsigned long irqflags)
548{ 530{
549 unsigned long flags; 531 unsigned long flags;
550 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); 532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
551 int canrequest = 0; 533 int canrequest = 0;
552 534
553 if (!desc) 535 if (!desc)
@@ -574,7 +556,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
574 * flow-types? 556 * flow-types?
575 */ 557 */
576 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 558 pr_debug("No set_type function for IRQ %d (%s)\n", irq,
577 chip ? (chip->name ? : "unknown") : "unknown"); 559 chip ? (chip->name ? : "unknown") : "unknown");
578 return 0; 560 return 0;
579 } 561 }
580 562
@@ -608,7 +590,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
608 ret = 0; 590 ret = 0;
609 break; 591 break;
610 default: 592 default:
611 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", 593 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
612 flags, irq, chip->irq_set_type); 594 flags, irq, chip->irq_set_type);
613 } 595 }
614 if (unmask) 596 if (unmask)
@@ -616,22 +598,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 598 return ret;
617} 599}
618 600
619#ifdef CONFIG_HARDIRQS_SW_RESEND
620int irq_set_parent(int irq, int parent_irq)
621{
622 unsigned long flags;
623 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
624
625 if (!desc)
626 return -EINVAL;
627
628 desc->parent_irq = parent_irq;
629
630 irq_put_desc_unlock(desc, flags);
631 return 0;
632}
633#endif
634
635/* 601/*
636 * Default primary interrupt handler for threaded interrupts. Is 602 * Default primary interrupt handler for threaded interrupts. Is
637 * assigned as primary handler when request_threaded_irq is called 603 * assigned as primary handler when request_threaded_irq is called
@@ -676,7 +642,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
676 * is marked MASKED. 642 * is marked MASKED.
677 */ 643 */
678static void irq_finalize_oneshot(struct irq_desc *desc, 644static void irq_finalize_oneshot(struct irq_desc *desc,
679 struct irqaction *action) 645 struct irqaction *action, bool force)
680{ 646{
681 if (!(desc->istate & IRQS_ONESHOT)) 647 if (!(desc->istate & IRQS_ONESHOT))
682 return; 648 return;
@@ -710,7 +676,7 @@ again:
710 * we would clear the threads_oneshot bit of this thread which 676 * we would clear the threads_oneshot bit of this thread which
711 * was just set. 677 * was just set.
712 */ 678 */
713 if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 679 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
714 goto out_unlock; 680 goto out_unlock;
715 681
716 desc->threads_oneshot &= ~action->thread_mask; 682 desc->threads_oneshot &= ~action->thread_mask;
@@ -732,7 +698,6 @@ static void
732irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 698irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
733{ 699{
734 cpumask_var_t mask; 700 cpumask_var_t mask;
735 bool valid = true;
736 701
737 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 702 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
738 return; 703 return;
@@ -747,18 +712,10 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
747 } 712 }
748 713
749 raw_spin_lock_irq(&desc->lock); 714 raw_spin_lock_irq(&desc->lock);
750 /* 715 cpumask_copy(mask, desc->irq_data.affinity);
751 * This code is triggered unconditionally. Check the affinity
752 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
753 */
754 if (desc->irq_data.affinity)
755 cpumask_copy(mask, desc->irq_data.affinity);
756 else
757 valid = false;
758 raw_spin_unlock_irq(&desc->lock); 716 raw_spin_unlock_irq(&desc->lock);
759 717
760 if (valid) 718 set_cpus_allowed_ptr(current, mask);
761 set_cpus_allowed_ptr(current, mask);
762 free_cpumask_var(mask); 719 free_cpumask_var(mask);
763} 720}
764#else 721#else
@@ -779,7 +736,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
779 736
780 local_bh_disable(); 737 local_bh_disable();
781 ret = action->thread_fn(action->irq, action->dev_id); 738 ret = action->thread_fn(action->irq, action->dev_id);
782 irq_finalize_oneshot(desc, action); 739 irq_finalize_oneshot(desc, action, false);
783 local_bh_enable(); 740 local_bh_enable();
784 return ret; 741 return ret;
785} 742}
@@ -795,50 +752,15 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
795 irqreturn_t ret; 752 irqreturn_t ret;
796 753
797 ret = action->thread_fn(action->irq, action->dev_id); 754 ret = action->thread_fn(action->irq, action->dev_id);
798 irq_finalize_oneshot(desc, action); 755 irq_finalize_oneshot(desc, action, false);
799 return ret; 756 return ret;
800} 757}
801 758
802static void wake_threads_waitq(struct irq_desc *desc)
803{
804 if (atomic_dec_and_test(&desc->threads_active) &&
805 waitqueue_active(&desc->wait_for_threads))
806 wake_up(&desc->wait_for_threads);
807}
808
809static void irq_thread_dtor(struct callback_head *unused)
810{
811 struct task_struct *tsk = current;
812 struct irq_desc *desc;
813 struct irqaction *action;
814
815 if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
816 return;
817
818 action = kthread_data(tsk);
819
820 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
821 tsk->comm, tsk->pid, action->irq);
822
823
824 desc = irq_to_desc(action->irq);
825 /*
826 * If IRQTF_RUNTHREAD is set, we need to decrement
827 * desc->threads_active and wake possible waiters.
828 */
829 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
830 wake_threads_waitq(desc);
831
832 /* Prevent a stale desc->threads_oneshot */
833 irq_finalize_oneshot(desc, action);
834}
835
836/* 759/*
837 * Interrupt handler thread 760 * Interrupt handler thread
838 */ 761 */
839static int irq_thread(void *data) 762static int irq_thread(void *data)
840{ 763{
841 struct callback_head on_exit_work;
842 static const struct sched_param param = { 764 static const struct sched_param param = {
843 .sched_priority = MAX_USER_RT_PRIO/2, 765 .sched_priority = MAX_USER_RT_PRIO/2,
844 }; 766 };
@@ -846,45 +768,90 @@ static int irq_thread(void *data)
846 struct irq_desc *desc = irq_to_desc(action->irq); 768 struct irq_desc *desc = irq_to_desc(action->irq);
847 irqreturn_t (*handler_fn)(struct irq_desc *desc, 769 irqreturn_t (*handler_fn)(struct irq_desc *desc,
848 struct irqaction *action); 770 struct irqaction *action);
771 int wake;
849 772
850 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, 773 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
851 &action->thread_flags)) 774 &action->thread_flags))
852 handler_fn = irq_forced_thread_fn; 775 handler_fn = irq_forced_thread_fn;
853 else 776 else
854 handler_fn = irq_thread_fn; 777 handler_fn = irq_thread_fn;
855 778
856 sched_setscheduler(current, SCHED_FIFO, &param); 779 sched_setscheduler(current, SCHED_FIFO, &param);
857 780 current->irqaction = action;
858 init_task_work(&on_exit_work, irq_thread_dtor);
859 task_work_add(current, &on_exit_work, false);
860
861 irq_thread_check_affinity(desc, action);
862 781
863 while (!irq_wait_for_interrupt(action)) { 782 while (!irq_wait_for_interrupt(action)) {
864 irqreturn_t action_ret;
865 783
866 irq_thread_check_affinity(desc, action); 784 irq_thread_check_affinity(desc, action);
867 785
868 action_ret = handler_fn(desc, action); 786 atomic_inc(&desc->threads_active);
869 if (!noirqdebug)
870 note_interrupt(action->irq, desc, action_ret);
871 787
872 wake_threads_waitq(desc); 788 raw_spin_lock_irq(&desc->lock);
789 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
790 /*
791 * CHECKME: We might need a dedicated
792 * IRQ_THREAD_PENDING flag here, which
793 * retriggers the thread in check_irq_resend()
794 * but AFAICT IRQS_PENDING should be fine as it
795 * retriggers the interrupt itself --- tglx
796 */
797 desc->istate |= IRQS_PENDING;
798 raw_spin_unlock_irq(&desc->lock);
799 } else {
800 irqreturn_t action_ret;
801
802 raw_spin_unlock_irq(&desc->lock);
803 action_ret = handler_fn(desc, action);
804 if (!noirqdebug)
805 note_interrupt(action->irq, desc, action_ret);
806 }
807
808 wake = atomic_dec_and_test(&desc->threads_active);
809
810 if (wake && waitqueue_active(&desc->wait_for_threads))
811 wake_up(&desc->wait_for_threads);
873 } 812 }
874 813
814 /* Prevent a stale desc->threads_oneshot */
815 irq_finalize_oneshot(desc, action, true);
816
875 /* 817 /*
876 * This is the regular exit path. __free_irq() is stopping the 818 * Clear irqaction. Otherwise exit_irq_thread() would make
877 * thread via kthread_stop() after calling 819 * fuzz about an active irq thread going into nirvana.
878 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
879 * oneshot mask bit can be set. We cannot verify that as we
880 * cannot touch the oneshot mask at this point anymore as
881 * __setup_irq() might have given out currents thread_mask
882 * again.
883 */ 820 */
884 task_work_cancel(current, irq_thread_dtor); 821 current->irqaction = NULL;
885 return 0; 822 return 0;
886} 823}
887 824
825/*
826 * Called from do_exit()
827 */
828void exit_irq_thread(void)
829{
830 struct task_struct *tsk = current;
831 struct irq_desc *desc;
832
833 if (!tsk->irqaction)
834 return;
835
836 printk(KERN_ERR
837 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
838 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
839
840 desc = irq_to_desc(tsk->irqaction->irq);
841
842 /*
843 * Prevent a stale desc->threads_oneshot. Must be called
844 * before setting the IRQTF_DIED flag.
845 */
846 irq_finalize_oneshot(desc, tsk->irqaction, true);
847
848 /*
849 * Set the THREAD DIED flag to prevent further wakeups of the
850 * soon to be gone threaded handler.
851 */
852 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
853}
854
888static void irq_setup_forced_threading(struct irqaction *new) 855static void irq_setup_forced_threading(struct irqaction *new)
889{ 856{
890 if (!force_irqthreads) 857 if (!force_irqthreads)
@@ -909,6 +876,7 @@ static int
909__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) 876__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
910{ 877{
911 struct irqaction *old, **old_ptr; 878 struct irqaction *old, **old_ptr;
879 const char *old_name = NULL;
912 unsigned long flags, thread_mask = 0; 880 unsigned long flags, thread_mask = 0;
913 int ret, nested, shared = 0; 881 int ret, nested, shared = 0;
914 cpumask_var_t mask; 882 cpumask_var_t mask;
@@ -920,6 +888,22 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
920 return -ENOSYS; 888 return -ENOSYS;
921 if (!try_module_get(desc->owner)) 889 if (!try_module_get(desc->owner))
922 return -ENODEV; 890 return -ENODEV;
891 /*
892 * Some drivers like serial.c use request_irq() heavily,
893 * so we have to be careful not to interfere with a
894 * running system.
895 */
896 if (new->flags & IRQF_SAMPLE_RANDOM) {
897 /*
898 * This function might sleep, we want to call it first,
899 * outside of the atomic block.
900 * Yes, this might clear the entropy pool if the wrong
901 * driver is attempted to be loaded, without actually
902 * installing a new handler, but is this really a problem,
903 * only the sysadmin is able to do this.
904 */
905 rand_initialize_irq(irq);
906 }
923 907
924 /* 908 /*
925 * Check whether the interrupt nests into another interrupt 909 * Check whether the interrupt nests into another interrupt
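
IRQF_SAMPLE_RANDOM (right-hand side only; it was later removed upstream) is simply a request_irq() flag: a driver that treats its interrupt timing as an entropy source passes it along with the usual flags, and __setup_irq() calls rand_initialize_irq() before taking the descriptor lock because that call may sleep. A hedged sketch of such a request, with an invented device name, for the older tree shown on the right:

    #include <linux/interrupt.h>

    /* "foo" is an invented example device, not part of this patch */
    struct foo_dev { int irq; };

    static irqreturn_t foo_isr(int irq, void *dev_id)
    {
            /* acknowledge the device here */
            return IRQ_HANDLED;
    }

    static int foo_request(struct foo_dev *foo)
    {
            return request_irq(foo->irq, foo_isr,
                               IRQF_SHARED | IRQF_SAMPLE_RANDOM,
                               "foo", foo);
    }
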
@@ -963,16 +947,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
963 */ 947 */
964 get_task_struct(t); 948 get_task_struct(t);
965 new->thread = t; 949 new->thread = t;
966 /*
967 * Tell the thread to set its affinity. This is
968 * important for shared interrupt handlers as we do
969 * not invoke setup_affinity() for the secondary
970 * handlers as everything is already set up. Even for
971 * interrupts marked with IRQF_NO_BALANCE this is
972 * correct as we want the thread to move to the cpu(s)
973 * on which the requesting code placed the interrupt.
974 */
975 set_bit(IRQTF_AFFINITY, &new->thread_flags);
976 } 950 }
977 951
978 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 952 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -981,18 +955,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
981 } 955 }
982 956
983 /* 957 /*
984 * Drivers are often written to work w/o knowledge about the
985 * underlying irq chip implementation, so a request for a
986 * threaded irq without a primary hard irq context handler
987 * requires the ONESHOT flag to be set. Some irq chips like
988 * MSI based interrupts are per se one shot safe. Check the
989 * chip flags, so we can avoid the unmask dance at the end of
990 * the threaded handler for those.
991 */
992 if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
993 new->flags &= ~IRQF_ONESHOT;
994
995 /*
996 * The following block of code has to be executed atomically 958 * The following block of code has to be executed atomically
997 */ 959 */
998 raw_spin_lock_irqsave(&desc->lock, flags); 960 raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1008,8 +970,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1008 */ 970 */
1009 if (!((old->flags & new->flags) & IRQF_SHARED) || 971 if (!((old->flags & new->flags) & IRQF_SHARED) ||
1010 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || 972 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
1011 ((old->flags ^ new->flags) & IRQF_ONESHOT)) 973 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
974 old_name = old->name;
1012 goto mismatch; 975 goto mismatch;
976 }
1013 977
1014 /* All handlers must agree on per-cpuness */ 978 /* All handlers must agree on per-cpuness */
1015 if ((old->flags & IRQF_PERCPU) != 979 if ((old->flags & IRQF_PERCPU) !=
@@ -1018,11 +982,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1018 982
1019 /* add new interrupt at end of irq queue */ 983 /* add new interrupt at end of irq queue */
1020 do { 984 do {
1021 /*
1022 * Or all existing action->thread_mask bits,
1023 * so we can find the next zero bit for this
1024 * new action.
1025 */
1026 thread_mask |= old->thread_mask; 985 thread_mask |= old->thread_mask;
1027 old_ptr = &old->next; 986 old_ptr = &old->next;
1028 old = *old_ptr; 987 old = *old_ptr;
@@ -1031,63 +990,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1031 } 990 }
1032 991
1033 /* 992 /*
1034 * Setup the thread mask for this irqaction for ONESHOT. For 993 * Setup the thread mask for this irqaction. Unlikely to have
1035 * !ONESHOT irqs the thread mask is 0 so we can avoid a 994 * 32 resp 64 irqs sharing one line, but who knows.
1036 * conditional in irq_wake_thread().
1037 */ 995 */
1038 if (new->flags & IRQF_ONESHOT) { 996 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
1039 /* 997 ret = -EBUSY;
1040 * Unlikely to have 32 resp 64 irqs sharing one line,
1041 * but who knows.
1042 */
1043 if (thread_mask == ~0UL) {
1044 ret = -EBUSY;
1045 goto out_mask;
1046 }
1047 /*
1048 * The thread_mask for the action is or'ed to
1049 * desc->thread_active to indicate that the
1050 * IRQF_ONESHOT thread handler has been woken, but not
1051 * yet finished. The bit is cleared when a thread
1052 * completes. When all threads of a shared interrupt
1053 * line have completed desc->threads_active becomes
1054 * zero and the interrupt line is unmasked. See
1055 * handle.c:irq_wake_thread() for further information.
1056 *
1057 * If no thread is woken by primary (hard irq context)
1058 * interrupt handlers, then desc->threads_active is
1059 * also checked for zero to unmask the irq line in the
1060 * affected hard irq flow handlers
1061 * (handle_[fasteoi|level]_irq).
1062 *
1063 * The new action gets the first zero bit of
1064 * thread_mask assigned. See the loop above which or's
1065 * all existing action->thread_mask bits.
1066 */
1067 new->thread_mask = 1 << ffz(thread_mask);
1068
1069 } else if (new->handler == irq_default_primary_handler &&
1070 !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1071 /*
1072 * The interrupt was requested with handler = NULL, so
1073 * we use the default primary handler for it. But it
1074 * does not have the oneshot flag set. In combination
1075 * with level interrupts this is deadly, because the
1076 * default primary handler just wakes the thread, then
1077 * the irq lines is reenabled, but the device still
1078 * has the level irq asserted. Rinse and repeat....
1079 *
1080 * While this works for edge type interrupts, we play
1081 * it safe and reject unconditionally because we can't
1082 * say for sure which type this interrupt really
1083 * has. The type flags are unreliable as the
1084 * underlying chip implementation can override them.
1085 */
1086 pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1087 irq);
1088 ret = -EINVAL;
1089 goto out_mask; 998 goto out_mask;
1090 } 999 }
1000 new->thread_mask = 1 << ffz(thread_mask);
1091 1001
1092 if (!shared) { 1002 if (!shared) {
1093 init_waitqueue_head(&desc->wait_for_threads); 1003 init_waitqueue_head(&desc->wait_for_threads);
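
Both sides hand a ONESHOT action on a shared line the first free bit of the combined thread_mask: the loop earlier in the hunk ORs together the masks of the already-installed actions, ~0UL means all 32 (or 64) slots are taken and the request fails with -EBUSY, and otherwise ffz() picks the lowest zero bit for the new action. A small userspace rendering of that allocation, with ffz() open-coded for the sketch:

    #include <stdio.h>

    /* find first zero bit, as the kernel's ffz() does for a not-all-ones word */
    static unsigned int ffz_sketch(unsigned long word)
    {
            unsigned int bit = 0;

            while (word & 1UL) {
                    word >>= 1;
                    bit++;
            }
            return bit;
    }

    int main(void)
    {
            unsigned long thread_mask = 0;          /* OR of existing actions */
            unsigned long existing[] = { 1UL << 0, 1UL << 1, 1UL << 3 };
            unsigned int i;

            for (i = 0; i < 3; i++)
                    thread_mask |= existing[i];

            if (thread_mask == ~0UL) {
                    puts("line full: -EBUSY");
                    return 1;
            }
            /* new->thread_mask = 1 << ffz(thread_mask) in __setup_irq() */
            printf("new thread_mask = 0x%lx\n", 1UL << ffz_sketch(thread_mask));
            return 0;
    }
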
@@ -1114,7 +1024,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1114 desc->istate |= IRQS_ONESHOT; 1024 desc->istate |= IRQS_ONESHOT;
1115 1025
1116 if (irq_settings_can_autoenable(desc)) 1026 if (irq_settings_can_autoenable(desc))
1117 irq_startup(desc, true); 1027 irq_startup(desc);
1118 else 1028 else
1119 /* Undo nested disables: */ 1029 /* Undo nested disables: */
1120 desc->depth = 1; 1030 desc->depth = 1;
@@ -1134,7 +1044,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1134 1044
1135 if (nmsk != omsk) 1045 if (nmsk != omsk)
1136 /* hope the handler works with current trigger mode */ 1046 /* hope the handler works with current trigger mode */
1137 pr_warning("irq %d uses trigger mode %u; requested %u\n", 1047 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1138 irq, nmsk, omsk); 1048 irq, nmsk, omsk);
1139 } 1049 }
1140 1050
@@ -1171,13 +1081,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1171 return 0; 1081 return 0;
1172 1082
1173mismatch: 1083mismatch:
1174 if (!(new->flags & IRQF_PROBE_SHARED)) {
1175 pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
1176 irq, new->flags, new->name, old->flags, old->name);
1177#ifdef CONFIG_DEBUG_SHIRQ 1084#ifdef CONFIG_DEBUG_SHIRQ
1085 if (!(new->flags & IRQF_PROBE_SHARED)) {
1086 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
1087 if (old_name)
1088 printk(KERN_ERR "current handler: %s\n", old_name);
1178 dump_stack(); 1089 dump_stack();
1179#endif
1180 } 1090 }
1091#endif
1181 ret = -EBUSY; 1092 ret = -EBUSY;
1182 1093
1183out_mask: 1094out_mask:
@@ -1189,7 +1100,8 @@ out_thread:
1189 struct task_struct *t = new->thread; 1100 struct task_struct *t = new->thread;
1190 1101
1191 new->thread = NULL; 1102 new->thread = NULL;
1192 kthread_stop(t); 1103 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
1104 kthread_stop(t);
1193 put_task_struct(t); 1105 put_task_struct(t);
1194 } 1106 }
1195out_mput: 1107out_mput:
@@ -1209,8 +1121,6 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1209 int retval; 1121 int retval;
1210 struct irq_desc *desc = irq_to_desc(irq); 1122 struct irq_desc *desc = irq_to_desc(irq);
1211 1123
1212 if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1213 return -EINVAL;
1214 chip_bus_lock(desc); 1124 chip_bus_lock(desc);
1215 retval = __setup_irq(irq, desc, act); 1125 retval = __setup_irq(irq, desc, act);
1216 chip_bus_sync_unlock(desc); 1126 chip_bus_sync_unlock(desc);
@@ -1219,7 +1129,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1219} 1129}
1220EXPORT_SYMBOL_GPL(setup_irq); 1130EXPORT_SYMBOL_GPL(setup_irq);
1221 1131
1222/* 1132 /*
1223 * Internal function to unregister an irqaction - used to free 1133 * Internal function to unregister an irqaction - used to free
1224 * regular and special interrupts that are part of the architecture. 1134 * regular and special interrupts that are part of the architecture.
1225 */ 1135 */
@@ -1259,6 +1169,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1259 /* Found it - now remove it from the list of entries: */ 1169 /* Found it - now remove it from the list of entries: */
1260 *action_ptr = action->next; 1170 *action_ptr = action->next;
1261 1171
1172 /* Currently used only by UML, might disappear one day: */
1173#ifdef CONFIG_IRQ_RELEASE_METHOD
1174 if (desc->irq_data.chip->release)
1175 desc->irq_data.chip->release(irq, dev_id);
1176#endif
1177
1262 /* If this was the last handler, shut down the IRQ line: */ 1178 /* If this was the last handler, shut down the IRQ line: */
1263 if (!desc->action) 1179 if (!desc->action)
1264 irq_shutdown(desc); 1180 irq_shutdown(desc);
@@ -1293,7 +1209,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1293#endif 1209#endif
1294 1210
1295 if (action->thread) { 1211 if (action->thread) {
1296 kthread_stop(action->thread); 1212 if (!test_bit(IRQTF_DIED, &action->thread_flags))
1213 kthread_stop(action->thread);
1297 put_task_struct(action->thread); 1214 put_task_struct(action->thread);
1298 } 1215 }
1299 1216
@@ -1310,10 +1227,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1310 */ 1227 */
1311void remove_irq(unsigned int irq, struct irqaction *act) 1228void remove_irq(unsigned int irq, struct irqaction *act)
1312{ 1229{
1313 struct irq_desc *desc = irq_to_desc(irq); 1230 __free_irq(irq, act->dev_id);
1314
1315 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1316 __free_irq(irq, act->dev_id);
1317} 1231}
1318EXPORT_SYMBOL_GPL(remove_irq); 1232EXPORT_SYMBOL_GPL(remove_irq);
1319 1233
@@ -1335,7 +1249,7 @@ void free_irq(unsigned int irq, void *dev_id)
1335{ 1249{
1336 struct irq_desc *desc = irq_to_desc(irq); 1250 struct irq_desc *desc = irq_to_desc(irq);
1337 1251
1338 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) 1252 if (!desc)
1339 return; 1253 return;
1340 1254
1341#ifdef CONFIG_SMP 1255#ifdef CONFIG_SMP
@@ -1370,7 +1284,7 @@ EXPORT_SYMBOL(free_irq);
1370 * and to set up the interrupt handler in the right order. 1284 * and to set up the interrupt handler in the right order.
1371 * 1285 *
1372 * If you want to set up a threaded irq handler for your device 1286 * If you want to set up a threaded irq handler for your device
1373 * then you need to supply @handler and @thread_fn. @handler is 1287 * then you need to supply @handler and @thread_fn. @handler ist
1374 * still called in hard interrupt context and has to check 1288 * still called in hard interrupt context and has to check
1375 * whether the interrupt originates from the device. If yes it 1289 * whether the interrupt originates from the device. If yes it
1376 * needs to disable the interrupt on the device and return 1290 * needs to disable the interrupt on the device and return
@@ -1388,6 +1302,7 @@ EXPORT_SYMBOL(free_irq);
1388 * Flags: 1302 * Flags:
1389 * 1303 *
1390 * IRQF_SHARED Interrupt is shared 1304 * IRQF_SHARED Interrupt is shared
1305 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1391 * IRQF_TRIGGER_* Specify active edge(s) or level 1306 * IRQF_TRIGGER_* Specify active edge(s) or level
1392 * 1307 *
1393 */ 1308 */
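
As the comment block above describes, a split handler registers both a hard-IRQ part and a thread function: the primary handler checks and quiesces the device and returns IRQ_WAKE_THREAD, and the thread function does the sleeping work. A minimal driver-side sketch, assuming a kernel build environment and an invented "foo" device that is not part of this patch:

    #include <linux/interrupt.h>
    #include <linux/io.h>

    /* invented example device; register offset is hypothetical */
    struct foo_dev {
            void __iomem *regs;
    };

    #define FOO_IRQ_MASK    0x04

    static irqreturn_t foo_hardirq(int irq, void *dev_id)
    {
            struct foo_dev *foo = dev_id;

            /* quiesce the (hypothetical) device, then defer to the thread */
            writel(0, foo->regs + FOO_IRQ_MASK);
            return IRQ_WAKE_THREAD;
    }

    static irqreturn_t foo_thread_fn(int irq, void *dev_id)
    {
            struct foo_dev *foo = dev_id;

            /* sleeping work is fine here */
            writel(1, foo->regs + FOO_IRQ_MASK);    /* re-enable when done */
            return IRQ_HANDLED;
    }

    static int foo_request(struct foo_dev *foo, int irq)
    {
            return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
                                        0, "foo", foo);
    }
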
@@ -1412,8 +1327,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1412 if (!desc) 1327 if (!desc)
1413 return -EINVAL; 1328 return -EINVAL;
1414 1329
1415 if (!irq_settings_can_request(desc) || 1330 if (!irq_settings_can_request(desc))
1416 WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1417 return -EINVAL; 1331 return -EINVAL;
1418 1332
1419 if (!handler) { 1333 if (!handler) {
@@ -1498,194 +1412,3 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1498 return !ret ? IRQC_IS_HARDIRQ : ret; 1412 return !ret ? IRQC_IS_HARDIRQ : ret;
1499} 1413}
1500EXPORT_SYMBOL_GPL(request_any_context_irq); 1414EXPORT_SYMBOL_GPL(request_any_context_irq);
1501
1502void enable_percpu_irq(unsigned int irq, unsigned int type)
1503{
1504 unsigned int cpu = smp_processor_id();
1505 unsigned long flags;
1506 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1507
1508 if (!desc)
1509 return;
1510
1511 type &= IRQ_TYPE_SENSE_MASK;
1512 if (type != IRQ_TYPE_NONE) {
1513 int ret;
1514
1515 ret = __irq_set_trigger(desc, irq, type);
1516
1517 if (ret) {
1518 WARN(1, "failed to set type for IRQ%d\n", irq);
1519 goto out;
1520 }
1521 }
1522
1523 irq_percpu_enable(desc, cpu);
1524out:
1525 irq_put_desc_unlock(desc, flags);
1526}
1527
1528void disable_percpu_irq(unsigned int irq)
1529{
1530 unsigned int cpu = smp_processor_id();
1531 unsigned long flags;
1532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1533
1534 if (!desc)
1535 return;
1536
1537 irq_percpu_disable(desc, cpu);
1538 irq_put_desc_unlock(desc, flags);
1539}
1540
1541/*
1542 * Internal function to unregister a percpu irqaction.
1543 */
1544static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1545{
1546 struct irq_desc *desc = irq_to_desc(irq);
1547 struct irqaction *action;
1548 unsigned long flags;
1549
1550 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
1551
1552 if (!desc)
1553 return NULL;
1554
1555 raw_spin_lock_irqsave(&desc->lock, flags);
1556
1557 action = desc->action;
1558 if (!action || action->percpu_dev_id != dev_id) {
1559 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1560 goto bad;
1561 }
1562
1563 if (!cpumask_empty(desc->percpu_enabled)) {
1564 WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
1565 irq, cpumask_first(desc->percpu_enabled));
1566 goto bad;
1567 }
1568
1569 /* Found it - now remove it from the list of entries: */
1570 desc->action = NULL;
1571
1572 raw_spin_unlock_irqrestore(&desc->lock, flags);
1573
1574 unregister_handler_proc(irq, action);
1575
1576 module_put(desc->owner);
1577 return action;
1578
1579bad:
1580 raw_spin_unlock_irqrestore(&desc->lock, flags);
1581 return NULL;
1582}
1583
1584/**
1585 * remove_percpu_irq - free a per-cpu interrupt
1586 * @irq: Interrupt line to free
1587 * @act: irqaction for the interrupt
1588 *
1589 * Used to remove interrupts statically setup by the early boot process.
1590 */
1591void remove_percpu_irq(unsigned int irq, struct irqaction *act)
1592{
1593 struct irq_desc *desc = irq_to_desc(irq);
1594
1595 if (desc && irq_settings_is_per_cpu_devid(desc))
1596 __free_percpu_irq(irq, act->percpu_dev_id);
1597}
1598
1599/**
1600 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
1601 * @irq: Interrupt line to free
1602 * @dev_id: Device identity to free
1603 *
1604 * Remove a percpu interrupt handler. The handler is removed, but
1605 * the interrupt line is not disabled. This must be done on each
1606 * CPU before calling this function. The function does not return
1607 * until any executing interrupts for this IRQ have completed.
1608 *
1609 * This function must not be called from interrupt context.
1610 */
1611void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1612{
1613 struct irq_desc *desc = irq_to_desc(irq);
1614
1615 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1616 return;
1617
1618 chip_bus_lock(desc);
1619 kfree(__free_percpu_irq(irq, dev_id));
1620 chip_bus_sync_unlock(desc);
1621}
1622
1623/**
1624 * setup_percpu_irq - setup a per-cpu interrupt
1625 * @irq: Interrupt line to setup
1626 * @act: irqaction for the interrupt
1627 *
1628 * Used to statically setup per-cpu interrupts in the early boot process.
1629 */
1630int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1631{
1632 struct irq_desc *desc = irq_to_desc(irq);
1633 int retval;
1634
1635 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1636 return -EINVAL;
1637 chip_bus_lock(desc);
1638 retval = __setup_irq(irq, desc, act);
1639 chip_bus_sync_unlock(desc);
1640
1641 return retval;
1642}
1643
1644/**
1645 * request_percpu_irq - allocate a percpu interrupt line
1646 * @irq: Interrupt line to allocate
1647 * @handler: Function to be called when the IRQ occurs.
1648 * @devname: An ascii name for the claiming device
1649 * @dev_id: A percpu cookie passed back to the handler function
1650 *
1651 * This call allocates interrupt resources, but doesn't
1652 * automatically enable the interrupt. It has to be done on each
1653 * CPU using enable_percpu_irq().
1654 *
1655 * Dev_id must be globally unique. It is a per-cpu variable, and
1656 * the handler gets called with the interrupted CPU's instance of
1657 * that variable.
1658 */
1659int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1660 const char *devname, void __percpu *dev_id)
1661{
1662 struct irqaction *action;
1663 struct irq_desc *desc;
1664 int retval;
1665
1666 if (!dev_id)
1667 return -EINVAL;
1668
1669 desc = irq_to_desc(irq);
1670 if (!desc || !irq_settings_can_request(desc) ||
1671 !irq_settings_is_per_cpu_devid(desc))
1672 return -EINVAL;
1673
1674 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1675 if (!action)
1676 return -ENOMEM;
1677
1678 action->handler = handler;
1679 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1680 action->name = devname;
1681 action->percpu_dev_id = dev_id;
1682
1683 chip_bus_lock(desc);
1684 retval = __setup_irq(irq, desc, action);
1685 chip_bus_sync_unlock(desc);
1686
1687 if (retval)
1688 kfree(action);
1689
1690 return retval;
1691}
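
The left-hand column above is the per-CPU IRQ API that this patch drops (request_percpu_irq(), enable_percpu_irq(), free_percpu_irq() and friends). For reference, callers on the newer side use it roughly as sketched below, usually from arch or irqchip code, with a percpu cookie as dev_id; everything here other than the API calls themselves is invented for the example:

    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/percpu.h>

    /* invented per-cpu cookie */
    static DEFINE_PER_CPU(int, foo_percpu_cookie);

    static irqreturn_t foo_percpu_isr(int irq, void *dev_id)
    {
            int *cookie = dev_id;   /* this CPU's instance of the variable */

            (*cookie)++;
            return IRQ_HANDLED;
    }

    static int foo_percpu_setup(unsigned int irq)
    {
            int ret;

            ret = request_percpu_irq(irq, foo_percpu_isr, "foo-percpu",
                                     &foo_percpu_cookie);
            if (ret)
                    return ret;

            /* must be enabled on each CPU separately, e.g. from a CPU-up hook */
            enable_percpu_irq(irq, IRQ_TYPE_NONE);
            return 0;
    }
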
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff70..47420908fba 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,8 +42,13 @@ void irq_move_masked_irq(struct irq_data *idata)
42 * For correct operation this depends on the caller 42 * For correct operation this depends on the caller
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
46 irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); 46 < nr_cpu_ids))
47 if (!chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
50 irq_set_thread_affinity(desc);
51 }
47 52
48 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
49} 54}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf2176..fe4b09cf829 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -103,14 +103,14 @@ int check_wakeup_irqs(void)
103 int irq; 103 int irq;
104 104
105 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
111 if (irqd_is_wakeup_set(&desc->irq_data)) { 106 if (irqd_is_wakeup_set(&desc->irq_data)) {
112 if (desc->depth == 1 && desc->istate & IRQS_PENDING) 107 if (desc->istate & IRQS_PENDING) {
108 pr_info("Wakeup IRQ %d %s pending, suspend aborted\n",
109 irq,
110 desc->action && desc->action->name ?
111 desc->action->name : "");
113 return -EBUSY; 112 return -EBUSY;
113 }
114 continue; 114 continue;
115 } 115 }
116 /* 116 /*
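
check_wakeup_irqs() only honours descriptors whose wakeup flag is set, which a driver arms through irq_set_irq_wake() (see the manage.c hunk above); the newer side additionally requires that the line was still enabled before the suspend check (depth == 1). A typical suspend/resume pairing, sketched with an invented device:

    #include <linux/interrupt.h>
    #include <linux/types.h>

    /* invented device structure for the example */
    struct foo_dev { int irq; bool wakeup_enabled; };

    static int foo_suspend(struct foo_dev *foo)
    {
            if (foo->wakeup_enabled)
                    enable_irq_wake(foo->irq);   /* wraps irq_set_irq_wake(irq, 1) */
            return 0;
    }

    static int foo_resume(struct foo_dev *foo)
    {
            if (foo->wakeup_enabled)
                    disable_irq_wake(foo->irq);  /* irq_set_irq_wake(irq, 0) */
            return 0;
    }
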
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 9065107f083..ef60772d2fe 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,33 +55,23 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 /*
59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still
61 * active. Clear the pending bit so suspend/resume does not
62 * get confused.
63 */
64 if (irq_settings_is_level(desc)) {
65 desc->istate &= ~IRQS_PENDING;
66 return;
67 }
68 if (desc->istate & IRQS_REPLAY)
69 return;
70 if (desc->istate & IRQS_PENDING) { 58 if (desc->istate & IRQS_PENDING) {
71 desc->istate &= ~IRQS_PENDING; 59 desc->istate &= ~IRQS_PENDING;
60 /*
61 * We do not resend level type interrupts. Level type
62 * interrupts are resent by hardware when they are still
63 * active.
64 */
65 if (irq_settings_is_level(desc))
66 return;
67 if (desc->istate & IRQS_REPLAY)
68 return;
69
72 desc->istate |= IRQS_REPLAY; 70 desc->istate |= IRQS_REPLAY;
73 71
74 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 73 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
85 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
86 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
87 tasklet_schedule(&resend_tasklet); 77 tasklet_schedule(&resend_tasklet);
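
When the chip has no working irq_retrigger() callback, both versions above fall back to a software resend: the irq number is set in the irqs_resend bitmap and the resend tasklet later replays every marked descriptor. A rough userspace model of that bookkeeping (no tasklets, just the bitmap walk; the replay itself is only hinted at in a comment):

    #include <stdio.h>
    #include <limits.h>

    #define NR_IRQS       64
    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long irqs_resend[NR_IRQS / BITS_PER_LONG];

    static void mark_resend(unsigned int irq)
    {
            irqs_resend[irq / BITS_PER_LONG] |= 1UL << (irq % BITS_PER_LONG);
    }

    /* stands in for the tasklet body: replay and clear every marked irq */
    static void resend_irqs(void)
    {
            unsigned int irq;

            for (irq = 0; irq < NR_IRQS; irq++) {
                    unsigned long *w = &irqs_resend[irq / BITS_PER_LONG];
                    unsigned long bit = 1UL << (irq % BITS_PER_LONG);

                    if (*w & bit) {
                            *w &= ~bit;
                            printf("replaying irq %u\n", irq);  /* desc->handle_irq() */
                    }
            }
    }

    int main(void)
    {
            mark_resend(3);
            mark_resend(42);
            resend_irqs();
            return 0;
    }
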
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 1162f1030f1..f1667833d44 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,7 +13,6 @@ enum {
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
18}; 17};
19 18
@@ -25,7 +24,6 @@ enum {
25#define IRQ_NOTHREAD GOT_YOU_MORON 24#define IRQ_NOTHREAD GOT_YOU_MORON
26#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
27#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
29#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
30#define IRQF_MODIFY_MASK GOT_YOU_MORON 28#define IRQF_MODIFY_MASK GOT_YOU_MORON
31 29
@@ -41,11 +39,6 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
41 return desc->status_use_accessors & _IRQ_PER_CPU; 39 return desc->status_use_accessors & _IRQ_PER_CPU;
42} 40}
43 41
44static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
45{
46 return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
47}
48
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 42static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{ 43{
51 desc->status_use_accessors |= _IRQ_PER_CPU; 44 desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c4..dc813a948be 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
325 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
326} 326}
327 327
328bool noirqdebug __read_mostly; 328int noirqdebug __read_mostly;
329 329
330int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
331{ 331{
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871..c58fa7da8ae 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -5,14 +5,10 @@
5 * context. The enqueueing is NMI-safe. 5 * context. The enqueueing is NMI-safe.
6 */ 6 */
7 7
8#include <linux/bug.h>
9#include <linux/kernel.h> 8#include <linux/kernel.h>
10#include <linux/export.h> 9#include <linux/module.h>
11#include <linux/irq_work.h> 10#include <linux/irq_work.h>
12#include <linux/percpu.h>
13#include <linux/hardirq.h> 11#include <linux/hardirq.h>
14#include <linux/irqflags.h>
15#include <asm/processor.h>
16 12
17/* 13/*
18 * An entry can be in one of four states: 14 * An entry can be in one of four states:
@@ -21,34 +17,54 @@
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued 17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback 18 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
24 */ 23 */
25 24
26#define IRQ_WORK_PENDING 1UL 25#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL 26#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL 27#define IRQ_WORK_FLAGS 3UL
29 28
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
31 49
32/* 50/*
33 * Claim the entry so that no one else will poke at it. 51 * Claim the entry so that no one else will poke at it.
34 */ 52 */
35static bool irq_work_claim(struct irq_work *work) 53static bool irq_work_claim(struct irq_work *entry)
36{ 54{
37 unsigned long flags, nflags; 55 struct irq_work *next, *nflags;
38 56
39 for (;;) { 57 do {
40 flags = work->flags; 58 next = entry->next;
41 if (flags & IRQ_WORK_PENDING) 59 if ((unsigned long)next & IRQ_WORK_PENDING)
42 return false; 60 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 61 nflags = next_flags(next, IRQ_WORK_FLAGS);
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 62 } while (cmpxchg(&entry->next, next, nflags) != next);
45 break;
46 cpu_relax();
47 }
48 63
49 return true; 64 return true;
50} 65}
51 66
67
52void __weak arch_irq_work_raise(void) 68void __weak arch_irq_work_raise(void)
53{ 69{
54 /* 70 /*
@@ -59,15 +75,20 @@ void __weak arch_irq_work_raise(void)
59/* 75/*
60 * Queue the entry and raise the IPI if needed. 76 * Queue the entry and raise the IPI if needed.
61 */ 77 */
62static void __irq_work_queue(struct irq_work *work) 78static void __irq_work_queue(struct irq_work *entry)
63{ 79{
64 bool empty; 80 struct irq_work *next;
65 81
66 preempt_disable(); 82 preempt_disable();
67 83
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 84 do {
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
69 /* The list was empty, raise self-interrupt to start processing. */ 90 /* The list was empty, raise self-interrupt to start processing. */
70 if (empty) 91 if (!irq_work_next(entry))
71 arch_irq_work_raise(); 92 arch_irq_work_raise();
72 93
73 preempt_enable(); 94 preempt_enable();
@@ -79,16 +100,16 @@ static void __irq_work_queue(struct irq_work *work)
79 * 100 *
80 * Can be re-enqueued while the callback is still in progress. 101 * Can be re-enqueued while the callback is still in progress.
81 */ 102 */
82bool irq_work_queue(struct irq_work *work) 103bool irq_work_queue(struct irq_work *entry)
83{ 104{
84 if (!irq_work_claim(work)) { 105 if (!irq_work_claim(entry)) {
85 /* 106 /*
86 * Already enqueued, can't do! 107 * Already enqueued, can't do!
87 */ 108 */
88 return false; 109 return false;
89 } 110 }
90 111
91 __irq_work_queue(work); 112 __irq_work_queue(entry);
92 return true; 113 return true;
93} 114}
94EXPORT_SYMBOL_GPL(irq_work_queue); 115EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -99,34 +120,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
99 */ 120 */
100void irq_work_run(void) 121void irq_work_run(void)
101{ 122{
102 struct irq_work *work; 123 struct irq_work *list;
103 struct llist_head *this_list;
104 struct llist_node *llnode;
105 124
106 this_list = &__get_cpu_var(irq_work_list); 125 if (this_cpu_read(irq_work_list) == NULL)
107 if (llist_empty(this_list))
108 return; 126 return;
109 127
110 BUG_ON(!in_irq()); 128 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 129 BUG_ON(!irqs_disabled());
112 130
113 llnode = llist_del_all(this_list); 131 list = this_cpu_xchg(irq_work_list, NULL);
114 while (llnode != NULL) { 132
115 work = llist_entry(llnode, struct irq_work, llnode); 133 while (list != NULL) {
134 struct irq_work *entry = list;
116 135
117 llnode = llist_next(llnode); 136 list = irq_work_next(list);
118 137
119 /* 138 /*
120 * Clear the PENDING bit, after this point the @work 139 * Clear the PENDING bit, after this point the @entry
121 * can be re-used. 140 * can be re-used.
122 */ 141 */
123 work->flags = IRQ_WORK_BUSY; 142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
124 work->func(work); 143 entry->func(entry);
125 /* 144 /*
126 * Clear the BUSY bit and return to the free state if 145 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 146 * no-one else claimed it meanwhile.
128 */ 147 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 148 (void)cmpxchg(&entry->next,
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
130 } 151 }
131} 152}
132EXPORT_SYMBOL_GPL(irq_work_run); 153EXPORT_SYMBOL_GPL(irq_work_run);
@@ -135,11 +156,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
135 * Synchronize against the irq_work @entry, ensures the entry is not 156 * Synchronize against the irq_work @entry, ensures the entry is not
136 * currently in use. 157 * currently in use.
137 */ 158 */
138void irq_work_sync(struct irq_work *work) 159void irq_work_sync(struct irq_work *entry)
139{ 160{
140 WARN_ON_ONCE(irqs_disabled()); 161 WARN_ON_ONCE(irqs_disabled());
141 162
142 while (work->flags & IRQ_WORK_BUSY) 163 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
143 cpu_relax(); 164 cpu_relax();
144} 165}
145EXPORT_SYMBOL_GPL(irq_work_sync); 166EXPORT_SYMBOL_GPL(irq_work_sync);
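
The older irq_work code on the right keeps the PENDING and BUSY state in the two low bits of the ->next pointer itself, which works because struct irq_work is at least word-aligned: next_flags() ORs the bits in and irq_work_next() masks them back out, so a single cmpxchg on ->next updates the link and the state together. A standalone sketch of that encoding (plain C, uintptr_t instead of kernel types):

    #include <stdio.h>
    #include <stdint.h>

    #define IRQ_WORK_PENDING 1UL
    #define IRQ_WORK_BUSY    2UL
    #define IRQ_WORK_FLAGS   3UL

    struct work { struct work *next; };

    /* pack flag bits into the low bits of an (aligned) pointer */
    static struct work *pack(struct work *next, unsigned long flags)
    {
            return (struct work *)((uintptr_t)next | flags);
    }

    /* strip the flag bits to recover the real pointer */
    static struct work *unpack(struct work *entry)
    {
            return (struct work *)((uintptr_t)entry & ~IRQ_WORK_FLAGS);
    }

    static unsigned long flags_of(struct work *entry)
    {
            return (uintptr_t)entry & IRQ_WORK_FLAGS;
    }

    int main(void)
    {
            struct work a, b;

            a.next = pack(&b, IRQ_WORK_PENDING | IRQ_WORK_BUSY);
            printf("real next ok: %d, flags: 0x%lx\n",
                   unpack(a.next) == &b, flags_of(a.next));
            return 0;
    }
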
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 8d262b46757..d802883153d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (cval) { 55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime.utime + cputime.stime; 61 t = cputime_add(cputime.utime, cputime.stime);
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cval < t) 66 if (cputime_le(cval, t))
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cval - t; 70 cval = cputime_sub(cval, t);
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,9 +161,10 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (cval || nval) { 164 if (!cputime_eq(cval, cputime_zero) ||
165 if (nval > 0) 165 !cputime_eq(nval, cputime_zero)) {
166 nval += cputime_one_jiffy; 166 if (cputime_gt(nval, cputime_zero))
167 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 168 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
@@ -284,12 +285,8 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
284 if (value) { 285 if (value) {
285 if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) 286 if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
286 return -EFAULT; 287 return -EFAULT;
287 } else { 288 } else
288 memset(&set_buffer, 0, sizeof(set_buffer)); 289 memset((char *) &set_buffer, 0, sizeof(set_buffer));
289 printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
290 " Misfeature support will be removed\n",
291 current->comm);
292 }
293 290
294 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); 291 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
295 if (error || !ovalue) 292 if (error || !ovalue)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 60f48fa0fd0..e6f1f24ad57 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/static_key.h> 15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
@@ -29,6 +29,11 @@ void jump_label_unlock(void)
29 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
30} 30}
31 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
32static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
33{ 38{
34 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -53,73 +58,29 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
53 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
54} 59}
55 60
56static void jump_label_update(struct static_key *key, int enable); 61static void jump_label_update(struct jump_label_key *key, int enable);
57 62
58void static_key_slow_inc(struct static_key *key) 63void jump_label_inc(struct jump_label_key *key)
59{ 64{
60 if (atomic_inc_not_zero(&key->enabled)) 65 if (atomic_inc_not_zero(&key->enabled))
61 return; 66 return;
62 67
63 jump_label_lock(); 68 jump_label_lock();
64 if (atomic_read(&key->enabled) == 0) { 69 if (atomic_read(&key->enabled) == 0)
65 if (!jump_label_get_branch_default(key)) 70 jump_label_update(key, JUMP_LABEL_ENABLE);
66 jump_label_update(key, JUMP_LABEL_ENABLE);
67 else
68 jump_label_update(key, JUMP_LABEL_DISABLE);
69 }
70 atomic_inc(&key->enabled); 71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
73EXPORT_SYMBOL_GPL(static_key_slow_inc);
74 74
75static void __static_key_slow_dec(struct static_key *key, 75void jump_label_dec(struct jump_label_key *key)
76 unsigned long rate_limit, struct delayed_work *work)
77{ 76{
78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 77 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
79 WARN(atomic_read(&key->enabled) < 0,
80 "jump label: negative count!\n");
81 return; 78 return;
82 }
83 79
84 if (rate_limit) { 80 jump_label_update(key, JUMP_LABEL_DISABLE);
85 atomic_inc(&key->enabled);
86 schedule_delayed_work(work, rate_limit);
87 } else {
88 if (!jump_label_get_branch_default(key))
89 jump_label_update(key, JUMP_LABEL_DISABLE);
90 else
91 jump_label_update(key, JUMP_LABEL_ENABLE);
92 }
93 jump_label_unlock(); 81 jump_label_unlock();
94} 82}
95 83
96static void jump_label_update_timeout(struct work_struct *work)
97{
98 struct static_key_deferred *key =
99 container_of(work, struct static_key_deferred, work.work);
100 __static_key_slow_dec(&key->key, 0, NULL);
101}
102
103void static_key_slow_dec(struct static_key *key)
104{
105 __static_key_slow_dec(key, 0, NULL);
106}
107EXPORT_SYMBOL_GPL(static_key_slow_dec);
108
109void static_key_slow_dec_deferred(struct static_key_deferred *key)
110{
111 __static_key_slow_dec(&key->key, key->timeout, &key->work);
112}
113EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
114
115void jump_label_rate_limit(struct static_key_deferred *key,
116 unsigned long rl)
117{
118 key->timeout = rl;
119 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
120}
121EXPORT_SYMBOL_GPL(jump_label_rate_limit);
122
123static int addr_conflict(struct jump_entry *entry, void *start, void *end) 84static int addr_conflict(struct jump_entry *entry, void *start, void *end)
124{ 85{
125 if (entry->code <= (unsigned long)end && 86 if (entry->code <= (unsigned long)end &&
@@ -144,19 +105,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
144 return 0; 105 return 0;
145} 106}
146 107
147/* 108static void __jump_label_update(struct jump_label_key *key,
148 * Update code which is definitely not currently executing.
149 * Architectures which need heavyweight synchronization to modify
150 * running code can override this to make the non-live update case
151 * cheaper.
152 */
153void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
154 enum jump_label_type type)
155{
156 arch_jump_label_transform(entry, type);
157}
158
159static void __jump_label_update(struct static_key *key,
160 struct jump_entry *entry, 109 struct jump_entry *entry,
161 struct jump_entry *stop, int enable) 110 struct jump_entry *stop, int enable)
162{ 111{
@@ -173,51 +122,45 @@ static void __jump_label_update(struct static_key *key,
173 } 122 }
174} 123}
175 124
176static enum jump_label_type jump_label_type(struct static_key *key) 125/*
126 * Not all archs need this.
127 */
128void __weak arch_jump_label_text_poke_early(jump_label_t addr)
177{ 129{
178 bool true_branch = jump_label_get_branch_default(key);
179 bool state = static_key_enabled(key);
180
181 if ((!true_branch && state) || (true_branch && !state))
182 return JUMP_LABEL_ENABLE;
183
184 return JUMP_LABEL_DISABLE;
185} 130}
186 131
187void __init jump_label_init(void) 132static __init int jump_label_init(void)
188{ 133{
189 struct jump_entry *iter_start = __start___jump_table; 134 struct jump_entry *iter_start = __start___jump_table;
190 struct jump_entry *iter_stop = __stop___jump_table; 135 struct jump_entry *iter_stop = __stop___jump_table;
191 struct static_key *key = NULL; 136 struct jump_label_key *key = NULL;
192 struct jump_entry *iter; 137 struct jump_entry *iter;
193 138
194 jump_label_lock(); 139 jump_label_lock();
195 jump_label_sort_entries(iter_start, iter_stop); 140 jump_label_sort_entries(iter_start, iter_stop);
196 141
197 for (iter = iter_start; iter < iter_stop; iter++) { 142 for (iter = iter_start; iter < iter_stop; iter++) {
198 struct static_key *iterk; 143 arch_jump_label_text_poke_early(iter->code);
199 144 if (iter->key == (jump_label_t)(unsigned long)key)
200 iterk = (struct static_key *)(unsigned long)iter->key;
201 arch_jump_label_transform_static(iter, jump_label_type(iterk));
202 if (iterk == key)
203 continue; 145 continue;
204 146
205 key = iterk; 147 key = (struct jump_label_key *)(unsigned long)iter->key;
206 /* 148 atomic_set(&key->enabled, 0);
207 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 149 key->entries = iter;
208 */
209 *((unsigned long *)&key->entries) += (unsigned long)iter;
210#ifdef CONFIG_MODULES 150#ifdef CONFIG_MODULES
211 key->next = NULL; 151 key->next = NULL;
212#endif 152#endif
213 } 153 }
214 jump_label_unlock(); 154 jump_label_unlock();
155
156 return 0;
215} 157}
158early_initcall(jump_label_init);
216 159
217#ifdef CONFIG_MODULES 160#ifdef CONFIG_MODULES
218 161
219struct static_key_mod { 162struct jump_label_mod {
220 struct static_key_mod *next; 163 struct jump_label_mod *next;
221 struct jump_entry *entries; 164 struct jump_entry *entries;
222 struct module *mod; 165 struct module *mod;
223}; 166};
@@ -237,9 +180,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
237 start, end); 180 start, end);
238} 181}
239 182
240static void __jump_label_mod_update(struct static_key *key, int enable) 183static void __jump_label_mod_update(struct jump_label_key *key, int enable)
241{ 184{
242 struct static_key_mod *mod = key->next; 185 struct jump_label_mod *mod = key->next;
243 186
244 while (mod) { 187 while (mod) {
245 struct module *m = mod->mod; 188 struct module *m = mod->mod;
@@ -269,9 +212,8 @@ void jump_label_apply_nops(struct module *mod)
269 if (iter_start == iter_stop) 212 if (iter_start == iter_stop)
270 return; 213 return;
271 214
272 for (iter = iter_start; iter < iter_stop; iter++) { 215 for (iter = iter_start; iter < iter_stop; iter++)
273 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 216 arch_jump_label_text_poke_early(iter->code);
274 }
275} 217}
276 218
277static int jump_label_add_module(struct module *mod) 219static int jump_label_add_module(struct module *mod)
@@ -279,8 +221,8 @@ static int jump_label_add_module(struct module *mod)
279 struct jump_entry *iter_start = mod->jump_entries; 221 struct jump_entry *iter_start = mod->jump_entries;
280 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 222 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
281 struct jump_entry *iter; 223 struct jump_entry *iter;
282 struct static_key *key = NULL; 224 struct jump_label_key *key = NULL;
283 struct static_key_mod *jlm; 225 struct jump_label_mod *jlm;
284 226
285 /* if the module doesn't have jump label entries, just return */ 227 /* if the module doesn't have jump label entries, just return */
286 if (iter_start == iter_stop) 228 if (iter_start == iter_stop)
@@ -289,31 +231,30 @@ static int jump_label_add_module(struct module *mod)
289 jump_label_sort_entries(iter_start, iter_stop); 231 jump_label_sort_entries(iter_start, iter_stop);
290 232
291 for (iter = iter_start; iter < iter_stop; iter++) { 233 for (iter = iter_start; iter < iter_stop; iter++) {
292 struct static_key *iterk; 234 if (iter->key == (jump_label_t)(unsigned long)key)
293
294 iterk = (struct static_key *)(unsigned long)iter->key;
295 if (iterk == key)
296 continue; 235 continue;
297 236
298 key = iterk; 237 key = (struct jump_label_key *)(unsigned long)iter->key;
238
299 if (__module_address(iter->key) == mod) { 239 if (__module_address(iter->key) == mod) {
300 /* 240 atomic_set(&key->enabled, 0);
301 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 241 key->entries = iter;
302 */
303 *((unsigned long *)&key->entries) += (unsigned long)iter;
304 key->next = NULL; 242 key->next = NULL;
305 continue; 243 continue;
306 } 244 }
307 jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); 245
246 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
308 if (!jlm) 247 if (!jlm)
309 return -ENOMEM; 248 return -ENOMEM;
249
310 jlm->mod = mod; 250 jlm->mod = mod;
311 jlm->entries = iter; 251 jlm->entries = iter;
312 jlm->next = key->next; 252 jlm->next = key->next;
313 key->next = jlm; 253 key->next = jlm;
314 254
315 if (jump_label_type(key) == JUMP_LABEL_ENABLE) 255 if (jump_label_enabled(key))
316 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 256 __jump_label_update(key, iter, iter_stop,
257 JUMP_LABEL_ENABLE);
317 } 258 }
318 259
319 return 0; 260 return 0;
@@ -324,14 +265,14 @@ static void jump_label_del_module(struct module *mod)
324 struct jump_entry *iter_start = mod->jump_entries; 265 struct jump_entry *iter_start = mod->jump_entries;
325 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 266 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
326 struct jump_entry *iter; 267 struct jump_entry *iter;
327 struct static_key *key = NULL; 268 struct jump_label_key *key = NULL;
328 struct static_key_mod *jlm, **prev; 269 struct jump_label_mod *jlm, **prev;
329 270
330 for (iter = iter_start; iter < iter_stop; iter++) { 271 for (iter = iter_start; iter < iter_stop; iter++) {
331 if (iter->key == (jump_label_t)(unsigned long)key) 272 if (iter->key == (jump_label_t)(unsigned long)key)
332 continue; 273 continue;
333 274
334 key = (struct static_key *)(unsigned long)iter->key; 275 key = (struct jump_label_key *)(unsigned long)iter->key;
335 276
336 if (__module_address(iter->key) == mod) 277 if (__module_address(iter->key) == mod)
337 continue; 278 continue;
@@ -433,13 +374,12 @@ int jump_label_text_reserved(void *start, void *end)
433 return ret; 374 return ret;
434} 375}
435 376
436static void jump_label_update(struct static_key *key, int enable) 377static void jump_label_update(struct jump_label_key *key, int enable)
437{ 378{
438 struct jump_entry *stop = __stop___jump_table; 379 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
439 struct jump_entry *entry = jump_label_get_entries(key);
440 380
441#ifdef CONFIG_MODULES 381#ifdef CONFIG_MODULES
442 struct module *mod = __module_address((unsigned long)key); 382 struct module *mod = __module_address((jump_label_t)key);
443 383
444 __jump_label_mod_update(key, enable); 384 __jump_label_mod_update(key, enable);
445 385
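After this hunk the tree consumes the older jump_label_key interface rather than static_key. A hedged kernel-side sketch of how that interface is typically used (editor's illustration; the key name and messages are made up, while jump_label_inc(), jump_label_dec() and static_branch() are the interface of that era):

    #include <linux/kernel.h>
    #include <linux/jump_label.h>

    /* A zero-initialized key starts out disabled: on HAVE_JUMP_LABEL
     * architectures static_branch() compiles down to a NOP until the key
     * is incremented. */
    static struct jump_label_key tracing_enabled;

    static void fast_path(void)
    {
            if (static_branch(&tracing_enabled))
                    pr_info("slow path: tracing hook ran\n");
    }

    static void enable_tracing(void)
    {
            jump_label_inc(&tracing_enabled);       /* patch NOP -> jump */
    }

    static void disable_tracing(void)
    {
            jump_label_dec(&tracing_enabled);       /* restore the NOP */
    }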
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba52..079f1d39a8b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
343 343
344/* Look up a kernel symbol and return it in a text buffer. */ 344/* Look up a kernel symbol and return it in a text buffer. */
345static int __sprint_symbol(char *buffer, unsigned long address, 345static int __sprint_symbol(char *buffer, unsigned long address,
346 int symbol_offset, int add_offset) 346 int symbol_offset)
347{ 347{
348 char *modname; 348 char *modname;
349 const char *name; 349 const char *name;
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address,
358 if (name != buffer) 358 if (name != buffer)
359 strcpy(buffer, name); 359 strcpy(buffer, name);
360 len = strlen(buffer); 360 len = strlen(buffer);
361 buffer += len;
361 offset -= symbol_offset; 362 offset -= symbol_offset;
362 363
363 if (add_offset)
364 len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
365
366 if (modname) 364 if (modname)
367 len += sprintf(buffer + len, " [%s]", modname); 365 len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
366 else
367 len += sprintf(buffer, "+%#lx/%#lx", offset, size);
368 368
369 return len; 369 return len;
370} 370}
@@ -382,26 +382,10 @@ static int __sprint_symbol(char *buffer, unsigned long address,
382 */ 382 */
383int sprint_symbol(char *buffer, unsigned long address) 383int sprint_symbol(char *buffer, unsigned long address)
384{ 384{
385 return __sprint_symbol(buffer, address, 0, 1); 385 return __sprint_symbol(buffer, address, 0);
386} 386}
387EXPORT_SYMBOL_GPL(sprint_symbol);
388 387
389/** 388EXPORT_SYMBOL_GPL(sprint_symbol);
390 * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
391 * @buffer: buffer to be stored
392 * @address: address to lookup
393 *
394 * This function looks up a kernel symbol with @address and stores its name
395 * and module name to @buffer if possible. If no symbol was found, just saves
396 * its @address as is.
397 *
398 * This function returns the number of bytes stored in @buffer.
399 */
400int sprint_symbol_no_offset(char *buffer, unsigned long address)
401{
402 return __sprint_symbol(buffer, address, 0, 0);
403}
404EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
405 389
406/** 390/**
407 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer 391 * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
@@ -419,7 +403,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
419 */ 403 */
420int sprint_backtrace(char *buffer, unsigned long address) 404int sprint_backtrace(char *buffer, unsigned long address)
421{ 405{
422 return __sprint_symbol(buffer, address, -1, 1); 406 return __sprint_symbol(buffer, address, -1);
423} 407}
424 408
425/* Look up a kernel symbol and print it to the kernel messages. */ 409/* Look up a kernel symbol and print it to the kernel messages. */
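With sprint_symbol_no_offset() and the add_offset flag gone, sprint_symbol() always emits the "name+offset/size [module]" form. A short sketch of typical in-kernel usage (editor's illustration; report_caller() is a made-up helper):

    #include <linux/kernel.h>
    #include <linux/kallsyms.h>

    static void report_caller(unsigned long addr)
    {
            char buf[KSYM_SYMBOL_LEN];

            sprint_symbol(buf, addr);       /* e.g. "vfs_read+0x10/0x180" */
            pr_info("called from %s\n", buf);

            /* the %pS printk extension gives the same output without a buffer */
            pr_info("called from %pS\n", (void *)addr);
    }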
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
deleted file mode 100644
index e30ac0fe61c..00000000000
--- a/kernel/kcmp.c
+++ /dev/null
@@ -1,197 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/syscalls.h>
3#include <linux/fdtable.h>
4#include <linux/string.h>
5#include <linux/random.h>
6#include <linux/module.h>
7#include <linux/ptrace.h>
8#include <linux/init.h>
9#include <linux/errno.h>
10#include <linux/cache.h>
11#include <linux/bug.h>
12#include <linux/err.h>
13#include <linux/kcmp.h>
14
15#include <asm/unistd.h>
16
17/*
18 * We don't expose the real in-memory order of objects for security reasons.
19 * But still the comparison results should be suitable for sorting. So we
20 * obfuscate kernel pointers values and compare the production instead.
21 *
22 * The obfuscation is done in two steps. First we xor the kernel pointer with
23 * a random value, which puts pointer into a new position in a reordered space.
24 * Secondly we multiply the xor production with a large odd random number to
25 * permute its bits even more (the odd multiplier guarantees that the product
26 * is unique even after the high bits are truncated, since any odd number is
27 * relatively prime to 2^n).
28 *
29 * Note also that the obfuscation itself is invisible to userspace and if needed
30 * it can be changed to an alternate scheme.
31 */
32static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
33
34static long kptr_obfuscate(long v, int type)
35{
36 return (v ^ cookies[type][0]) * cookies[type][1];
37}
38
39/*
40 * 0 - equal, i.e. v1 = v2
41 * 1 - less than, i.e. v1 < v2
42 * 2 - greater than, i.e. v1 > v2
43 * 3 - not equal but ordering unavailable (reserved for future)
44 */
45static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
46{
47 long ret;
48
49 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
50
51 return (ret < 0) | ((ret > 0) << 1);
52}
53
54/* The caller must have pinned the task */
55static struct file *
56get_file_raw_ptr(struct task_struct *task, unsigned int idx)
57{
58 struct file *file = NULL;
59
60 task_lock(task);
61 rcu_read_lock();
62
63 if (task->files)
64 file = fcheck_files(task->files, idx);
65
66 rcu_read_unlock();
67 task_unlock(task);
68
69 return file;
70}
71
72static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
73{
74 if (likely(m2 != m1))
75 mutex_unlock(m2);
76 mutex_unlock(m1);
77}
78
79static int kcmp_lock(struct mutex *m1, struct mutex *m2)
80{
81 int err;
82
83 if (m2 > m1)
84 swap(m1, m2);
85
86 err = mutex_lock_killable(m1);
87 if (!err && likely(m1 != m2)) {
88 err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
89 if (err)
90 mutex_unlock(m1);
91 }
92
93 return err;
94}
95
96SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
97 unsigned long, idx1, unsigned long, idx2)
98{
99 struct task_struct *task1, *task2;
100 int ret;
101
102 rcu_read_lock();
103
104 /*
105 * Tasks are looked up in caller's PID namespace only.
106 */
107 task1 = find_task_by_vpid(pid1);
108 task2 = find_task_by_vpid(pid2);
109 if (!task1 || !task2)
110 goto err_no_task;
111
112 get_task_struct(task1);
113 get_task_struct(task2);
114
115 rcu_read_unlock();
116
117 /*
118 * One should have enough rights to inspect task details.
119 */
120 ret = kcmp_lock(&task1->signal->cred_guard_mutex,
121 &task2->signal->cred_guard_mutex);
122 if (ret)
123 goto err;
124 if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
125 !ptrace_may_access(task2, PTRACE_MODE_READ)) {
126 ret = -EPERM;
127 goto err_unlock;
128 }
129
130 switch (type) {
131 case KCMP_FILE: {
132 struct file *filp1, *filp2;
133
134 filp1 = get_file_raw_ptr(task1, idx1);
135 filp2 = get_file_raw_ptr(task2, idx2);
136
137 if (filp1 && filp2)
138 ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
139 else
140 ret = -EBADF;
141 break;
142 }
143 case KCMP_VM:
144 ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
145 break;
146 case KCMP_FILES:
147 ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
148 break;
149 case KCMP_FS:
150 ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
151 break;
152 case KCMP_SIGHAND:
153 ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
154 break;
155 case KCMP_IO:
156 ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
157 break;
158 case KCMP_SYSVSEM:
159#ifdef CONFIG_SYSVIPC
160 ret = kcmp_ptr(task1->sysvsem.undo_list,
161 task2->sysvsem.undo_list,
162 KCMP_SYSVSEM);
163#else
164 ret = -EOPNOTSUPP;
165#endif
166 break;
167 default:
168 ret = -EINVAL;
169 break;
170 }
171
172err_unlock:
173 kcmp_unlock(&task1->signal->cred_guard_mutex,
174 &task2->signal->cred_guard_mutex);
175err:
176 put_task_struct(task1);
177 put_task_struct(task2);
178
179 return ret;
180
181err_no_task:
182 rcu_read_unlock();
183 return -ESRCH;
184}
185
186static __init int kcmp_cookies_init(void)
187{
188 int i;
189
190 get_random_bytes(cookies, sizeof(cookies));
191
192 for (i = 0; i < KCMP_TYPES; i++)
193 cookies[i][1] |= (~(~0UL >> 1) | 1);
194
195 return 0;
196}
197arch_initcall(kcmp_cookies_init);
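The deleted file implemented the kcmp(2) syscall. For readers who want to see what the removed interface looked like from userspace, a hedged sketch follows (editor's illustration; it assumes a kernel and libc that still expose SYS_kcmp and <linux/kcmp.h>, which this tree no longer does):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/kcmp.h>

    /* raw syscall wrapper; glibc historically shipped no kcmp() wrapper */
    static int sys_kcmp(pid_t pid1, pid_t pid2, int type,
                        unsigned long idx1, unsigned long idx2)
    {
            return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
    }

    int main(void)
    {
            pid_t me = getpid();
            int dupfd = dup(1);
            int ret;

            if (dupfd < 0)
                    return 1;
            /* 0: same struct file, 1/2: ordered but different, <0: errno */
            ret = sys_kcmp(me, me, KCMP_FILE, 1, dupfd);
            printf("kcmp(stdout, dup(stdout)) = %d\n", ret);
            return 0;
    }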
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5..296fbc84d65 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,6 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <generated/utsrelease.h>
24#include <linux/utsname.h> 25#include <linux/utsname.h>
25#include <linux/numa.h> 26#include <linux/numa.h>
26#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,11 +32,13 @@
31#include <linux/console.h> 32#include <linux/console.h>
32#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34#include <linux/syscore_ops.h> 36#include <linux/syscore_ops.h>
35 37
36#include <asm/page.h> 38#include <asm/page.h>
37#include <asm/uaccess.h> 39#include <asm/uaccess.h>
38#include <asm/io.h> 40#include <asm/io.h>
41#include <asm/system.h>
39#include <asm/sections.h> 42#include <asm/sections.h>
40 43
41/* Per cpu memory for storing cpu states in case of system crash. */ 44/* Per cpu memory for storing cpu states in case of system crash. */
@@ -495,7 +498,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
495 while (hole_end <= crashk_res.end) { 498 while (hole_end <= crashk_res.end) {
496 unsigned long i; 499 unsigned long i;
497 500
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 501 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
499 break; 502 break;
500 if (hole_end > crashk_res.end) 503 if (hole_end > crashk_res.end)
501 break; 504 break;
@@ -996,7 +999,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
996 kimage_free(xchg(&kexec_crash_image, NULL)); 999 kimage_free(xchg(&kexec_crash_image, NULL));
997 result = kimage_crash_alloc(&image, entry, 1000 result = kimage_crash_alloc(&image, entry,
998 nr_segments, segments); 1001 nr_segments, segments);
999 crash_map_reserved_pages();
1000 } 1002 }
1001 if (result) 1003 if (result)
1002 goto out; 1004 goto out;
@@ -1013,8 +1015,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1013 goto out; 1015 goto out;
1014 } 1016 }
1015 kimage_terminate(image); 1017 kimage_terminate(image);
1016 if (flags & KEXEC_ON_CRASH)
1017 crash_unmap_reserved_pages();
1018 } 1018 }
1019 /* Install the new kernel, and Uninstall the old */ 1019 /* Install the new kernel, and Uninstall the old */
1020 image = xchg(dest_image, image); 1020 image = xchg(dest_image, image);
@@ -1026,18 +1026,6 @@ out:
1026 return result; 1026 return result;
1027} 1027}
1028 1028
1029/*
1030 * Add and remove page tables for crashkernel memory
1031 *
1032 * Provide an empty default implementation here -- architecture
1033 * code may override this
1034 */
1035void __weak crash_map_reserved_pages(void)
1036{}
1037
1038void __weak crash_unmap_reserved_pages(void)
1039{}
1040
1041#ifdef CONFIG_COMPAT 1029#ifdef CONFIG_COMPAT
1042asmlinkage long compat_sys_kexec_load(unsigned long entry, 1030asmlinkage long compat_sys_kexec_load(unsigned long entry,
1043 unsigned long nr_segments, 1031 unsigned long nr_segments,
@@ -1091,6 +1079,8 @@ void crash_kexec(struct pt_regs *regs)
1091 if (kexec_crash_image) { 1079 if (kexec_crash_image) {
1092 struct pt_regs fixed_regs; 1080 struct pt_regs fixed_regs;
1093 1081
1082 kmsg_dump(KMSG_DUMP_KEXEC);
1083
1094 crash_setup_regs(&fixed_regs, regs); 1084 crash_setup_regs(&fixed_regs, regs);
1095 crash_save_vmcoreinfo(); 1085 crash_save_vmcoreinfo();
1096 machine_crash_shutdown(&fixed_regs); 1086 machine_crash_shutdown(&fixed_regs);
@@ -1127,8 +1117,6 @@ int crash_shrink_memory(unsigned long new_size)
1127{ 1117{
1128 int ret = 0; 1118 int ret = 0;
1129 unsigned long start, end; 1119 unsigned long start, end;
1130 unsigned long old_size;
1131 struct resource *ram_res;
1132 1120
1133 mutex_lock(&kexec_mutex); 1121 mutex_lock(&kexec_mutex);
1134 1122
@@ -1138,37 +1126,23 @@ int crash_shrink_memory(unsigned long new_size)
1138 } 1126 }
1139 start = crashk_res.start; 1127 start = crashk_res.start;
1140 end = crashk_res.end; 1128 end = crashk_res.end;
1141 old_size = (end == 0) ? 0 : end - start + 1;
1142 if (new_size >= old_size) {
1143 ret = (new_size == old_size) ? 0 : -EINVAL;
1144 goto unlock;
1145 }
1146 1129
1147 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); 1130 if (new_size >= end - start + 1) {
1148 if (!ram_res) { 1131 ret = -EINVAL;
1149 ret = -ENOMEM; 1132 if (new_size == end - start + 1)
1133 ret = 0;
1150 goto unlock; 1134 goto unlock;
1151 } 1135 }
1152 1136
1153 start = roundup(start, KEXEC_CRASH_MEM_ALIGN); 1137 start = roundup(start, PAGE_SIZE);
1154 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); 1138 end = roundup(start + new_size, PAGE_SIZE);
1155 1139
1156 crash_map_reserved_pages();
1157 crash_free_reserved_phys_range(end, crashk_res.end); 1140 crash_free_reserved_phys_range(end, crashk_res.end);
1158 1141
1159 if ((start == end) && (crashk_res.parent != NULL)) 1142 if ((start == end) && (crashk_res.parent != NULL))
1160 release_resource(&crashk_res); 1143 release_resource(&crashk_res);
1161
1162 ram_res->start = end;
1163 ram_res->end = crashk_res.end;
1164 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1165 ram_res->name = "System RAM";
1166
1167 crashk_res.end = end - 1; 1144 crashk_res.end = end - 1;
1168 1145
1169 insert_resource(&iomem_resource, ram_res);
1170 crash_unmap_reserved_pages();
1171
1172unlock: 1146unlock:
1173 mutex_unlock(&kexec_mutex); 1147 mutex_unlock(&kexec_mutex);
1174 return ret; 1148 return ret;
@@ -1357,10 +1331,6 @@ static int __init parse_crashkernel_simple(char *cmdline,
1357 1331
1358 if (*cur == '@') 1332 if (*cur == '@')
1359 *crash_base = memparse(cur+1, &cur); 1333 *crash_base = memparse(cur+1, &cur);
1360 else if (*cur != ' ' && *cur != '\0') {
1361 pr_warning("crashkernel: unrecognized char\n");
1362 return -EINVAL;
1363 }
1364 1334
1365 return 0; 1335 return 0;
1366} 1336}
@@ -1410,21 +1380,22 @@ int __init parse_crashkernel(char *cmdline,
1410} 1380}
1411 1381
1412 1382
1413static void update_vmcoreinfo_note(void) 1383
1384void crash_save_vmcoreinfo(void)
1414{ 1385{
1415 u32 *buf = vmcoreinfo_note; 1386 u32 *buf;
1416 1387
1417 if (!vmcoreinfo_size) 1388 if (!vmcoreinfo_size)
1418 return; 1389 return;
1390
1391 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1392
1393 buf = (u32 *)vmcoreinfo_note;
1394
1419 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1395 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1420 vmcoreinfo_size); 1396 vmcoreinfo_size);
1421 final_note(buf);
1422}
1423 1397
1424void crash_save_vmcoreinfo(void) 1398 final_note(buf);
1425{
1426 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1427 update_vmcoreinfo_note();
1428} 1399}
1429 1400
1430void vmcoreinfo_append_str(const char *fmt, ...) 1401void vmcoreinfo_append_str(const char *fmt, ...)
@@ -1464,9 +1435,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1464 1435
1465 VMCOREINFO_SYMBOL(init_uts_ns); 1436 VMCOREINFO_SYMBOL(init_uts_ns);
1466 VMCOREINFO_SYMBOL(node_online_map); 1437 VMCOREINFO_SYMBOL(node_online_map);
1467#ifdef CONFIG_MMU
1468 VMCOREINFO_SYMBOL(swapper_pg_dir); 1438 VMCOREINFO_SYMBOL(swapper_pg_dir);
1469#endif
1470 VMCOREINFO_SYMBOL(_stext); 1439 VMCOREINFO_SYMBOL(_stext);
1471 VMCOREINFO_SYMBOL(vmlist); 1440 VMCOREINFO_SYMBOL(vmlist);
1472 1441
@@ -1514,7 +1483,6 @@ static int __init crash_save_vmcoreinfo_init(void)
1514 VMCOREINFO_NUMBER(PG_swapcache); 1483 VMCOREINFO_NUMBER(PG_swapcache);
1515 1484
1516 arch_crash_save_vmcoreinfo(); 1485 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note();
1518 1486
1519 return 0; 1487 return 0;
1520} 1488}
@@ -1538,7 +1506,7 @@ int kernel_kexec(void)
1538 1506
1539#ifdef CONFIG_KEXEC_JUMP 1507#ifdef CONFIG_KEXEC_JUMP
1540 if (kexec_image->preserve_context) { 1508 if (kexec_image->preserve_context) {
1541 lock_system_sleep(); 1509 mutex_lock(&pm_mutex);
1542 pm_prepare_console(); 1510 pm_prepare_console();
1543 error = freeze_processes(); 1511 error = freeze_processes();
1544 if (error) { 1512 if (error) {
@@ -1550,13 +1518,13 @@ int kernel_kexec(void)
1550 if (error) 1518 if (error)
1551 goto Resume_console; 1519 goto Resume_console;
1552 /* At this point, dpm_suspend_start() has been called, 1520 /* At this point, dpm_suspend_start() has been called,
1553 * but *not* dpm_suspend_end(). We *must* call 1521 * but *not* dpm_suspend_noirq(). We *must* call
1554 * dpm_suspend_end() now. Otherwise, drivers for 1522 * dpm_suspend_noirq() now. Otherwise, drivers for
1555 * some devices (e.g. interrupt controllers) become 1523 * some devices (e.g. interrupt controllers) become
1556 * desynchronized with the actual state of the 1524 * desynchronized with the actual state of the
1557 * hardware at resume time, and evil weirdness ensues. 1525 * hardware at resume time, and evil weirdness ensues.
1558 */ 1526 */
1559 error = dpm_suspend_end(PMSG_FREEZE); 1527 error = dpm_suspend_noirq(PMSG_FREEZE);
1560 if (error) 1528 if (error)
1561 goto Resume_devices; 1529 goto Resume_devices;
1562 error = disable_nonboot_cpus(); 1530 error = disable_nonboot_cpus();
@@ -1583,7 +1551,7 @@ int kernel_kexec(void)
1583 local_irq_enable(); 1551 local_irq_enable();
1584 Enable_cpus: 1552 Enable_cpus:
1585 enable_nonboot_cpus(); 1553 enable_nonboot_cpus();
1586 dpm_resume_start(PMSG_RESTORE); 1554 dpm_resume_noirq(PMSG_RESTORE);
1587 Resume_devices: 1555 Resume_devices:
1588 dpm_resume_end(PMSG_RESTORE); 1556 dpm_resume_end(PMSG_RESTORE);
1589 Resume_console: 1557 Resume_console:
@@ -1591,7 +1559,7 @@ int kernel_kexec(void)
1591 thaw_processes(); 1559 thaw_processes();
1592 Restore_console: 1560 Restore_console:
1593 pm_restore_console(); 1561 pm_restore_console();
1594 unlock_system_sleep(); 1562 mutex_unlock(&pm_mutex);
1595 } 1563 }
1596#endif 1564#endif
1597 1565
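The crash_shrink_memory() changes above (rounding to PAGE_SIZE rather than KEXEC_CRASH_MEM_ALIGN, and no longer re-inserting a "System RAM" resource) are normally exercised from userspace through the /sys/kernel/kexec_crash_size attribute provided by kernel/ksysfs.c; the parse_crashkernel_simple() hunk likewise drops the warning about trailing characters after crashkernel=size[@offset]. A hedged sketch of the sysfs path (editor's illustration; reading needs no privileges, writing requires root and permanently shrinks the reservation until reboot):

    #include <stdio.h>

    int main(void)
    {
            unsigned long size = 0;
            FILE *f = fopen("/sys/kernel/kexec_crash_size", "r+");

            if (!f) {
                    perror("kexec_crash_size");
                    return 1;
            }
            if (fscanf(f, "%lu", &size) != 1) {
                    fclose(f);
                    return 1;
            }
            printf("crashkernel reservation: %lu bytes\n", size);

            /* uncomment to actually shrink the reservation to half its size */
            /* rewind(f); fprintf(f, "%lu\n", size / 2); */

            fclose(f);
            return 0;
    }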
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 59dcf5b81d2..01a0700e873 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/export.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/log2.h> 26#include <linux/log2.h>
@@ -402,7 +402,6 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
402 return max; 402 return max;
403 return len; 403 return len;
404} 404}
405EXPORT_SYMBOL(__kfifo_max_r);
406 405
407#define __KFIFO_PEEK(data, out, mask) \ 406#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)]) 407 ((data)[(out) & (mask)])
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de..a4bea97c75b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,8 +36,6 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
40#include <linux/ptrace.h>
41#include <asm/uaccess.h> 39#include <asm/uaccess.h>
42 40
43#include <trace/events/module.h> 41#include <trace/events/module.h>
@@ -46,20 +44,12 @@ extern int max_threads;
46 44
47static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
48 46
49/*
50 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
51 * locking to protect this global - it is private to the singleton khelper
52 * thread and should only ever be modified by that thread.
53 */
54static const struct task_struct *kmod_thread_locker;
55
56#define CAP_BSET (void *)1 47#define CAP_BSET (void *)1
57#define CAP_PI (void *)2 48#define CAP_PI (void *)2
58 49
59static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
60static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
61static DEFINE_SPINLOCK(umh_sysctl_lock); 52static DEFINE_SPINLOCK(umh_sysctl_lock);
62static DECLARE_RWSEM(umhelper_sem);
63 53
64#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
65 55
@@ -68,43 +58,6 @@ static DECLARE_RWSEM(umhelper_sem);
68*/ 58*/
69char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 59char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
70 60
71static void free_modprobe_argv(struct subprocess_info *info)
72{
73 kfree(info->argv[3]); /* check call_modprobe() */
74 kfree(info->argv);
75}
76
77static int call_modprobe(char *module_name, int wait)
78{
79 static char *envp[] = {
80 "HOME=/",
81 "TERM=linux",
82 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
83 NULL
84 };
85
86 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
87 if (!argv)
88 goto out;
89
90 module_name = kstrdup(module_name, GFP_KERNEL);
91 if (!module_name)
92 goto free_argv;
93
94 argv[0] = modprobe_path;
95 argv[1] = "-q";
96 argv[2] = "--";
97 argv[3] = module_name; /* check free_modprobe_argv() */
98 argv[4] = NULL;
99
100 return call_usermodehelper_fns(modprobe_path, argv, envp,
101 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
102free_argv:
103 kfree(argv);
104out:
105 return -ENOMEM;
106}
107
108/** 61/**
109 * __request_module - try to load a kernel module 62 * __request_module - try to load a kernel module
110 * @wait: wait (or not) for the operation to complete 63 * @wait: wait (or not) for the operation to complete
@@ -126,6 +79,11 @@ int __request_module(bool wait, const char *fmt, ...)
126 char module_name[MODULE_NAME_LEN]; 79 char module_name[MODULE_NAME_LEN];
127 unsigned int max_modprobes; 80 unsigned int max_modprobes;
128 int ret; 81 int ret;
82 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
83 static char *envp[] = { "HOME=/",
84 "TERM=linux",
85 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
86 NULL };
129 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 87 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 88#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 89 static int kmod_loop_msg;
@@ -168,7 +126,9 @@ int __request_module(bool wait, const char *fmt, ...)
168 126
169 trace_module_request(module_name, wait, _RET_IP_); 127 trace_module_request(module_name, wait, _RET_IP_);
170 128
171 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 129 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
130 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
131 NULL, NULL, NULL);
172 132
173 atomic_dec(&kmod_concurrent); 133 atomic_dec(&kmod_concurrent);
174 return ret; 134 return ret;
@@ -219,11 +179,9 @@ static int ____call_usermodehelper(void *data)
219 179
220 commit_creds(new); 180 commit_creds(new);
221 181
222 retval = do_execve(sub_info->path, 182 retval = kernel_execve(sub_info->path,
223 (const char __user *const __user *)sub_info->argv, 183 (const char *const *)sub_info->argv,
224 (const char __user *const __user *)sub_info->envp); 184 (const char *const *)sub_info->envp);
225 if (!retval)
226 return 0;
227 185
228 /* Exec failed? */ 186 /* Exec failed? */
229fail: 187fail:
@@ -231,32 +189,13 @@ fail:
231 do_exit(0); 189 do_exit(0);
232} 190}
233 191
234static int call_helper(void *data) 192void call_usermodehelper_freeinfo(struct subprocess_info *info)
235{
236 /* Worker thread started blocking khelper thread. */
237 kmod_thread_locker = current;
238 return ____call_usermodehelper(data);
239}
240
241static void call_usermodehelper_freeinfo(struct subprocess_info *info)
242{ 193{
243 if (info->cleanup) 194 if (info->cleanup)
244 (*info->cleanup)(info); 195 (*info->cleanup)(info);
245 kfree(info); 196 kfree(info);
246} 197}
247 198EXPORT_SYMBOL(call_usermodehelper_freeinfo);
248static void umh_complete(struct subprocess_info *sub_info)
249{
250 struct completion *comp = xchg(&sub_info->complete, NULL);
251 /*
252 * See call_usermodehelper_exec(). If xchg() returns NULL
253 * we own sub_info, the UMH_KILLABLE caller has gone away.
254 */
255 if (comp)
256 complete(comp);
257 else
258 call_usermodehelper_freeinfo(sub_info);
259}
260 199
261/* Keventd can't block, but this (a child) can. */ 200/* Keventd can't block, but this (a child) can. */
262static int wait_for_helper(void *data) 201static int wait_for_helper(void *data)
@@ -294,8 +233,8 @@ static int wait_for_helper(void *data)
294 sub_info->retval = ret; 233 sub_info->retval = ret;
295 } 234 }
296 235
297 umh_complete(sub_info); 236 complete(sub_info->complete);
298 do_exit(0); 237 return 0;
299} 238}
300 239
301/* This is run by khelper thread */ 240/* This is run by khelper thread */
@@ -303,7 +242,7 @@ static void __call_usermodehelper(struct work_struct *work)
303{ 242{
304 struct subprocess_info *sub_info = 243 struct subprocess_info *sub_info =
305 container_of(work, struct subprocess_info, work); 244 container_of(work, struct subprocess_info, work);
306 int wait = sub_info->wait & ~UMH_KILLABLE; 245 enum umh_wait wait = sub_info->wait;
307 pid_t pid; 246 pid_t pid;
308 247
309 /* CLONE_VFORK: wait until the usermode helper has execve'd 248 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -312,12 +251,9 @@ static void __call_usermodehelper(struct work_struct *work)
312 if (wait == UMH_WAIT_PROC) 251 if (wait == UMH_WAIT_PROC)
313 pid = kernel_thread(wait_for_helper, sub_info, 252 pid = kernel_thread(wait_for_helper, sub_info,
314 CLONE_FS | CLONE_FILES | SIGCHLD); 253 CLONE_FS | CLONE_FILES | SIGCHLD);
315 else { 254 else
316 pid = kernel_thread(call_helper, sub_info, 255 pid = kernel_thread(____call_usermodehelper, sub_info,
317 CLONE_VFORK | SIGCHLD); 256 CLONE_VFORK | SIGCHLD);
318 /* Worker thread stopped blocking khelper thread. */
319 kmod_thread_locker = NULL;
320 }
321 257
322 switch (wait) { 258 switch (wait) {
323 case UMH_NO_WAIT: 259 case UMH_NO_WAIT:
@@ -331,7 +267,7 @@ static void __call_usermodehelper(struct work_struct *work)
331 case UMH_WAIT_EXEC: 267 case UMH_WAIT_EXEC:
332 if (pid < 0) 268 if (pid < 0)
333 sub_info->retval = pid; 269 sub_info->retval = pid;
334 umh_complete(sub_info); 270 complete(sub_info->complete);
335 } 271 }
336} 272}
337 273
@@ -339,126 +275,33 @@ static void __call_usermodehelper(struct work_struct *work)
339 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
340 * (used for preventing user land processes from being created after the user 276 * (used for preventing user land processes from being created after the user
341 * land has been frozen during a system-wide hibernation or suspend operation). 277 * land has been frozen during a system-wide hibernation or suspend operation).
342 * Should always be manipulated under umhelper_sem acquired for write.
343 */ 278 */
344static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; 279static int usermodehelper_disabled = 1;
345 280
346/* Number of helpers running */ 281/* Number of helpers running */
347static atomic_t running_helpers = ATOMIC_INIT(0); 282static atomic_t running_helpers = ATOMIC_INIT(0);
348 283
349/* 284/*
350 * Wait queue head used by usermodehelper_disable() to wait for all running 285 * Wait queue head used by usermodehelper_pm_callback() to wait for all running
351 * helpers to finish. 286 * helpers to finish.
352 */ 287 */
353static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 288static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
354 289
355/* 290/*
356 * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
357 * to become 'false'.
358 */
359static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
360
361/*
362 * Time to wait for running_helpers to become zero before the setting of 291 * Time to wait for running_helpers to become zero before the setting of
363 * usermodehelper_disabled in usermodehelper_disable() fails 292 * usermodehelper_disabled in usermodehelper_pm_callback() fails
364 */ 293 */
365#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 294#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
366 295
367int usermodehelper_read_trylock(void)
368{
369 DEFINE_WAIT(wait);
370 int ret = 0;
371
372 down_read(&umhelper_sem);
373 for (;;) {
374 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
375 TASK_INTERRUPTIBLE);
376 if (!usermodehelper_disabled)
377 break;
378
379 if (usermodehelper_disabled == UMH_DISABLED)
380 ret = -EAGAIN;
381
382 up_read(&umhelper_sem);
383
384 if (ret)
385 break;
386
387 schedule();
388 try_to_freeze();
389
390 down_read(&umhelper_sem);
391 }
392 finish_wait(&usermodehelper_disabled_waitq, &wait);
393 return ret;
394}
395EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
396
397long usermodehelper_read_lock_wait(long timeout)
398{
399 DEFINE_WAIT(wait);
400
401 if (timeout < 0)
402 return -EINVAL;
403
404 down_read(&umhelper_sem);
405 for (;;) {
406 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
407 TASK_UNINTERRUPTIBLE);
408 if (!usermodehelper_disabled)
409 break;
410
411 up_read(&umhelper_sem);
412
413 timeout = schedule_timeout(timeout);
414 if (!timeout)
415 break;
416
417 down_read(&umhelper_sem);
418 }
419 finish_wait(&usermodehelper_disabled_waitq, &wait);
420 return timeout;
421}
422EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
423
424void usermodehelper_read_unlock(void)
425{
426 up_read(&umhelper_sem);
427}
428EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
429
430/** 296/**
431 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. 297 * usermodehelper_disable - prevent new helpers from being started
432 * @depth: New value to assign to usermodehelper_disabled.
433 *
434 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
435 * writing) and wakeup tasks waiting for it to change.
436 */ 298 */
437void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) 299int usermodehelper_disable(void)
438{
439 down_write(&umhelper_sem);
440 usermodehelper_disabled = depth;
441 wake_up(&usermodehelper_disabled_waitq);
442 up_write(&umhelper_sem);
443}
444
445/**
446 * __usermodehelper_disable - Prevent new helpers from being started.
447 * @depth: New value to assign to usermodehelper_disabled.
448 *
449 * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
450 */
451int __usermodehelper_disable(enum umh_disable_depth depth)
452{ 300{
453 long retval; 301 long retval;
454 302
455 if (!depth) 303 usermodehelper_disabled = 1;
456 return -EINVAL; 304 smp_mb();
457
458 down_write(&umhelper_sem);
459 usermodehelper_disabled = depth;
460 up_write(&umhelper_sem);
461
462 /* 305 /*
463 * From now on call_usermodehelper_exec() won't start any new 306 * From now on call_usermodehelper_exec() won't start any new
464 * helpers, so it is sufficient if running_helpers turns out to 307 * helpers, so it is sufficient if running_helpers turns out to
@@ -471,10 +314,27 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
471 if (retval) 314 if (retval)
472 return 0; 315 return 0;
473 316
474 __usermodehelper_set_disable_depth(UMH_ENABLED); 317 usermodehelper_disabled = 0;
475 return -EAGAIN; 318 return -EAGAIN;
476} 319}
477 320
321/**
322 * usermodehelper_enable - allow new helpers to be started again
323 */
324void usermodehelper_enable(void)
325{
326 usermodehelper_disabled = 0;
327}
328
329/**
330 * usermodehelper_is_disabled - check if new helpers are allowed to be started
331 */
332bool usermodehelper_is_disabled(void)
333{
334 return usermodehelper_disabled;
335}
336EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
337
478static void helper_lock(void) 338static void helper_lock(void)
479{ 339{
480 atomic_inc(&running_helpers); 340 atomic_inc(&running_helpers);
@@ -498,7 +358,6 @@ static void helper_unlock(void)
498 * structure. This should be passed to call_usermodehelper_exec to 358 * structure. This should be passed to call_usermodehelper_exec to
499 * exec the process and free the structure. 359 * exec the process and free the structure.
500 */ 360 */
501static
502struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 361struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
503 char **envp, gfp_t gfp_mask) 362 char **envp, gfp_t gfp_mask)
504{ 363{
@@ -514,6 +373,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
514 out: 373 out:
515 return sub_info; 374 return sub_info;
516} 375}
376EXPORT_SYMBOL(call_usermodehelper_setup);
517 377
518/** 378/**
519 * call_usermodehelper_setfns - set a cleanup/init function 379 * call_usermodehelper_setfns - set a cleanup/init function
@@ -531,7 +391,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
531 * Function must be runnable in either a process context or the 391 * Function must be runnable in either a process context or the
532 * context in which call_usermodehelper_exec is called. 392 * context in which call_usermodehelper_exec is called.
533 */ 393 */
534static
535void call_usermodehelper_setfns(struct subprocess_info *info, 394void call_usermodehelper_setfns(struct subprocess_info *info,
536 int (*init)(struct subprocess_info *info, struct cred *new), 395 int (*init)(struct subprocess_info *info, struct cred *new),
537 void (*cleanup)(struct subprocess_info *info), 396 void (*cleanup)(struct subprocess_info *info),
@@ -541,6 +400,7 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
541 info->init = init; 400 info->init = init;
542 info->data = data; 401 info->data = data;
543} 402}
403EXPORT_SYMBOL(call_usermodehelper_setfns);
544 404
545/** 405/**
546 * call_usermodehelper_exec - start a usermode application 406 * call_usermodehelper_exec - start a usermode application
@@ -554,8 +414,8 @@ void call_usermodehelper_setfns(struct subprocess_info *info,
554 * asynchronously if wait is not set, and runs as a child of keventd. 414 * asynchronously if wait is not set, and runs as a child of keventd.
555 * (ie. it runs with full root capabilities). 415 * (ie. it runs with full root capabilities).
556 */ 416 */
557static 417int call_usermodehelper_exec(struct subprocess_info *sub_info,
558int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 418 enum umh_wait wait)
559{ 419{
560 DECLARE_COMPLETION_ONSTACK(done); 420 DECLARE_COMPLETION_ONSTACK(done);
561 int retval = 0; 421 int retval = 0;
@@ -568,16 +428,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
568 retval = -EBUSY; 428 retval = -EBUSY;
569 goto out; 429 goto out;
570 } 430 }
571 /*
572 * Worker thread must not wait for khelper thread at below
573 * wait_for_completion() if the thread was created with CLONE_VFORK
574 * flag, for khelper thread is already waiting for the thread at
575 * wait_for_completion() in do_fork().
576 */
577 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
578 retval = -EBUSY;
579 goto out;
580 }
581 431
582 sub_info->complete = &done; 432 sub_info->complete = &done;
583 sub_info->wait = wait; 433 sub_info->wait = wait;
@@ -585,52 +435,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
585 queue_work(khelper_wq, &sub_info->work); 435 queue_work(khelper_wq, &sub_info->work);
586 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 436 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
587 goto unlock; 437 goto unlock;
588
589 if (wait & UMH_KILLABLE) {
590 retval = wait_for_completion_killable(&done);
591 if (!retval)
592 goto wait_done;
593
594 /* umh_complete() will see NULL and free sub_info */
595 if (xchg(&sub_info->complete, NULL))
596 goto unlock;
597 /* fallthrough, umh_complete() was already called */
598 }
599
600 wait_for_completion(&done); 438 wait_for_completion(&done);
601wait_done:
602 retval = sub_info->retval; 439 retval = sub_info->retval;
440
603out: 441out:
604 call_usermodehelper_freeinfo(sub_info); 442 call_usermodehelper_freeinfo(sub_info);
605unlock: 443unlock:
606 helper_unlock(); 444 helper_unlock();
607 return retval; 445 return retval;
608} 446}
609 447EXPORT_SYMBOL(call_usermodehelper_exec);
610/*
611 * call_usermodehelper_fns() will not run the caller-provided cleanup function
612 * if a memory allocation failure is experienced. So the caller might need to
613 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
614 * the necessary cleanup within the caller.
615 */
616int call_usermodehelper_fns(
617 char *path, char **argv, char **envp, int wait,
618 int (*init)(struct subprocess_info *info, struct cred *new),
619 void (*cleanup)(struct subprocess_info *), void *data)
620{
621 struct subprocess_info *info;
622 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
623
624 info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
625
626 if (info == NULL)
627 return -ENOMEM;
628
629 call_usermodehelper_setfns(info, init, cleanup, data);
630
631 return call_usermodehelper_exec(info, wait);
632}
633EXPORT_SYMBOL(call_usermodehelper_fns);
634 448
635static int proc_cap_handler(struct ctl_table *table, int write, 449static int proc_cap_handler(struct ctl_table *table, int write,
636 void __user *buffer, size_t *lenp, loff_t *ppos) 450 void __user *buffer, size_t *lenp, loff_t *ppos)
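This file now exports the three-step usermodehelper API (setup, setfns, exec) again, together with enum umh_wait. Most in-kernel callers go through the call_usermodehelper() convenience wrapper; a hedged sketch of that pattern follows (editor's illustration; the helper path and its arguments are invented):

    #include <linux/kernel.h>
    #include <linux/kmod.h>

    static int run_helper(void)
    {
            char *argv[] = { "/sbin/example-helper", "--reason", "thermal", NULL };
            static char *envp[] = {
                    "HOME=/",
                    "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
                    NULL
            };
            int ret;

            /* UMH_WAIT_EXEC: return once the helper has exec'd (or failed to) */
            ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
            if (ret)
                    pr_warning("usermode helper failed: %d\n", ret);
            return ret;
    }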
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa40..b30fd54eb98 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h> 38#include <linux/stddef.h>
39#include <linux/export.h> 39#include <linux/module.h>
40#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
41#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
42#include <linux/freezer.h> 42#include <linux/freezer.h>
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 raw_spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
561{ 561{
562 LIST_HEAD(free_list); 562 LIST_HEAD(free_list);
563 563
564 mutex_lock(&kprobe_mutex);
565 /* Lock modules while optimizing kprobes */ 564 /* Lock modules while optimizing kprobes */
566 mutex_lock(&module_mutex); 565 mutex_lock(&module_mutex);
566 mutex_lock(&kprobe_mutex);
567 567
568 /* 568 /*
569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 569 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
586 /* Step 4: Free cleaned kprobes after quiescence period */ 586 /* Step 4: Free cleaned kprobes after quiescence period */
587 do_free_cleaned_kprobes(&free_list); 587 do_free_cleaned_kprobes(&free_list);
588 588
589 mutex_unlock(&module_mutex);
590 mutex_unlock(&kprobe_mutex); 589 mutex_unlock(&kprobe_mutex);
590 mutex_unlock(&module_mutex);
591 591
592 /* Step 5: Kick optimizer again if needed */ 592 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
@@ -759,32 +759,20 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
759 struct kprobe *ap; 759 struct kprobe *ap;
760 struct optimized_kprobe *op; 760 struct optimized_kprobe *op;
761 761
762 /* Impossible to optimize ftrace-based kprobe */
763 if (kprobe_ftrace(p))
764 return;
765
766 /* For preparing optimization, jump_label_text_reserved() is called */
767 jump_label_lock();
768 mutex_lock(&text_mutex);
769
770 ap = alloc_aggr_kprobe(p); 762 ap = alloc_aggr_kprobe(p);
771 if (!ap) 763 if (!ap)
772 goto out; 764 return;
773 765
774 op = container_of(ap, struct optimized_kprobe, kp); 766 op = container_of(ap, struct optimized_kprobe, kp);
775 if (!arch_prepared_optinsn(&op->optinsn)) { 767 if (!arch_prepared_optinsn(&op->optinsn)) {
776 /* If failed to setup optimizing, fallback to kprobe */ 768 /* If failed to setup optimizing, fallback to kprobe */
777 arch_remove_optimized_kprobe(op); 769 arch_remove_optimized_kprobe(op);
778 kfree(op); 770 kfree(op);
779 goto out; 771 return;
780 } 772 }
781 773
782 init_aggr_kprobe(ap, p); 774 init_aggr_kprobe(ap, p);
783 optimize_kprobe(ap); /* This just kicks optimizer thread */ 775 optimize_kprobe(ap);
784
785out:
786 mutex_unlock(&text_mutex);
787 jump_label_unlock();
788} 776}
789 777
790#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
@@ -919,64 +907,9 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 907}
920#endif /* CONFIG_OPTPROBES */ 908#endif /* CONFIG_OPTPROBES */
921 909
922#ifdef KPROBES_CAN_USE_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS,
926};
927static int kprobe_ftrace_enabled;
928
929/* Must ensure p->addr is really on ftrace */
930static int __kprobes prepare_kprobe(struct kprobe *p)
931{
932 if (!kprobe_ftrace(p))
933 return arch_prepare_kprobe(p);
934
935 return arch_prepare_kprobe_ftrace(p);
936}
937
938/* Caller must lock kprobe_mutex */
939static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
940{
941 int ret;
942
943 ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
944 (unsigned long)p->addr, 0, 0);
945 WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
946 kprobe_ftrace_enabled++;
947 if (kprobe_ftrace_enabled == 1) {
948 ret = register_ftrace_function(&kprobe_ftrace_ops);
949 WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
950 }
951}
952
953/* Caller must lock kprobe_mutex */
954static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
955{
956 int ret;
957
958 kprobe_ftrace_enabled--;
959 if (kprobe_ftrace_enabled == 0) {
960 ret = unregister_ftrace_function(&kprobe_ftrace_ops);
961 WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
962 }
963 ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
964 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966}
967#else /* !KPROBES_CAN_USE_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0)
971#endif
972
973/* Arm a kprobe with text_mutex */ 910/* Arm a kprobe with text_mutex */
974static void __kprobes arm_kprobe(struct kprobe *kp) 911static void __kprobes arm_kprobe(struct kprobe *kp)
975{ 912{
976 if (unlikely(kprobe_ftrace(kp))) {
977 arm_kprobe_ftrace(kp);
978 return;
979 }
980 /* 913 /*
981 * Here, since __arm_kprobe() doesn't use stop_machine(), 914 * Here, since __arm_kprobe() doesn't use stop_machine(),
982 * this doesn't cause deadlock on text_mutex. So, we don't 915 * this doesn't cause deadlock on text_mutex. So, we don't
@@ -988,15 +921,11 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
988} 921}
989 922
990/* Disarm a kprobe with text_mutex */ 923/* Disarm a kprobe with text_mutex */
991static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) 924static void __kprobes disarm_kprobe(struct kprobe *kp)
992{ 925{
993 if (unlikely(kprobe_ftrace(kp))) {
994 disarm_kprobe_ftrace(kp);
995 return;
996 }
997 /* Ditto */ 926 /* Ditto */
998 mutex_lock(&text_mutex); 927 mutex_lock(&text_mutex);
999 __disarm_kprobe(kp, reopt); 928 __disarm_kprobe(kp, true);
1000 mutex_unlock(&text_mutex); 929 mutex_unlock(&text_mutex);
1001} 930}
1002 931
@@ -1084,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1084 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1085 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1086 if (likely(rp)) { 1015 if (likely(rp)) {
1087 raw_spin_lock(&rp->lock); 1016 spin_lock(&rp->lock);
1088 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1089 raw_spin_unlock(&rp->lock); 1018 spin_unlock(&rp->lock);
1090 } else 1019 } else
1091 /* Unregistering */ 1020 /* Unregistering */
1092 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1097,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1097__acquires(hlist_lock) 1026__acquires(hlist_lock)
1098{ 1027{
1099 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1100 raw_spinlock_t *hlist_lock; 1029 spinlock_t *hlist_lock;
1101 1030
1102 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1103 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1104 raw_spin_lock_irqsave(hlist_lock, *flags); 1033 spin_lock_irqsave(hlist_lock, *flags);
1105} 1034}
1106 1035
1107static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1108 unsigned long *flags) 1037 unsigned long *flags)
1109__acquires(hlist_lock) 1038__acquires(hlist_lock)
1110{ 1039{
1111 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1112 raw_spin_lock_irqsave(hlist_lock, *flags); 1041 spin_lock_irqsave(hlist_lock, *flags);
1113} 1042}
1114 1043
1115void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1117,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1117__releases(hlist_lock) 1046__releases(hlist_lock)
1118{ 1047{
1119 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1120 raw_spinlock_t *hlist_lock; 1049 spinlock_t *hlist_lock;
1121 1050
1122 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1123 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1052 spin_unlock_irqrestore(hlist_lock, *flags);
1124} 1053}
1125 1054
1126static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1127 unsigned long *flags) 1056 unsigned long *flags)
1128__releases(hlist_lock) 1057__releases(hlist_lock)
1129{ 1058{
1130 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1131 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1060 spin_unlock_irqrestore(hlist_lock, *flags);
1132} 1061}
1133 1062
1134/* 1063/*
@@ -1148,7 +1077,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1148 /* Early boot. kretprobe_table_locks not yet initialized. */ 1077 /* Early boot. kretprobe_table_locks not yet initialized. */
1149 return; 1078 return;
1150 1079
1151 INIT_HLIST_HEAD(&empty_rp);
1152 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1080 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1153 head = &kretprobe_inst_table[hash]; 1081 head = &kretprobe_inst_table[hash];
1154 kretprobe_table_lock(hash, &flags); 1082 kretprobe_table_lock(hash, &flags);
@@ -1157,6 +1085,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1157 recycle_rp_inst(ri, &empty_rp); 1085 recycle_rp_inst(ri, &empty_rp);
1158 } 1086 }
1159 kretprobe_table_unlock(hash, &flags); 1087 kretprobe_table_unlock(hash, &flags);
1088 INIT_HLIST_HEAD(&empty_rp);
1160 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
1161 hlist_del(&ri->hlist); 1090 hlist_del(&ri->hlist);
1162 kfree(ri); 1091 kfree(ri);
@@ -1215,6 +1144,12 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1215 if (p->post_handler && !ap->post_handler) 1144 if (p->post_handler && !ap->post_handler)
1216 ap->post_handler = aggr_post_handler; 1145 ap->post_handler = aggr_post_handler;
1217 1146
1147 if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
1148 ap->flags &= ~KPROBE_FLAG_DISABLED;
1149 if (!kprobes_all_disarmed)
1150 /* Arm the breakpoint again. */
1151 __arm_kprobe(ap);
1152 }
1218 return 0; 1153 return 0;
1219} 1154}
1220 1155
@@ -1254,22 +1189,11 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1254 int ret = 0; 1189 int ret = 0;
1255 struct kprobe *ap = orig_p; 1190 struct kprobe *ap = orig_p;
1256 1191
1257 /* For preparing optimization, jump_label_text_reserved() is called */
1258 jump_label_lock();
1259 /*
1260 * Get online CPUs to avoid text_mutex deadlock.with stop machine,
1261 * which is invoked by unoptimize_kprobe() in add_new_kprobe()
1262 */
1263 get_online_cpus();
1264 mutex_lock(&text_mutex);
1265
1266 if (!kprobe_aggrprobe(orig_p)) { 1192 if (!kprobe_aggrprobe(orig_p)) {
1267 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ 1193 /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
1268 ap = alloc_aggr_kprobe(orig_p); 1194 ap = alloc_aggr_kprobe(orig_p);
1269 if (!ap) { 1195 if (!ap)
1270 ret = -ENOMEM; 1196 return -ENOMEM;
1271 goto out;
1272 }
1273 init_aggr_kprobe(ap, orig_p); 1197 init_aggr_kprobe(ap, orig_p);
1274 } else if (kprobe_unused(ap)) 1198 } else if (kprobe_unused(ap))
1275 /* This probe is going to die. Rescue it */ 1199 /* This probe is going to die. Rescue it */
@@ -1289,7 +1213,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1289 * free aggr_probe. It will be used next time, or 1213 * free aggr_probe. It will be used next time, or
1290 * freed by unregister_kprobe. 1214 * freed by unregister_kprobe.
1291 */ 1215 */
1292 goto out; 1216 return ret;
1293 1217
1294 /* Prepare optimized instructions if possible. */ 1218 /* Prepare optimized instructions if possible. */
1295 prepare_optimized_kprobe(ap); 1219 prepare_optimized_kprobe(ap);
@@ -1304,20 +1228,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
1304 1228
1305 /* Copy ap's insn slot to p */ 1229 /* Copy ap's insn slot to p */
1306 copy_kprobe(ap, p); 1230 copy_kprobe(ap, p);
1307 ret = add_new_kprobe(ap, p); 1231 return add_new_kprobe(ap, p);
1308
1309out:
1310 mutex_unlock(&text_mutex);
1311 put_online_cpus();
1312 jump_label_unlock();
1313
1314 if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
1315 ap->flags &= ~KPROBE_FLAG_DISABLED;
1316 if (!kprobes_all_disarmed)
1317 /* Arm the breakpoint again. */
1318 arm_kprobe(ap);
1319 }
1320 return ret;
1321} 1232}
1322 1233
1323static int __kprobes in_kprobes_functions(unsigned long addr) 1234static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -1402,96 +1313,69 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1402 return ret; 1313 return ret;
1403} 1314}
1404 1315
1405static __kprobes int check_kprobe_address_safe(struct kprobe *p, 1316int __kprobes register_kprobe(struct kprobe *p)
1406 struct module **probed_mod)
1407{ 1317{
1408 int ret = 0; 1318 int ret = 0;
1409 unsigned long ftrace_addr; 1319 struct kprobe *old_p;
1320 struct module *probed_mod;
1321 kprobe_opcode_t *addr;
1410 1322
1411 /* 1323 addr = kprobe_addr(p);
1412 * If the address is located on a ftrace nop, set the 1324 if (IS_ERR(addr))
1413 * breakpoint to the following instruction. 1325 return PTR_ERR(addr);
1414 */ 1326 p->addr = addr;
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1327
1416 if (ftrace_addr) { 1328 ret = check_kprobe_rereg(p);
1417#ifdef KPROBES_CAN_USE_FTRACE 1329 if (ret)
1418 /* Given address is not on the instruction boundary */ 1330 return ret;
1419 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */
1423 return -EINVAL;
1424#endif
1425 }
1426 1331
1427 jump_label_lock(); 1332 jump_label_lock();
1428 preempt_disable(); 1333 preempt_disable();
1429
1430 /* Ensure it is not in reserved area nor out of text */
1431 if (!kernel_text_address((unsigned long) p->addr) || 1334 if (!kernel_text_address((unsigned long) p->addr) ||
1432 in_kprobes_functions((unsigned long) p->addr) || 1335 in_kprobes_functions((unsigned long) p->addr) ||
1433 jump_label_text_reserved(p->addr, p->addr)) { 1336 ftrace_text_reserved(p->addr, p->addr) ||
1434 ret = -EINVAL; 1337 jump_label_text_reserved(p->addr, p->addr))
1435 goto out; 1338 goto fail_with_jump_label;
1436 } 1339
1340 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1341 p->flags &= KPROBE_FLAG_DISABLED;
1437 1342
1438 /* Check if are we probing a module */ 1343 /*
1439 *probed_mod = __module_text_address((unsigned long) p->addr); 1344 * Check if are we probing a module.
1440 if (*probed_mod) { 1345 */
1346 probed_mod = __module_text_address((unsigned long) p->addr);
1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1441 /* 1350 /*
1442 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1443 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
1444 */ 1353 */
1445 if (unlikely(!try_module_get(*probed_mod))) { 1354 if (unlikely(!try_module_get(probed_mod)))
1446 ret = -ENOENT; 1355 goto fail_with_jump_label;
1447 goto out;
1448 }
1449 1356
1450 /* 1357 /*
1451 * If the module freed .init.text, we couldn't insert 1358 * If the module freed .init.text, we couldn't insert
1452 * kprobes in there. 1359 * kprobes in there.
1453 */ 1360 */
1454 if (within_module_init((unsigned long)p->addr, *probed_mod) && 1361 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1455 (*probed_mod)->state != MODULE_STATE_COMING) { 1362 probed_mod->state != MODULE_STATE_COMING) {
1456 module_put(*probed_mod); 1363 module_put(probed_mod);
1457 *probed_mod = NULL; 1364 goto fail_with_jump_label;
1458 ret = -ENOENT;
1459 } 1365 }
1366 /* ret will be updated by following code */
1460 } 1367 }
1461out:
1462 preempt_enable(); 1368 preempt_enable();
1463 jump_label_unlock(); 1369 jump_label_unlock();
1464 1370
1465 return ret;
1466}
1467
1468int __kprobes register_kprobe(struct kprobe *p)
1469{
1470 int ret;
1471 struct kprobe *old_p;
1472 struct module *probed_mod;
1473 kprobe_opcode_t *addr;
1474
1475 /* Adjust probe address from symbol */
1476 addr = kprobe_addr(p);
1477 if (IS_ERR(addr))
1478 return PTR_ERR(addr);
1479 p->addr = addr;
1480
1481 ret = check_kprobe_rereg(p);
1482 if (ret)
1483 return ret;
1484
1485 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1486 p->flags &= KPROBE_FLAG_DISABLED;
1487 p->nmissed = 0; 1371 p->nmissed = 0;
1488 INIT_LIST_HEAD(&p->list); 1372 INIT_LIST_HEAD(&p->list);
1373 mutex_lock(&kprobe_mutex);
1489 1374
1490 ret = check_kprobe_address_safe(p, &probed_mod); 1375 jump_label_lock(); /* needed to call jump_label_text_reserved() */
1491 if (ret)
1492 return ret;
1493 1376
1494 mutex_lock(&kprobe_mutex); 1377 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1378 mutex_lock(&text_mutex);
1495 1379
1496 old_p = get_kprobe(p->addr); 1380 old_p = get_kprobe(p->addr);
1497 if (old_p) { 1381 if (old_p) {
@@ -1500,9 +1384,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1500 goto out; 1384 goto out;
1501 } 1385 }
1502 1386
1503 mutex_lock(&text_mutex); /* Avoiding text modification */ 1387 ret = arch_prepare_kprobe(p);
1504 ret = prepare_kprobe(p);
1505 mutex_unlock(&text_mutex);
1506 if (ret) 1388 if (ret)
1507 goto out; 1389 goto out;
1508 1390
@@ -1511,18 +1393,26 @@ int __kprobes register_kprobe(struct kprobe *p)
1511 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1393 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
1512 1394
1513 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1395 if (!kprobes_all_disarmed && !kprobe_disabled(p))
1514 arm_kprobe(p); 1396 __arm_kprobe(p);
1515 1397
1516 /* Try to optimize kprobe */ 1398 /* Try to optimize kprobe */
1517 try_to_optimize_kprobe(p); 1399 try_to_optimize_kprobe(p);
1518 1400
1519out: 1401out:
1402 mutex_unlock(&text_mutex);
1403 put_online_cpus();
1404 jump_label_unlock();
1520 mutex_unlock(&kprobe_mutex); 1405 mutex_unlock(&kprobe_mutex);
1521 1406
1522 if (probed_mod) 1407 if (probed_mod)
1523 module_put(probed_mod); 1408 module_put(probed_mod);
1524 1409
1525 return ret; 1410 return ret;
1411
1412fail_with_jump_label:
1413 preempt_enable();
1414 jump_label_unlock();
1415 return ret;
1526} 1416}
1527EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1528 1418
@@ -1559,7 +1449,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1559 1449
1560 /* Try to disarm and disable this/parent probe */ 1450 /* Try to disarm and disable this/parent probe */
1561 if (p == orig_p || aggr_kprobe_disabled(orig_p)) { 1451 if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
1562 disarm_kprobe(orig_p, true); 1452 disarm_kprobe(orig_p);
1563 orig_p->flags |= KPROBE_FLAG_DISABLED; 1453 orig_p->flags |= KPROBE_FLAG_DISABLED;
1564 } 1454 }
1565 } 1455 }
@@ -1773,22 +1663,18 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1773 1663
1774 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1775 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1776 raw_spin_lock_irqsave(&rp->lock, flags); 1666 spin_lock_irqsave(&rp->lock, flags);
1777 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1778 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1779 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1780 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1781 raw_spin_unlock_irqrestore(&rp->lock, flags); 1671 spin_unlock_irqrestore(&rp->lock, flags);
1782 1672
1783 ri->rp = rp; 1673 ri->rp = rp;
1784 ri->task = current; 1674 ri->task = current;
1785 1675
1786 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 1676 if (rp->entry_handler && rp->entry_handler(ri, regs))
1787 raw_spin_lock_irqsave(&rp->lock, flags);
1788 hlist_add_head(&ri->hlist, &rp->free_instances);
1789 raw_spin_unlock_irqrestore(&rp->lock, flags);
1790 return 0; 1677 return 0;
1791 }
1792 1678
1793 arch_prepare_kretprobe(ri, regs); 1679 arch_prepare_kretprobe(ri, regs);
1794 1680
@@ -1799,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1799 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1800 } else { 1686 } else {
1801 rp->nmissed++; 1687 rp->nmissed++;
1802 raw_spin_unlock_irqrestore(&rp->lock, flags); 1688 spin_unlock_irqrestore(&rp->lock, flags);
1803 } 1689 }
1804 return 0; 1690 return 0;
1805} 1691}
@@ -1835,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1835 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1836#endif 1722#endif
1837 } 1723 }
1838 raw_spin_lock_init(&rp->lock); 1724 spin_lock_init(&rp->lock);
1839 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1840 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1841 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -2073,7 +1959,7 @@ static int __init init_kprobes(void)
2073 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2074 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
2075 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
2076 raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 spin_lock_init(&(kretprobe_table_locks[i].lock));
2077 } 1963 }
2078 1964
2079 /* 1965 /*
@@ -2157,11 +2043,10 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
2157 2043
2158 if (!pp) 2044 if (!pp)
2159 pp = p; 2045 pp = p;
2160 seq_printf(pi, "%s%s%s%s\n", 2046 seq_printf(pi, "%s%s%s\n",
2161 (kprobe_gone(p) ? "[GONE]" : ""), 2047 (kprobe_gone(p) ? "[GONE]" : ""),
2162 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), 2048 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
2163 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), 2049 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
2164 (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
2165} 2050}
2166 2051
2167static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 2052static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -2240,12 +2125,14 @@ static void __kprobes arm_all_kprobes(void)
2240 goto already_enabled; 2125 goto already_enabled;
2241 2126
2242 /* Arming kprobes doesn't optimize kprobe itself */ 2127 /* Arming kprobes doesn't optimize kprobe itself */
2128 mutex_lock(&text_mutex);
2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2129 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2244 head = &kprobe_table[i]; 2130 head = &kprobe_table[i];
2245 hlist_for_each_entry_rcu(p, node, head, hlist) 2131 hlist_for_each_entry_rcu(p, node, head, hlist)
2246 if (!kprobe_disabled(p)) 2132 if (!kprobe_disabled(p))
2247 arm_kprobe(p); 2133 __arm_kprobe(p);
2248 } 2134 }
2135 mutex_unlock(&text_mutex);
2249 2136
2250 kprobes_all_disarmed = false; 2137 kprobes_all_disarmed = false;
2251 printk(KERN_INFO "Kprobes globally enabled\n"); 2138 printk(KERN_INFO "Kprobes globally enabled\n");
@@ -2273,13 +2160,15 @@ static void __kprobes disarm_all_kprobes(void)
2273 kprobes_all_disarmed = true; 2160 kprobes_all_disarmed = true;
2274 printk(KERN_INFO "Kprobes globally disabled\n"); 2161 printk(KERN_INFO "Kprobes globally disabled\n");
2275 2162
2163 mutex_lock(&text_mutex);
2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2164 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2277 head = &kprobe_table[i]; 2165 head = &kprobe_table[i];
2278 hlist_for_each_entry_rcu(p, node, head, hlist) { 2166 hlist_for_each_entry_rcu(p, node, head, hlist) {
2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2167 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2280 disarm_kprobe(p, false); 2168 __disarm_kprobe(p, false);
2281 } 2169 }
2282 } 2170 }
2171 mutex_unlock(&text_mutex);
2283 mutex_unlock(&kprobe_mutex); 2172 mutex_unlock(&kprobe_mutex);
2284 2173
2285 /* Wait for disarming all kprobes by optimizer */ 2174 /* Wait for disarming all kprobes by optimizer */
@@ -2309,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2309 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2310{ 2199{
2311 char buf[32]; 2200 char buf[32];
2312 size_t buf_size; 2201 int buf_size;
2313 2202
2314 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2315 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
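The kprobes.c hunks above fold the address checks and the text_mutex / CPU-hotplug locking back into register_kprobe() itself (dropping the separate check_kprobe_address_safe() helper and the ftrace-based attach path) and return the kretprobe bookkeeping to plain spinlock_t. For orientation only, here is a minimal, hypothetical module that exercises the register_kprobe() path modified in this hunk; the probed symbol name and the handler are illustrative assumptions, not part of the patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Illustrative only: probing do_fork is an assumption for the example. */
static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("kprobe: hit %s at %p\n", p->symbol_name, p->addr);
        return 0;       /* let the probed instruction run normally */
}

static struct kprobe example_kp = {
        .symbol_name    = "do_fork",
        .pre_handler    = example_pre,
};

static int __init example_init(void)
{
        /* Goes through the register_kprobe() path shown above. */
        return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
        unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

As the hunk's comment notes, the only flag a caller may pass in is KPROBE_FLAG_DISABLED, so a probe could also be registered disarmed and enabled later.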
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6ada93c23a9..3b053c04dd8 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,11 +11,10 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/sysfs.h> 13#include <linux/sysfs.h>
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/stat.h>
19#include <linux/sched.h> 18#include <linux/sched.h>
20#include <linux/capability.h> 19#include <linux/capability.h>
21 20
@@ -26,6 +25,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 25static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 26 __ATTR(_name, 0644, _name##_show, _name##_store)
28 27
28#if defined(CONFIG_HOTPLUG)
29/* current uevent sequence number */ 29/* current uevent sequence number */
30static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
31 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -53,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53 return count; 53 return count;
54} 54}
55KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
56 56#endif
57 57
58#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
59static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -140,23 +140,6 @@ static ssize_t fscaps_show(struct kobject *kobj,
140} 140}
141KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
142 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
160/* 143/*
161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
162 */ 145 */
@@ -185,8 +168,10 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
185 168
186static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
187 &fscaps_attr.attr, 170 &fscaps_attr.attr,
171#if defined(CONFIG_HOTPLUG)
188 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
189 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
174#endif
190#ifdef CONFIG_PROFILING 175#ifdef CONFIG_PROFILING
191 &profiling_attr.attr, 176 &profiling_attr.attr,
192#endif 177#endif
@@ -196,7 +181,6 @@ static struct attribute * kernel_attrs[] = {
196 &kexec_crash_size_attr.attr, 181 &kexec_crash_size_attr.attr,
197 &vmcoreinfo_attr.attr, 182 &vmcoreinfo_attr.attr,
198#endif 183#endif
199 &rcu_expedited_attr.attr,
200 NULL 184 NULL
201}; 185};
202 186
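The ksysfs.c hunks restore the #if defined(CONFIG_HOTPLUG) guard around the uevent attributes and drop the rcu_expedited attribute. For context, a minimal sketch of the show-callback pattern these attributes follow; the attribute name and backing value are hypothetical, and __ATTR_RO() is the generic sysfs helper that the file's local KERNEL_ATTR_RO() macro (seen in the hunk header above) wraps.

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Hypothetical read-only attribute, same shape as uevent_seqnum_show() above. */
static unsigned long example_value;

static ssize_t example_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%lu\n", example_value);
}
static struct kobj_attribute example_attr = __ATTR_RO(example);

/* example_attr.attr would then be listed in kernel_attrs[] so the file
 * appears under /sys/kernel/ alongside the attributes shown in the hunk. */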
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9ba..4ba7cccb499 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,11 +12,10 @@
12#include <linux/cpuset.h> 12#include <linux/cpuset.h>
13#include <linux/unistd.h> 13#include <linux/unistd.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/export.h> 15#include <linux/module.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h>
20#include <trace/events/sched.h> 19#include <trace/events/sched.h>
21 20
22static DEFINE_SPINLOCK(kthread_create_lock); 21static DEFINE_SPINLOCK(kthread_create_lock);
@@ -38,20 +37,11 @@ struct kthread_create_info
38}; 37};
39 38
40struct kthread { 39struct kthread {
41 unsigned long flags; 40 int should_stop;
42 unsigned int cpu;
43 void *data; 41 void *data;
44 struct completion parked;
45 struct completion exited; 42 struct completion exited;
46}; 43};
47 44
48enum KTHREAD_BITS {
49 KTHREAD_IS_PER_CPU = 0,
50 KTHREAD_SHOULD_STOP,
51 KTHREAD_SHOULD_PARK,
52 KTHREAD_IS_PARKED,
53};
54
55#define to_kthread(tsk) \ 45#define to_kthread(tsk) \
56 container_of((tsk)->vfork_done, struct kthread, exited) 46 container_of((tsk)->vfork_done, struct kthread, exited)
57 47
@@ -62,54 +52,13 @@ enum KTHREAD_BITS {
62 * and this will return true. You should then return, and your return 52 * and this will return true. You should then return, and your return
63 * value will be passed through to kthread_stop(). 53 * value will be passed through to kthread_stop().
64 */ 54 */
65bool kthread_should_stop(void) 55int kthread_should_stop(void)
66{ 56{
67 return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); 57 return to_kthread(current)->should_stop;
68} 58}
69EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
70 60
71/** 61/**
72 * kthread_should_park - should this kthread park now?
73 *
74 * When someone calls kthread_park() on your kthread, it will be woken
75 * and this will return true. You should then do the necessary
76 * cleanup and call kthread_parkme()
77 *
78 * Similar to kthread_should_stop(), but this keeps the thread alive
79 * and in a park position. kthread_unpark() "restarts" the thread and
80 * calls the thread function again.
81 */
82bool kthread_should_park(void)
83{
84 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
85}
86
87/**
88 * kthread_freezable_should_stop - should this freezable kthread return now?
89 * @was_frozen: optional out parameter, indicates whether %current was frozen
90 *
91 * kthread_should_stop() for freezable kthreads, which will enter
92 * refrigerator if necessary. This function is safe from kthread_stop() /
93 * freezer deadlock and freezable kthreads should use this function instead
94 * of calling try_to_freeze() directly.
95 */
96bool kthread_freezable_should_stop(bool *was_frozen)
97{
98 bool frozen = false;
99
100 might_sleep();
101
102 if (unlikely(freezing(current)))
103 frozen = __refrigerator(true);
104
105 if (was_frozen)
106 *was_frozen = frozen;
107
108 return kthread_should_stop();
109}
110EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
111
112/**
113 * kthread_data - return data value specified on kthread creation 62 * kthread_data - return data value specified on kthread creation
114 * @task: kthread task in question 63 * @task: kthread task in question
115 * 64 *
@@ -122,24 +71,6 @@ void *kthread_data(struct task_struct *task)
122 return to_kthread(task)->data; 71 return to_kthread(task)->data;
123} 72}
124 73
125static void __kthread_parkme(struct kthread *self)
126{
127 __set_current_state(TASK_INTERRUPTIBLE);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked);
131 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE);
133 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING);
136}
137
138void kthread_parkme(void)
139{
140 __kthread_parkme(to_kthread(current));
141}
142
143static int kthread(void *_create) 74static int kthread(void *_create)
144{ 75{
145 /* Copy data: it's on kthread's stack */ 76 /* Copy data: it's on kthread's stack */
@@ -149,10 +80,9 @@ static int kthread(void *_create)
149 struct kthread self; 80 struct kthread self;
150 int ret; 81 int ret;
151 82
152 self.flags = 0; 83 self.should_stop = 0;
153 self.data = data; 84 self.data = data;
154 init_completion(&self.exited); 85 init_completion(&self.exited);
155 init_completion(&self.parked);
156 current->vfork_done = &self.exited; 86 current->vfork_done = &self.exited;
157 87
158 /* OK, tell user we're spawned, wait for stop or wakeup */ 88 /* OK, tell user we're spawned, wait for stop or wakeup */
@@ -162,11 +92,9 @@ static int kthread(void *_create)
162 schedule(); 92 schedule();
163 93
164 ret = -EINTR; 94 ret = -EINTR;
165 95 if (!self.should_stop)
166 if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
167 __kthread_parkme(&self);
168 ret = threadfn(data); 96 ret = threadfn(data);
169 } 97
170 /* we can't just return, we must preserve "self" on stack */ 98 /* we can't just return, we must preserve "self" on stack */
171 do_exit(ret); 99 do_exit(ret);
172} 100}
@@ -219,7 +147,8 @@ static void create_kthread(struct kthread_create_info *create)
219 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
220 */ 148 */
221struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
222 void *data, int node, 150 void *data,
151 int node,
223 const char namefmt[], 152 const char namefmt[],
224 ...) 153 ...)
225{ 154{
@@ -256,13 +185,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 185}
257EXPORT_SYMBOL(kthread_create_on_node); 186EXPORT_SYMBOL(kthread_create_on_node);
258 187
259static void __kthread_bind(struct task_struct *p, unsigned int cpu)
260{
261 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND;
264}
265
266/** 188/**
267 * kthread_bind - bind a just-created kthread to a cpu. 189 * kthread_bind - bind a just-created kthread to a cpu.
268 * @p: thread created by kthread_create(). 190 * @p: thread created by kthread_create().
@@ -279,110 +201,12 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
279 WARN_ON(1); 201 WARN_ON(1);
280 return; 202 return;
281 } 203 }
282 __kthread_bind(p, cpu);
283}
284EXPORT_SYMBOL(kthread_bind);
285
286/**
287 * kthread_create_on_cpu - Create a cpu bound kthread
288 * @threadfn: the function to run until signal_pending(current).
289 * @data: data ptr for @threadfn.
290 * @cpu: The cpu on which the thread should be bound,
291 * @namefmt: printf-style name for the thread. Format is restricted
292 * to "name.*%u". Code fills in cpu number.
293 *
294 * Description: This helper function creates and names a kernel thread
295 * The thread will be woken and put into park mode.
296 */
297struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
298 void *data, unsigned int cpu,
299 const char *namefmt)
300{
301 struct task_struct *p;
302
303 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
304 cpu);
305 if (IS_ERR(p))
306 return p;
307 set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
308 to_kthread(p)->cpu = cpu;
309 /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
310 kthread_park(p);
311 return p;
312}
313 204
314static struct kthread *task_get_live_kthread(struct task_struct *k) 205 /* It's safe because the task is inactive. */
315{ 206 do_set_cpus_allowed(p, cpumask_of(cpu));
316 struct kthread *kthread; 207 p->flags |= PF_THREAD_BOUND;
317
318 get_task_struct(k);
319 kthread = to_kthread(k);
320 /* It might have exited */
321 barrier();
322 if (k->vfork_done != NULL)
323 return kthread;
324 return NULL;
325}
326
327/**
328 * kthread_unpark - unpark a thread created by kthread_create().
329 * @k: thread created by kthread_create().
330 *
331 * Sets kthread_should_park() for @k to return false, wakes it, and
332 * waits for it to return. If the thread is marked percpu then its
333 * bound to the cpu again.
334 */
335void kthread_unpark(struct task_struct *k)
336{
337 struct kthread *kthread = task_get_live_kthread(k);
338
339 if (kthread) {
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k);
354}
355
356/**
357 * kthread_park - park a thread created by kthread_create().
358 * @k: thread created by kthread_create().
359 *
360 * Sets kthread_should_park() for @k to return true, wakes it, and
361 * waits for it to return. This can also be called after kthread_create()
362 * instead of calling wake_up_process(): the thread will park without
363 * calling threadfn().
364 *
365 * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
366 * If called by the kthread itself just the park bit is set.
367 */
368int kthread_park(struct task_struct *k)
369{
370 struct kthread *kthread = task_get_live_kthread(k);
371 int ret = -ENOSYS;
372
373 if (kthread) {
374 if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
375 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
376 if (k != current) {
377 wake_up_process(k);
378 wait_for_completion(&kthread->parked);
379 }
380 }
381 ret = 0;
382 }
383 put_task_struct(k);
384 return ret;
385} 208}
209EXPORT_SYMBOL(kthread_bind);
386 210
387/** 211/**
388 * kthread_stop - stop a thread created by kthread_create(). 212 * kthread_stop - stop a thread created by kthread_create().
@@ -401,13 +225,16 @@ int kthread_park(struct task_struct *k)
401 */ 225 */
402int kthread_stop(struct task_struct *k) 226int kthread_stop(struct task_struct *k)
403{ 227{
404 struct kthread *kthread = task_get_live_kthread(k); 228 struct kthread *kthread;
405 int ret; 229 int ret;
406 230
407 trace_sched_kthread_stop(k); 231 trace_sched_kthread_stop(k);
408 if (kthread) { 232 get_task_struct(k);
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 233
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 234 kthread = to_kthread(k);
235 barrier(); /* it might have exited */
236 if (k->vfork_done != NULL) {
237 kthread->should_stop = 1;
411 wake_up_process(k); 238 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 239 wait_for_completion(&kthread->exited);
413 } 240 }
@@ -428,9 +255,9 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 255 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 256 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 257 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_MEMORY]); 258 set_mems_allowed(node_states[N_HIGH_MEMORY]);
432 259
433 current->flags |= PF_NOFREEZE; 260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
434 261
435 for (;;) { 262 for (;;) {
436 set_current_state(TASK_INTERRUPTIBLE); 263 set_current_state(TASK_INTERRUPTIBLE);
@@ -508,12 +335,16 @@ repeat:
508 struct kthread_work, node); 335 struct kthread_work, node);
509 list_del_init(&work->node); 336 list_del_init(&work->node);
510 } 337 }
511 worker->current_work = work;
512 spin_unlock_irq(&worker->lock); 338 spin_unlock_irq(&worker->lock);
513 339
514 if (work) { 340 if (work) {
515 __set_current_state(TASK_RUNNING); 341 __set_current_state(TASK_RUNNING);
516 work->func(work); 342 work->func(work);
343 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
344 work->done_seq = work->queue_seq;
345 smp_mb(); /* mb worker-b1 paired with flush-b0 */
346 if (atomic_read(&work->flushing))
347 wake_up_all(&work->done);
517 } else if (!freezing(current)) 348 } else if (!freezing(current))
518 schedule(); 349 schedule();
519 350
@@ -522,19 +353,6 @@ repeat:
522} 353}
523EXPORT_SYMBOL_GPL(kthread_worker_fn); 354EXPORT_SYMBOL_GPL(kthread_worker_fn);
524 355
525/* insert @work before @pos in @worker */
526static void insert_kthread_work(struct kthread_worker *worker,
527 struct kthread_work *work,
528 struct list_head *pos)
529{
530 lockdep_assert_held(&worker->lock);
531
532 list_add_tail(&work->node, pos);
533 work->worker = worker;
534 if (likely(worker->task))
535 wake_up_process(worker->task);
536}
537
538/** 356/**
539 * queue_kthread_work - queue a kthread_work 357 * queue_kthread_work - queue a kthread_work
540 * @worker: target kthread_worker 358 * @worker: target kthread_worker
@@ -552,7 +370,10 @@ bool queue_kthread_work(struct kthread_worker *worker,
552 370
553 spin_lock_irqsave(&worker->lock, flags); 371 spin_lock_irqsave(&worker->lock, flags);
554 if (list_empty(&work->node)) { 372 if (list_empty(&work->node)) {
555 insert_kthread_work(worker, work, &worker->work_list); 373 list_add_tail(&work->node, &worker->work_list);
374 work->queue_seq++;
375 if (likely(worker->task))
376 wake_up_process(worker->task);
556 ret = true; 377 ret = true;
557 } 378 }
558 spin_unlock_irqrestore(&worker->lock, flags); 379 spin_unlock_irqrestore(&worker->lock, flags);
@@ -560,18 +381,6 @@ bool queue_kthread_work(struct kthread_worker *worker,
560} 381}
561EXPORT_SYMBOL_GPL(queue_kthread_work); 382EXPORT_SYMBOL_GPL(queue_kthread_work);
562 383
563struct kthread_flush_work {
564 struct kthread_work work;
565 struct completion done;
566};
567
568static void kthread_flush_work_fn(struct kthread_work *work)
569{
570 struct kthread_flush_work *fwork =
571 container_of(work, struct kthread_flush_work, work);
572 complete(&fwork->done);
573}
574
575/** 384/**
576 * flush_kthread_work - flush a kthread_work 385 * flush_kthread_work - flush a kthread_work
577 * @work: work to flush 386 * @work: work to flush
@@ -580,38 +389,40 @@ static void kthread_flush_work_fn(struct kthread_work *work)
580 */ 389 */
581void flush_kthread_work(struct kthread_work *work) 390void flush_kthread_work(struct kthread_work *work)
582{ 391{
583 struct kthread_flush_work fwork = { 392 int seq = work->queue_seq;
584 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
585 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
586 };
587 struct kthread_worker *worker;
588 bool noop = false;
589 393
590retry: 394 atomic_inc(&work->flushing);
591 worker = work->worker;
592 if (!worker)
593 return;
594
595 spin_lock_irq(&worker->lock);
596 if (work->worker != worker) {
597 spin_unlock_irq(&worker->lock);
598 goto retry;
599 }
600 395
601 if (!list_empty(&work->node)) 396 /*
602 insert_kthread_work(worker, &fwork.work, work->node.next); 397 * mb flush-b0 paired with worker-b1, to make sure either
603 else if (worker->current_work == work) 398 * worker sees the above increment or we see done_seq update.
604 insert_kthread_work(worker, &fwork.work, worker->work_list.next); 399 */
605 else 400 smp_mb__after_atomic_inc();
606 noop = true;
607 401
608 spin_unlock_irq(&worker->lock); 402 /* A - B <= 0 tests whether B is in front of A regardless of overflow */
403 wait_event(work->done, seq - work->done_seq <= 0);
404 atomic_dec(&work->flushing);
609 405
610 if (!noop) 406 /*
611 wait_for_completion(&fwork.done); 407 * rmb flush-b1 paired with worker-b0, to make sure our caller
408 * sees every change made by work->func().
409 */
410 smp_mb__after_atomic_dec();
612} 411}
613EXPORT_SYMBOL_GPL(flush_kthread_work); 412EXPORT_SYMBOL_GPL(flush_kthread_work);
614 413
414struct kthread_flush_work {
415 struct kthread_work work;
416 struct completion done;
417};
418
419static void kthread_flush_work_fn(struct kthread_work *work)
420{
421 struct kthread_flush_work *fwork =
422 container_of(work, struct kthread_flush_work, work);
423 complete(&fwork->done);
424}
425
615/** 426/**
616 * flush_kthread_worker - flush all current works on a kthread_worker 427 * flush_kthread_worker - flush all current works on a kthread_worker
617 * @worker: worker to flush 428 * @worker: worker to flush
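The kthread.c hunks revert the parking infrastructure (kthread_should_park(), kthread_park()/kthread_unpark(), kthread_create_on_cpu()) and return kthread_should_stop() to a plain should_stop integer. The caller-visible stop protocol is unchanged either way; a minimal sketch with hypothetical names follows.

#include <linux/kthread.h>
#include <linux/delay.h>

/* Hypothetical worker loop: kthread_stop() sets the stop condition and wakes
 * the task; the loop observes kthread_should_stop() and returns, and that
 * return value is handed back to kthread_stop(). */
static int example_thread(void *data)
{
        while (!kthread_should_stop()) {
                /* do one unit of work on "data" here */
                msleep_interruptible(100);
        }
        return 0;
}

/* Caller side (error handling elided):
 *
 *      struct task_struct *task;
 *
 *      task = kthread_run(example_thread, NULL, "example/%d", 0);
 *      ...
 *      kthread_stop(task);
 */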
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a..376066e1041 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,12 +53,12 @@
53#include <linux/notifier.h> 53#include <linux/notifier.h>
54#include <linux/spinlock.h> 54#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/export.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_RAW_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 raw_spin_lock_irqsave(&latency_lock, flags); 75 spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 raw_spin_unlock_irqrestore(&latency_lock, flags); 78 spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 raw_spin_lock_irqsave(&latency_lock, flags); 85 spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 raw_spin_unlock_irqrestore(&latency_lock, flags); 87 spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 raw_spin_lock_irqsave(&latency_lock, flags); 193 spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 raw_spin_unlock_irqrestore(&latency_lock, flags); 234 spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
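The latencytop.c hunks convert latency_lock from a raw spinlock back to an ordinary spinlock_t. The distinction only matters on PREEMPT_RT-style kernels, where spinlock_t can become a sleeping lock while raw_spinlock_t always spins; the irqsave locking pattern itself is identical. A minimal sketch with a hypothetical lock and counter:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);   /* hypothetical, mirrors latency_lock */
static unsigned long example_counter;

static void example_update(void)
{
        unsigned long flags;

        /* Same pattern as clear_all_latency_tracing() above: disable local
         * interrupts and take the lock, so the critical section is safe
         * against both other CPUs and local interrupt handlers. */
        spin_lock_irqsave(&example_lock, flags);
        example_counter++;
        spin_unlock_irqrestore(&example_lock, flags);
}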
diff --git a/kernel/lglock.c b/kernel/lglock.c
deleted file mode 100644
index 6535a667a5a..00000000000
--- a/kernel/lglock.c
+++ /dev/null
@@ -1,89 +0,0 @@
1/* See include/linux/lglock.h for description */
2#include <linux/module.h>
3#include <linux/lglock.h>
4#include <linux/cpu.h>
5#include <linux/string.h>
6
7/*
8 * Note there is no uninit, so lglocks cannot be defined in
9 * modules (but it's fine to use them from there)
10 * Could be added though, just undo lg_lock_init
11 */
12
13void lg_lock_init(struct lglock *lg, char *name)
14{
15 LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
16}
17EXPORT_SYMBOL(lg_lock_init);
18
19void lg_local_lock(struct lglock *lg)
20{
21 arch_spinlock_t *lock;
22
23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock);
27}
28EXPORT_SYMBOL(lg_local_lock);
29
30void lg_local_unlock(struct lglock *lg)
31{
32 arch_spinlock_t *lock;
33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock);
37 preempt_enable();
38}
39EXPORT_SYMBOL(lg_local_unlock);
40
41void lg_local_lock_cpu(struct lglock *lg, int cpu)
42{
43 arch_spinlock_t *lock;
44
45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock);
49}
50EXPORT_SYMBOL(lg_local_lock_cpu);
51
52void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{
54 arch_spinlock_t *lock;
55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock);
59 preempt_enable();
60}
61EXPORT_SYMBOL(lg_local_unlock_cpu);
62
63void lg_global_lock(struct lglock *lg)
64{
65 int i;
66
67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i);
72 arch_spin_lock(lock);
73 }
74}
75EXPORT_SYMBOL(lg_global_lock);
76
77void lg_global_unlock(struct lglock *lg)
78{
79 int i;
80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i);
85 arch_spin_unlock(lock);
86 }
87 preempt_enable();
88}
89EXPORT_SYMBOL(lg_global_unlock);
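The deleted kernel/lglock.c implemented local/global locks: lg_local_lock() takes only the current CPU's per-CPU lock, while lg_global_lock() takes every CPU's lock, so per-CPU writers stay cheap and a global pass pays the full cost. A hedged sketch of how a caller would use the removed helpers follows; the lock and list names are hypothetical, and DEFINE_LGLOCK() is assumed from include/linux/lglock.h, which is not part of this diff.

#include <linux/lglock.h>
#include <linux/percpu.h>
#include <linux/list.h>
#include <linux/cpumask.h>

/* Hypothetical per-CPU lists protected by one lglock; the list heads are
 * assumed to be initialized elsewhere, and lg_lock_init(&example_lglock,
 * "example_lglock") would be called once at init for lockdep. */
static DEFINE_PER_CPU(struct list_head, example_list);
DEFINE_LGLOCK(example_lglock);

static void example_add(struct list_head *entry)
{
        /* Fast path: only this CPU's underlying arch spinlock is taken, and
         * lg_local_lock() has already disabled preemption (see above). */
        lg_local_lock(&example_lglock);
        list_add(entry, this_cpu_ptr(&example_list));
        lg_local_unlock(&example_lglock);
}

static void example_walk_all(void (*fn)(struct list_head *))
{
        int cpu;
        struct list_head *pos;

        /* Slow path: every CPU's lock is taken, excluding all local adders. */
        lg_global_lock(&example_lglock);
        for_each_possible_cpu(cpu)
                list_for_each(pos, per_cpu_ptr(&example_list, cpu))
                        fn(pos);
        lg_global_unlock(&example_lglock);
}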
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350..447960603fb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -97,13 +97,8 @@ static int graph_lock(void)
97 97
98static inline int graph_unlock(void) 98static inline int graph_unlock(void)
99{ 99{
100 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { 100 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
101 /*
102 * The lockdep graph lock isn't locked while we expect it to
103 * be, we're confused now, bye!
104 */
105 return DEBUG_LOCKS_WARN_ON(1); 101 return DEBUG_LOCKS_WARN_ON(1);
106 }
107 102
108 current->lockdep_recursion--; 103 current->lockdep_recursion--;
109 arch_spin_unlock(&lockdep_lock); 104 arch_spin_unlock(&lockdep_lock);
@@ -140,9 +135,6 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
140static inline struct lock_class *hlock_class(struct held_lock *hlock) 135static inline struct lock_class *hlock_class(struct held_lock *hlock)
141{ 136{
142 if (!hlock->class_idx) { 137 if (!hlock->class_idx) {
143 /*
144 * Someone passed in garbage, we give up.
145 */
146 DEBUG_LOCKS_WARN_ON(1); 138 DEBUG_LOCKS_WARN_ON(1);
147 return NULL; 139 return NULL;
148 } 140 }
@@ -431,7 +423,6 @@ unsigned int max_lockdep_depth;
431 * about it later on, in lockdep_info(). 423 * about it later on, in lockdep_info().
432 */ 424 */
433static int lockdep_init_error; 425static int lockdep_init_error;
434static const char *lock_init_error;
435static unsigned long lockdep_init_trace_data[20]; 426static unsigned long lockdep_init_trace_data[20];
436static struct stack_trace lockdep_init_trace = { 427static struct stack_trace lockdep_init_trace = {
437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 428 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -500,32 +491,36 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
500 usage[i] = '\0'; 491 usage[i] = '\0';
501} 492}
502 493
503static void __print_lock_name(struct lock_class *class) 494static int __print_lock_name(struct lock_class *class)
504{ 495{
505 char str[KSYM_NAME_LEN]; 496 char str[KSYM_NAME_LEN];
506 const char *name; 497 const char *name;
507 498
508 name = class->name; 499 name = class->name;
509 if (!name) { 500 if (!name)
510 name = __get_key_name(class->key, str); 501 name = __get_key_name(class->key, str);
511 printk("%s", name); 502
512 } else { 503 return printk("%s", name);
513 printk("%s", name);
514 if (class->name_version > 1)
515 printk("#%d", class->name_version);
516 if (class->subclass)
517 printk("/%d", class->subclass);
518 }
519} 504}
520 505
521static void print_lock_name(struct lock_class *class) 506static void print_lock_name(struct lock_class *class)
522{ 507{
523 char usage[LOCK_USAGE_CHARS]; 508 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
509 const char *name;
524 510
525 get_usage_chars(class, usage); 511 get_usage_chars(class, usage);
526 512
527 printk(" ("); 513 name = class->name;
528 __print_lock_name(class); 514 if (!name) {
515 name = __get_key_name(class->key, str);
516 printk(" (%s", name);
517 } else {
518 printk(" (%s", name);
519 if (class->name_version > 1)
520 printk("#%d", class->name_version);
521 if (class->subclass)
522 printk("/%d", class->subclass);
523 }
529 printk("){%s}", usage); 524 printk("){%s}", usage);
530} 525}
531 526
@@ -565,12 +560,11 @@ static void lockdep_print_held_locks(struct task_struct *curr)
565 } 560 }
566} 561}
567 562
568static void print_kernel_ident(void) 563static void print_kernel_version(void)
569{ 564{
570 printk("%s %.*s %s\n", init_utsname()->release, 565 printk("%s %.*s\n", init_utsname()->release,
571 (int)strcspn(init_utsname()->version, " "), 566 (int)strcspn(init_utsname()->version, " "),
572 init_utsname()->version, 567 init_utsname()->version);
573 print_tainted());
574} 568}
575 569
576static int very_verbose(struct lock_class *class) 570static int very_verbose(struct lock_class *class)
@@ -654,7 +648,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
654 if (unlikely(!lockdep_initialized)) { 648 if (unlikely(!lockdep_initialized)) {
655 lockdep_init(); 649 lockdep_init();
656 lockdep_init_error = 1; 650 lockdep_init_error = 1;
657 lock_init_error = lock->name;
658 save_stack_trace(&lockdep_init_trace); 651 save_stack_trace(&lockdep_init_trace);
659 } 652 }
660#endif 653#endif
@@ -695,10 +688,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
695 */ 688 */
696 list_for_each_entry(class, hash_head, hash_entry) { 689 list_for_each_entry(class, hash_head, hash_entry) {
697 if (class->key == key) { 690 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
702 WARN_ON_ONCE(class->name != lock->name); 691 WARN_ON_ONCE(class->name != lock->name);
703 return class; 692 return class;
704 } 693 }
@@ -722,7 +711,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
722 711
723 class = look_up_lock_class(lock, subclass); 712 class = look_up_lock_class(lock, subclass);
724 if (likely(class)) 713 if (likely(class))
725 goto out_set_class_cache; 714 return class;
726 715
727 /* 716 /*
728 * Debug-check: all keys must be persistent! 717 * Debug-check: all keys must be persistent!
@@ -807,16 +796,11 @@ out_unlock_set:
807 graph_unlock(); 796 graph_unlock();
808 raw_local_irq_restore(flags); 797 raw_local_irq_restore(flags);
809 798
810out_set_class_cache:
811 if (!subclass || force) 799 if (!subclass || force)
812 lock->class_cache[0] = class; 800 lock->class_cache[0] = class;
813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 801 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
814 lock->class_cache[subclass] = class; 802 lock->class_cache[subclass] = class;
815 803
816 /*
817 * Hash collision, did we smoke some? We found a class with a matching
818 * hash but the subclass -- which is hashed in -- didn't match.
819 */
820 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 804 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
821 return NULL; 805 return NULL;
822 806
@@ -943,7 +927,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
943 unsigned long nr; 927 unsigned long nr;
944 928
945 nr = lock - list_entries; 929 nr = lock - list_entries;
946 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ 930 WARN_ON(nr >= nr_list_entries);
947 lock->parent = parent; 931 lock->parent = parent;
948 lock->class->dep_gen_id = lockdep_dependency_gen_id; 932 lock->class->dep_gen_id = lockdep_dependency_gen_id;
949} 933}
@@ -953,7 +937,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
953 unsigned long nr; 937 unsigned long nr;
954 938
955 nr = lock - list_entries; 939 nr = lock - list_entries;
956 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ 940 WARN_ON(nr >= nr_list_entries);
957 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 941 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
958} 942}
959 943
@@ -1146,11 +1130,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1146 if (debug_locks_silent) 1130 if (debug_locks_silent)
1147 return 0; 1131 return 0;
1148 1132
1149 printk("\n"); 1133 printk("\n=======================================================\n");
1150 printk("======================================================\n"); 1134 printk( "[ INFO: possible circular locking dependency detected ]\n");
1151 printk("[ INFO: possible circular locking dependency detected ]\n"); 1135 print_kernel_version();
1152 print_kernel_ident(); 1136 printk( "-------------------------------------------------------\n");
1153 printk("-------------------------------------------------------\n");
1154 printk("%s/%d is trying to acquire lock:\n", 1137 printk("%s/%d is trying to acquire lock:\n",
1155 curr->comm, task_pid_nr(curr)); 1138 curr->comm, task_pid_nr(curr));
1156 print_lock(check_src); 1139 print_lock(check_src);
@@ -1214,9 +1197,6 @@ static noinline int print_bfs_bug(int ret)
1214 if (!debug_locks_off_graph_unlock()) 1197 if (!debug_locks_off_graph_unlock())
1215 return 0; 1198 return 0;
1216 1199
1217 /*
1218 * Breadth-first-search failed, graph got corrupted?
1219 */
1220 WARN(1, "lockdep bfs error:%d\n", ret); 1200 WARN(1, "lockdep bfs error:%d\n", ret);
1221 1201
1222 return 0; 1202 return 0;
@@ -1484,12 +1464,11 @@ print_bad_irq_dependency(struct task_struct *curr,
1484 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1464 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1485 return 0; 1465 return 0;
1486 1466
1487 printk("\n"); 1467 printk("\n======================================================\n");
1488 printk("======================================================\n"); 1468 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1490 irqclass, irqclass); 1469 irqclass, irqclass);
1491 print_kernel_ident(); 1470 print_kernel_version();
1492 printk("------------------------------------------------------\n"); 1471 printk( "------------------------------------------------------\n");
1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1472 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1494 curr->comm, task_pid_nr(curr), 1473 curr->comm, task_pid_nr(curr),
1495 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1474 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1714,11 +1693,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1714 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1693 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1715 return 0; 1694 return 0;
1716 1695
1717 printk("\n"); 1696 printk("\n=============================================\n");
1718 printk("=============================================\n"); 1697 printk( "[ INFO: possible recursive locking detected ]\n");
1719 printk("[ INFO: possible recursive locking detected ]\n"); 1698 print_kernel_version();
1720 print_kernel_ident(); 1699 printk( "---------------------------------------------\n");
1721 printk("---------------------------------------------\n");
1722 printk("%s/%d is trying to acquire lock:\n", 1700 printk("%s/%d is trying to acquire lock:\n",
1723 curr->comm, task_pid_nr(curr)); 1701 curr->comm, task_pid_nr(curr));
1724 print_lock(next); 1702 print_lock(next);
@@ -1967,11 +1945,6 @@ out_bug:
1967 if (!debug_locks_off_graph_unlock()) 1945 if (!debug_locks_off_graph_unlock())
1968 return 0; 1946 return 0;
1969 1947
1970 /*
1971 * Clearly we all shouldn't be here, but since we made it we
1972 * can reliable say we messed up our state. See the above two
1973 * gotos for reasons why we could possibly end up here.
1974 */
1975 WARN_ON(1); 1948 WARN_ON(1);
1976 1949
1977 return 0; 1950 return 0;
@@ -2003,11 +1976,6 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2003 struct held_lock *hlock_curr, *hlock_next; 1976 struct held_lock *hlock_curr, *hlock_next;
2004 int i, j; 1977 int i, j;
2005 1978
2006 /*
2007 * We might need to take the graph lock, ensure we've got IRQs
2008 * disabled to make this an IRQ-safe lock.. for recursion reasons
2009 * lockdep won't complain about its own locking errors.
2010 */
2011 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1979 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2012 return 0; 1980 return 0;
2013 /* 1981 /*
@@ -2159,10 +2127,6 @@ static void check_chain_key(struct task_struct *curr)
2159 hlock = curr->held_locks + i; 2127 hlock = curr->held_locks + i;
2160 if (chain_key != hlock->prev_chain_key) { 2128 if (chain_key != hlock->prev_chain_key) {
2161 debug_locks_off(); 2129 debug_locks_off();
2162 /*
2163 * We got mighty confused, our chain keys don't match
2164 * with what we expect, someone trample on our task state?
2165 */
2166 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2130 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2167 curr->lockdep_depth, i, 2131 curr->lockdep_depth, i,
2168 (unsigned long long)chain_key, 2132 (unsigned long long)chain_key,
@@ -2170,9 +2134,6 @@ static void check_chain_key(struct task_struct *curr)
2170 return; 2134 return;
2171 } 2135 }
2172 id = hlock->class_idx - 1; 2136 id = hlock->class_idx - 1;
2173 /*
2174 * Whoops ran out of static storage again?
2175 */
2176 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2137 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2177 return; 2138 return;
2178 2139
@@ -2184,10 +2145,6 @@ static void check_chain_key(struct task_struct *curr)
2184 } 2145 }
2185 if (chain_key != curr->curr_chain_key) { 2146 if (chain_key != curr->curr_chain_key) {
2186 debug_locks_off(); 2147 debug_locks_off();
2187 /*
2188 * More smoking hash instead of calculating it, damn see these
2189 * numbers float.. I bet that a pink elephant stepped on my memory.
2190 */
2191 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2148 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2192 curr->lockdep_depth, i, 2149 curr->lockdep_depth, i,
2193 (unsigned long long)chain_key, 2150 (unsigned long long)chain_key,
@@ -2221,11 +2178,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2221 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2178 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2222 return 0; 2179 return 0;
2223 2180
2224 printk("\n"); 2181 printk("\n=================================\n");
2225 printk("=================================\n"); 2182 printk( "[ INFO: inconsistent lock state ]\n");
2226 printk("[ INFO: inconsistent lock state ]\n"); 2183 print_kernel_version();
2227 print_kernel_ident(); 2184 printk( "---------------------------------\n");
2228 printk("---------------------------------\n");
2229 2185
2230 printk("inconsistent {%s} -> {%s} usage.\n", 2186 printk("inconsistent {%s} -> {%s} usage.\n",
2231 usage_str[prev_bit], usage_str[new_bit]); 2187 usage_str[prev_bit], usage_str[new_bit]);
@@ -2286,11 +2242,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2286 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2242 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2287 return 0; 2243 return 0;
2288 2244
2289 printk("\n"); 2245 printk("\n=========================================================\n");
2290 printk("=========================================================\n"); 2246 printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2247 print_kernel_version();
2292 print_kernel_ident(); 2248 printk( "---------------------------------------------------------\n");
2293 printk("---------------------------------------------------------\n");
2294 printk("%s/%d just changed the state of lock:\n", 2249 printk("%s/%d just changed the state of lock:\n",
2295 curr->comm, task_pid_nr(curr)); 2250 curr->comm, task_pid_nr(curr));
2296 print_lock(this); 2251 print_lock(this);
@@ -2571,24 +2526,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2571 return; 2526 return;
2572 } 2527 }
2573 2528
2574 /*
2575 * We're enabling irqs and according to our state above irqs weren't
2576 * already enabled, yet we find the hardware thinks they are in fact
2577 * enabled.. someone messed up their IRQ state tracing.
2578 */
2579 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2529 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2580 return; 2530 return;
2581 2531
2582 /*
2583 * See the fine text that goes along with this variable definition.
2584 */
2585 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2532 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2586 return; 2533 return;
2587 2534
2588 /*
2589 * Can't allow enabling interrupts while in an interrupt handler,
2590 * that's general bad form and such. Recursion, limited stack etc..
2591 */
2592 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2535 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2593 return; 2536 return;
2594 2537
@@ -2616,10 +2559,6 @@ void trace_hardirqs_off_caller(unsigned long ip)
2616 if (unlikely(!debug_locks || current->lockdep_recursion)) 2559 if (unlikely(!debug_locks || current->lockdep_recursion))
2617 return; 2560 return;
2618 2561
2619 /*
2620 * So we're supposed to get called after you mask local IRQs, but for
2621 * some reason the hardware doesn't quite think you did a proper job.
2622 */
2623 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2562 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2624 return; 2563 return;
2625 2564
@@ -2652,10 +2591,6 @@ void trace_softirqs_on(unsigned long ip)
2652 if (unlikely(!debug_locks || current->lockdep_recursion)) 2591 if (unlikely(!debug_locks || current->lockdep_recursion))
2653 return; 2592 return;
2654 2593
2655 /*
2656 * We fancy IRQs being disabled here, see softirq.c, avoids
2657 * funny state and nesting things.
2658 */
2659 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2594 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2660 return; 2595 return;
2661 2596
@@ -2692,9 +2627,6 @@ void trace_softirqs_off(unsigned long ip)
2692 if (unlikely(!debug_locks || current->lockdep_recursion)) 2627 if (unlikely(!debug_locks || current->lockdep_recursion))
2693 return; 2628 return;
2694 2629
2695 /*
2696 * We fancy IRQs being disabled here, see softirq.c
2697 */
2698 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2630 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2699 return; 2631 return;
2700 2632
@@ -2706,9 +2638,6 @@ void trace_softirqs_off(unsigned long ip)
2706 curr->softirq_disable_ip = ip; 2638 curr->softirq_disable_ip = ip;
2707 curr->softirq_disable_event = ++curr->irq_events; 2639 curr->softirq_disable_event = ++curr->irq_events;
2708 debug_atomic_inc(softirqs_off_events); 2640 debug_atomic_inc(softirqs_off_events);
2709 /*
2710 * Whoops, we wanted softirqs off, so why aren't they?
2711 */
2712 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2641 DEBUG_LOCKS_WARN_ON(!softirq_count());
2713 } else 2642 } else
2714 debug_atomic_inc(redundant_softirqs_off); 2643 debug_atomic_inc(redundant_softirqs_off);
@@ -2733,9 +2662,6 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2733 if (!(gfp_mask & __GFP_FS)) 2662 if (!(gfp_mask & __GFP_FS))
2734 return; 2663 return;
2735 2664
2736 /*
2737 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2738 */
2739 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2665 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2740 return; 2666 return;
2741 2667
@@ -2848,13 +2774,13 @@ static int separate_irq_context(struct task_struct *curr,
2848 return 0; 2774 return 0;
2849} 2775}
2850 2776
2851#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ 2777#else
2852 2778
2853static inline 2779static inline
2854int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2780int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2855 enum lock_usage_bit new_bit) 2781 enum lock_usage_bit new_bit)
2856{ 2782{
2857 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ 2783 WARN_ON(1);
2858 return 1; 2784 return 1;
2859} 2785}
2860 2786
@@ -2874,7 +2800,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2874{ 2800{
2875} 2801}
2876 2802
2877#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ 2803#endif
2878 2804
2879/* 2805/*
2880 * Mark a lock with a usage bit, and validate the state transition: 2806 * Mark a lock with a usage bit, and validate the state transition:
@@ -2960,9 +2886,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2960 lock->cpu = raw_smp_processor_id(); 2886 lock->cpu = raw_smp_processor_id();
2961#endif 2887#endif
2962 2888
2963 /*
2964 * Can't be having no nameless bastards around this place!
2965 */
2966 if (DEBUG_LOCKS_WARN_ON(!name)) { 2889 if (DEBUG_LOCKS_WARN_ON(!name)) {
2967 lock->name = "NULL"; 2890 lock->name = "NULL";
2968 return; 2891 return;
@@ -2970,9 +2893,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2970 2893
2971 lock->name = name; 2894 lock->name = name;
2972 2895
2973 /*
2974 * No key, no joy, we need to hash something.
2975 */
2976 if (DEBUG_LOCKS_WARN_ON(!key)) 2896 if (DEBUG_LOCKS_WARN_ON(!key))
2977 return; 2897 return;
2978 /* 2898 /*
@@ -2980,9 +2900,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2980 */ 2900 */
2981 if (!static_obj(key)) { 2901 if (!static_obj(key)) {
2982 printk("BUG: key %p not in .data!\n", key); 2902 printk("BUG: key %p not in .data!\n", key);
2983 /*
2984 * What it says above ^^^^^, I suggest you read it.
2985 */
2986 DEBUG_LOCKS_WARN_ON(1); 2903 DEBUG_LOCKS_WARN_ON(1);
2987 return; 2904 return;
2988 } 2905 }
@@ -2998,42 +2915,6 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2998 2915
2999struct lock_class_key __lockdep_no_validate__; 2916struct lock_class_key __lockdep_no_validate__;
3000 2917
3001static int
3002print_lock_nested_lock_not_held(struct task_struct *curr,
3003 struct held_lock *hlock,
3004 unsigned long ip)
3005{
3006 if (!debug_locks_off())
3007 return 0;
3008 if (debug_locks_silent)
3009 return 0;
3010
3011 printk("\n");
3012 printk("==================================\n");
3013 printk("[ BUG: Nested lock was not taken ]\n");
3014 print_kernel_ident();
3015 printk("----------------------------------\n");
3016
3017 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3018 print_lock(hlock);
3019
3020 printk("\nbut this task is not holding:\n");
3021 printk("%s\n", hlock->nest_lock->name);
3022
3023 printk("\nstack backtrace:\n");
3024 dump_stack();
3025
3026 printk("\nother info that might help us debug this:\n");
3027 lockdep_print_held_locks(curr);
3028
3029 printk("\nstack backtrace:\n");
3030 dump_stack();
3031
3032 return 0;
3033}
3034
3035static int __lock_is_held(struct lockdep_map *lock);
3036
3037/* 2918/*
3038 * This gets called for every mutex_lock*()/spin_lock*() operation. 2919 * This gets called for every mutex_lock*()/spin_lock*() operation.
3039 * We maintain the dependency maps and validate the locking attempt: 2920 * We maintain the dependency maps and validate the locking attempt:
@@ -3057,11 +2938,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3057 if (unlikely(!debug_locks)) 2938 if (unlikely(!debug_locks))
3058 return 0; 2939 return 0;
3059 2940
3060 /*
3061 * Lockdep should run with IRQs disabled, otherwise we could
3062 * get an interrupt which would want to take locks, which would
3063 * end up in lockdep and have you got a head-ache already?
3064 */
3065 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2941 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3066 return 0; 2942 return 0;
3067 2943
@@ -3093,9 +2969,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3093 * dependency checks are done) 2969 * dependency checks are done)
3094 */ 2970 */
3095 depth = curr->lockdep_depth; 2971 depth = curr->lockdep_depth;
3096 /*
3097 * Ran out of static storage for our per-task lock stack again have we?
3098 */
3099 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2972 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
3100 return 0; 2973 return 0;
3101 2974
@@ -3114,10 +2987,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3114 } 2987 }
3115 2988
3116 hlock = curr->held_locks + depth; 2989 hlock = curr->held_locks + depth;
3117 /*
3118 * Plain impossible, we just registered it and checked it weren't no
3119 * NULL like.. I bet this mushroom I ate was good!
3120 */
3121 if (DEBUG_LOCKS_WARN_ON(!class)) 2990 if (DEBUG_LOCKS_WARN_ON(!class))
3122 return 0; 2991 return 0;
3123 hlock->class_idx = class_idx; 2992 hlock->class_idx = class_idx;
@@ -3152,17 +3021,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3152 * the hash, not class->key. 3021 * the hash, not class->key.
3153 */ 3022 */
3154 id = class - lock_classes; 3023 id = class - lock_classes;
3155 /*
3156 * Whoops, we did it again.. ran straight out of our static allocation.
3157 */
3158 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3024 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3159 return 0; 3025 return 0;
3160 3026
3161 chain_key = curr->curr_chain_key; 3027 chain_key = curr->curr_chain_key;
3162 if (!depth) { 3028 if (!depth) {
3163 /*
3164 * How can we have a chain hash when we ain't got no keys?!
3165 */
3166 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3029 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3167 return 0; 3030 return 0;
3168 chain_head = 1; 3031 chain_head = 1;
@@ -3175,9 +3038,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3175 } 3038 }
3176 chain_key = iterate_chain_key(chain_key, id); 3039 chain_key = iterate_chain_key(chain_key, id);
3177 3040
3178 if (nest_lock && !__lock_is_held(nest_lock))
3179 return print_lock_nested_lock_not_held(curr, hlock, ip);
3180
3181 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3041 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3182 return 0; 3042 return 0;
3183 3043
@@ -3211,11 +3071,9 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3211 if (debug_locks_silent) 3071 if (debug_locks_silent)
3212 return 0; 3072 return 0;
3213 3073
3214 printk("\n"); 3074 printk("\n=====================================\n");
3215 printk("=====================================\n"); 3075 printk( "[ BUG: bad unlock balance detected! ]\n");
3216 printk("[ BUG: bad unlock balance detected! ]\n"); 3076 printk( "-------------------------------------\n");
3217 print_kernel_ident();
3218 printk("-------------------------------------\n");
3219 printk("%s/%d is trying to release lock (", 3077 printk("%s/%d is trying to release lock (",
3220 curr->comm, task_pid_nr(curr)); 3078 curr->comm, task_pid_nr(curr));
3221 print_lockdep_cache(lock); 3079 print_lockdep_cache(lock);
@@ -3239,9 +3097,6 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3239{ 3097{
3240 if (unlikely(!debug_locks)) 3098 if (unlikely(!debug_locks))
3241 return 0; 3099 return 0;
3242 /*
3243 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3244 */
3245 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3100 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3246 return 0; 3101 return 0;
3247 3102
@@ -3271,11 +3126,6 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3271 if (!class) 3126 if (!class)
3272 return 0; 3127 return 0;
3273 3128
3274 /*
3275 * References, but not a lock we're actually ref-counting?
3276 * State got messed up, follow the sites that change ->references
3277 * and try to make sense of it.
3278 */
3279 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3129 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3280 return 0; 3130 return 0;
3281 3131
@@ -3298,10 +3148,6 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3298 int i; 3148 int i;
3299 3149
3300 depth = curr->lockdep_depth; 3150 depth = curr->lockdep_depth;
3301 /*
3302 * This function is about (re)setting the class of a held lock,
3303 * yet we're not actually holding any locks. Naughty user!
3304 */
3305 if (DEBUG_LOCKS_WARN_ON(!depth)) 3151 if (DEBUG_LOCKS_WARN_ON(!depth))
3306 return 0; 3152 return 0;
3307 3153
@@ -3337,10 +3183,6 @@ found_it:
3337 return 0; 3183 return 0;
3338 } 3184 }
3339 3185
3340 /*
3341 * I took it apart and put it back together again, except now I have
3342 * these 'spare' parts.. where shall I put them.
3343 */
3344 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3186 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3345 return 0; 3187 return 0;
3346 return 1; 3188 return 1;
@@ -3365,10 +3207,6 @@ lock_release_non_nested(struct task_struct *curr,
3365 * of held locks: 3207 * of held locks:
3366 */ 3208 */
3367 depth = curr->lockdep_depth; 3209 depth = curr->lockdep_depth;
3368 /*
3369 * So we're all set to release this lock.. wait what lock? We don't
3370 * own any locks, you've been drinking again?
3371 */
3372 if (DEBUG_LOCKS_WARN_ON(!depth)) 3210 if (DEBUG_LOCKS_WARN_ON(!depth))
3373 return 0; 3211 return 0;
3374 3212
@@ -3421,10 +3259,6 @@ found_it:
3421 return 0; 3259 return 0;
3422 } 3260 }
3423 3261
3424 /*
3425 * We had N bottles of beer on the wall, we drank one, but now
3426 * there's not N-1 bottles of beer left on the wall...
3427 */
3428 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3262 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3429 return 0; 3263 return 0;
3430 return 1; 3264 return 1;
@@ -3455,9 +3289,6 @@ static int lock_release_nested(struct task_struct *curr,
3455 return lock_release_non_nested(curr, lock, ip); 3289 return lock_release_non_nested(curr, lock, ip);
3456 curr->lockdep_depth--; 3290 curr->lockdep_depth--;
3457 3291
3458 /*
3459 * No more locks, but somehow we've got hash left over, who left it?
3460 */
3461 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3292 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3462 return 0; 3293 return 0;
3463 3294
@@ -3540,13 +3371,10 @@ static void check_flags(unsigned long flags)
3540 * check if not in hardirq contexts: 3371 * check if not in hardirq contexts:
3541 */ 3372 */
3542 if (!hardirq_count()) { 3373 if (!hardirq_count()) {
3543 if (softirq_count()) { 3374 if (softirq_count())
3544 /* like the above, but with softirqs */
3545 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3375 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3546 } else { 3376 else
3547 /* lick the above, does it taste good? */
3548 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3377 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3549 }
3550 } 3378 }
3551 3379
3552 if (!debug_locks) 3380 if (!debug_locks)
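
Context for the check_flags() hunk above: hardirq_count() and softirq_count() are not separate counters but bit-fields of preempt_count(). A rough sketch of the relevant definitions, with mask names taken from include/linux/hardirq.h of this era and treated as assumptions rather than a quote:

#include <linux/hardirq.h>

#define sketch_hardirq_count()	(preempt_count() & HARDIRQ_MASK)
#define sketch_softirq_count()	(preempt_count() & SOFTIRQ_MASK)

/*
 * So "!hardirq_count() && softirq_count()" means we are running in softirq
 * context, where lockdep expects current->softirqs_enabled to be false;
 * the remaining branch covers plain process context with softirqs enabled.
 */
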
@@ -3656,11 +3484,9 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3656 if (debug_locks_silent) 3484 if (debug_locks_silent)
3657 return 0; 3485 return 0;
3658 3486
3659 printk("\n"); 3487 printk("\n=================================\n");
3660 printk("=================================\n"); 3488 printk( "[ BUG: bad contention detected! ]\n");
3661 printk("[ BUG: bad contention detected! ]\n"); 3489 printk( "---------------------------------\n");
3662 print_kernel_ident();
3663 printk("---------------------------------\n");
3664 printk("%s/%d is trying to contend lock (", 3490 printk("%s/%d is trying to contend lock (",
3665 curr->comm, task_pid_nr(curr)); 3491 curr->comm, task_pid_nr(curr));
3666 print_lockdep_cache(lock); 3492 print_lockdep_cache(lock);
@@ -3686,10 +3512,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3686 int i, contention_point, contending_point; 3512 int i, contention_point, contending_point;
3687 3513
3688 depth = curr->lockdep_depth; 3514 depth = curr->lockdep_depth;
3689 /*
3690 * Whee, we contended on this lock, except it seems we're not
3691 * actually trying to acquire anything much at all..
3692 */
3693 if (DEBUG_LOCKS_WARN_ON(!depth)) 3515 if (DEBUG_LOCKS_WARN_ON(!depth))
3694 return; 3516 return;
3695 3517
@@ -3739,10 +3561,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3739 int i, cpu; 3561 int i, cpu;
3740 3562
3741 depth = curr->lockdep_depth; 3563 depth = curr->lockdep_depth;
3742 /*
3743 * Yay, we acquired ownership of this lock we didn't try to
3744 * acquire, how the heck did that happen?
3745 */
3746 if (DEBUG_LOCKS_WARN_ON(!depth)) 3564 if (DEBUG_LOCKS_WARN_ON(!depth))
3747 return; 3565 return;
3748 3566
@@ -3947,12 +3765,8 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3947 match |= class == lock->class_cache[j]; 3765 match |= class == lock->class_cache[j];
3948 3766
3949 if (unlikely(match)) { 3767 if (unlikely(match)) {
3950 if (debug_locks_off_graph_unlock()) { 3768 if (debug_locks_off_graph_unlock())
3951 /*
3952 * We all just reset everything, how did it match?
3953 */
3954 WARN_ON(1); 3769 WARN_ON(1);
3955 }
3956 goto out_restore; 3770 goto out_restore;
3957 } 3771 }
3958 } 3772 }
@@ -4015,8 +3829,7 @@ void __init lockdep_info(void)
4015 3829
4016#ifdef CONFIG_DEBUG_LOCKDEP 3830#ifdef CONFIG_DEBUG_LOCKDEP
4017 if (lockdep_init_error) { 3831 if (lockdep_init_error) {
4018 printk("WARNING: lockdep init error! lock-%s was acquired" 3832 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
4019 "before lockdep_init\n", lock_init_error);
4020 printk("Call stack leading to lockdep invocation was:\n"); 3833 printk("Call stack leading to lockdep invocation was:\n");
4021 print_stack_trace(&lockdep_init_trace, 0); 3834 print_stack_trace(&lockdep_init_trace, 0);
4022 } 3835 }
@@ -4032,11 +3845,9 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4032 if (debug_locks_silent) 3845 if (debug_locks_silent)
4033 return; 3846 return;
4034 3847
4035 printk("\n"); 3848 printk("\n=========================\n");
4036 printk("=========================\n"); 3849 printk( "[ BUG: held lock freed! ]\n");
4037 printk("[ BUG: held lock freed! ]\n"); 3850 printk( "-------------------------\n");
4038 print_kernel_ident();
4039 printk("-------------------------\n");
4040 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3851 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4041 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 3852 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4042 print_lock(hlock); 3853 print_lock(hlock);
@@ -4090,11 +3901,9 @@ static void print_held_locks_bug(struct task_struct *curr)
4090 if (debug_locks_silent) 3901 if (debug_locks_silent)
4091 return; 3902 return;
4092 3903
4093 printk("\n"); 3904 printk("\n=====================================\n");
4094 printk("=====================================\n"); 3905 printk( "[ BUG: lock held at task exit time! ]\n");
4095 printk("[ BUG: lock held at task exit time! ]\n"); 3906 printk( "-------------------------------------\n");
4096 print_kernel_ident();
4097 printk("-------------------------------------\n");
4098 printk("%s/%d is exiting with locks still held!\n", 3907 printk("%s/%d is exiting with locks still held!\n",
4099 curr->comm, task_pid_nr(curr)); 3908 curr->comm, task_pid_nr(curr));
4100 lockdep_print_held_locks(curr); 3909 lockdep_print_held_locks(curr);
@@ -4188,18 +3997,16 @@ void lockdep_sys_exit(void)
4188 if (unlikely(curr->lockdep_depth)) { 3997 if (unlikely(curr->lockdep_depth)) {
4189 if (!debug_locks_off()) 3998 if (!debug_locks_off())
4190 return; 3999 return;
4191 printk("\n"); 4000 printk("\n================================================\n");
4192 printk("================================================\n"); 4001 printk( "[ BUG: lock held when returning to user space! ]\n");
4193 printk("[ BUG: lock held when returning to user space! ]\n"); 4002 printk( "------------------------------------------------\n");
4194 print_kernel_ident();
4195 printk("------------------------------------------------\n");
4196 printk("%s/%d is leaving the kernel with locks still held!\n", 4003 printk("%s/%d is leaving the kernel with locks still held!\n",
4197 curr->comm, curr->pid); 4004 curr->comm, curr->pid);
4198 lockdep_print_held_locks(curr); 4005 lockdep_print_held_locks(curr);
4199 } 4006 }
4200} 4007}
4201 4008
4202void lockdep_rcu_suspicious(const char *file, const int line, const char *s) 4009void lockdep_rcu_dereference(const char *file, const int line)
4203{ 4010{
4204 struct task_struct *curr = current; 4011 struct task_struct *curr = current;
4205 4012
@@ -4208,44 +4015,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4208 return; 4015 return;
4209#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4016#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4210 /* Note: the following can be executed concurrently, so be careful. */ 4017 /* Note: the following can be executed concurrently, so be careful. */
4211 printk("\n"); 4018 printk("\n===================================================\n");
4212 printk("===============================\n"); 4019 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
4213 printk("[ INFO: suspicious RCU usage. ]\n"); 4020 printk( "---------------------------------------------------\n");
4214 print_kernel_ident(); 4021 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
4215 printk("-------------------------------\n"); 4022 file, line);
4216 printk("%s:%d %s!\n", file, line, s);
4217 printk("\nother info that might help us debug this:\n\n"); 4023 printk("\nother info that might help us debug this:\n\n");
4218 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4024 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4219 !rcu_lockdep_current_cpu_online()
4220 ? "RCU used illegally from offline CPU!\n"
4221 : rcu_is_cpu_idle()
4222 ? "RCU used illegally from idle CPU!\n"
4223 : "",
4224 rcu_scheduler_active, debug_locks);
4225
4226 /*
4227 * If a CPU is in the RCU-free window in idle (ie: in the section
4228 * between rcu_idle_enter() and rcu_idle_exit(), then RCU
4229 * considers that CPU to be in an "extended quiescent state",
4230 * which means that RCU will be completely ignoring that CPU.
4231 * Therefore, rcu_read_lock() and friends have absolutely no
4232 * effect on a CPU running in that state. In other words, even if
4233 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4234 * delete data structures out from under it. RCU really has no
4235 * choice here: we need to keep an RCU-free window in idle where
4236 * the CPU may possibly enter into low power mode. This way we can
4237 * notice an extended quiescent state to other CPUs that started a grace
4238 * period. Otherwise we would delay any grace period as long as we run
4239 * in the idle task.
4240 *
4241 * So complain bitterly if someone does call rcu_read_lock(),
4242 * rcu_read_lock_bh() and so on from extended quiescent states.
4243 */
4244 if (rcu_is_cpu_idle())
4245 printk("RCU used illegally from extended quiescent state!\n");
4246
4247 lockdep_print_held_locks(curr); 4025 lockdep_print_held_locks(curr);
4248 printk("\nstack backtrace:\n"); 4026 printk("\nstack backtrace:\n");
4249 dump_stack(); 4027 dump_stack();
4250} 4028}
4251EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4029EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
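
The last hunk above renames lockdep_rcu_suspicious(file, line, s) back to lockdep_rcu_dereference(file, line), dropping the caller-supplied message and the idle/offline-CPU diagnostics. A hedged sketch of how the two entry points are reached from include/linux/rcupdate.h; the macro bodies are reconstructed from memory, so treat the details as assumptions:

/* Newer form: the condition and a human-readable hint travel together. */
#define rcu_lockdep_assert(c, s)					\
	do {								\
		static bool __warned;					\
		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
			__warned = true;				\
			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
		}							\
	} while (0)

/* Older form restored by this patch: only file and line reach the report. */
#define rcu_lockdep_assert_old(c)					\
	do {								\
		static bool __warned;					\
		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
			__warned = true;				\
			lockdep_rcu_dereference(__FILE__, __LINE__);	\
		}							\
	} while (0)
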
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index b2c71c5873e..71edd2f60c0 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
11 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
12 * 12 *
13 */ 13 */
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[KSYM_NAME_LEN]; 42 char str[128];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
deleted file mode 100644
index 246b4c6e613..00000000000
--- a/kernel/modsign_certificate.S
+++ /dev/null
@@ -1,19 +0,0 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
deleted file mode 100644
index 2b6e69909c3..00000000000
--- a/kernel/modsign_pubkey.c
+++ /dev/null
@@ -1,104 +0,0 @@
1/* Public keys for module signature verification
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/cred.h>
15#include <linux/err.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19struct key *modsign_keyring;
20
21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[];
23
24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes.
27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo";
29
30/*
31 * Load the compiled-in keys
32 */
33static __init int module_verify_init(void)
34{
35 pr_notice("Initialise module verification\n");
36
37 modsign_keyring = keyring_alloc(".module_sign",
38 KUIDT_INIT(0), KGIDT_INIT(0),
39 current_cred(),
40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
41 KEY_USR_VIEW | KEY_USR_READ),
42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
43 if (IS_ERR(modsign_keyring))
44 panic("Can't allocate module signing keyring\n");
45
46 return 0;
47}
48
49/*
50 * Must be initialised before we try and load the keys into the keyring.
51 */
52device_initcall(module_verify_init);
53
54/*
55 * Load the compiled-in keys
56 */
57static __init int load_module_signing_keys(void)
58{
59 key_ref_t key;
60 const u8 *p, *end;
61 size_t plen;
62
63 pr_notice("Loading module verification certificates\n");
64
65 end = modsign_certificate_list_end;
66 p = modsign_certificate_list;
67 while (p < end) {
68 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
69 * than 256 bytes in size.
70 */
71 if (end - p < 4)
72 goto dodgy_cert;
73 if (p[0] != 0x30 &&
74 p[1] != 0x82)
75 goto dodgy_cert;
76 plen = (p[2] << 8) | p[3];
77 plen += 4;
78 if (plen > end - p)
79 goto dodgy_cert;
80
81 key = key_create_or_update(make_key_ref(modsign_keyring, 1),
82 "asymmetric",
83 NULL,
84 p,
85 plen,
86 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
87 KEY_USR_VIEW,
88 KEY_ALLOC_NOT_IN_QUOTA);
89 if (IS_ERR(key))
90 pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
91 PTR_ERR(key));
92 else
93 pr_notice("MODSIGN: Loaded cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 p += plen;
96 }
97
98 return 0;
99
100dodgy_cert:
101 pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
102 return 0;
103}
104late_initcall(load_module_signing_keys);
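
The deleted loader above walks a blob of concatenated DER certificates by reading each ASN.1 SEQUENCE header: tag 0x30, long-form length marker 0x82, then a two-byte big-endian length, which is why every certificate must be at least 256 bytes. A standalone sketch of that length calculation, written with || so that both header bytes are actually required (the stricter reading of the check in the deleted code):

#include <stddef.h>
#include <stdint.h>

/*
 * Return the total on-disk size of the DER certificate starting at p,
 * or -1 if the header does not look like "SEQUENCE, 2-byte length" or
 * the certificate would run past the end of the blob.
 */
static ptrdiff_t der_cert_len(const uint8_t *p, const uint8_t *end)
{
	size_t plen;

	if (end - p < 4)
		return -1;
	if (p[0] != 0x30 || p[1] != 0x82)
		return -1;
	plen = (((size_t)p[2] << 8) | p[3]) + 4;	/* payload + 4 header bytes */
	if ((ptrdiff_t)plen > end - p)
		return -1;
	return (ptrdiff_t)plen;
}
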
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 24f9247b7d0..00000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,14 +0,0 @@
1/* Module internals
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12extern struct key *modsign_keyring;
13
14extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
index 250092c1d57..e0ddcece2be 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,12 +16,11 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/export.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
26#include <linux/sysfs.h> 25#include <linux/sysfs.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
@@ -29,7 +28,6 @@
29#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
30#include <linux/elf.h> 29#include <linux/elf.h>
31#include <linux/proc_fs.h> 30#include <linux/proc_fs.h>
32#include <linux/security.h>
33#include <linux/seq_file.h> 31#include <linux/seq_file.h>
34#include <linux/syscalls.h> 32#include <linux/syscalls.h>
35#include <linux/fcntl.h> 33#include <linux/fcntl.h>
@@ -60,13 +58,16 @@
60#include <linux/jump_label.h> 58#include <linux/jump_label.h>
61#include <linux/pfn.h> 59#include <linux/pfn.h>
62#include <linux/bsearch.h> 60#include <linux/bsearch.h>
63#include <linux/fips.h>
64#include <uapi/linux/module.h>
65#include "module-internal.h"
66 61
67#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
68#include <trace/events/module.h> 63#include <trace/events/module.h>
69 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
70#ifndef ARCH_SHF_SMALL 71#ifndef ARCH_SHF_SMALL
71#define ARCH_SHF_SMALL 0 72#define ARCH_SHF_SMALL 0
72#endif 73#endif
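
The hunk above reinstates the old compile-time debug switch in place of pr_debug(). A small sketch of the trade-off, with the pr_debug() behaviour described as commonly configured rather than verified against this exact tree: under the default #if 0 branch every DEBUGP() call vanishes at preprocessing time, whereas pr_debug() can stay in the object code and be toggled at run time when CONFIG_DYNAMIC_DEBUG is enabled.

#if 0
#define DEBUGP printk			/* flip to 1 locally to get the messages */
#else
#define DEBUGP(fmt, a...)		/* default: expands to nothing, zero cost */
#endif

static void example_use(const char *name)
{
	DEBUGP("Failed to find symbol %s\n", name);	/* compiled out by default */
}
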
@@ -107,47 +108,9 @@ static LIST_HEAD(modules);
107struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ 108struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
108#endif /* CONFIG_KGDB_KDB */ 109#endif /* CONFIG_KGDB_KDB */
109 110
110#ifdef CONFIG_MODULE_SIG
111#ifdef CONFIG_MODULE_SIG_FORCE
112static bool sig_enforce = true;
113#else
114static bool sig_enforce = false;
115
116static int param_set_bool_enable_only(const char *val,
117 const struct kernel_param *kp)
118{
119 int err;
120 bool test;
121 struct kernel_param dummy_kp = *kp;
122
123 dummy_kp.arg = &test;
124
125 err = param_set_bool(val, &dummy_kp);
126 if (err)
127 return err;
128
129 /* Don't let them unset it once it's set! */
130 if (!test && sig_enforce)
131 return -EROFS;
132
133 if (test)
134 sig_enforce = true;
135 return 0;
136}
137
138static const struct kernel_param_ops param_ops_bool_enable_only = {
139 .set = param_set_bool_enable_only,
140 .get = param_get_bool,
141};
142#define param_check_bool_enable_only param_check_bool
143
144module_param(sig_enforce, bool_enable_only, 0644);
145#endif /* !CONFIG_MODULE_SIG_FORCE */
146#endif /* CONFIG_MODULE_SIG */
147 111
148/* Block module loading/unloading? */ 112/* Block module loading/unloading? */
149int modules_disabled = 0; 113int modules_disabled = 0;
150core_param(nomodule, modules_disabled, bint, 0);
151 114
152/* Waiting for a module to finish initializing? */ 115/* Waiting for a module to finish initializing? */
153static DECLARE_WAIT_QUEUE_HEAD(module_wq); 116static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -175,10 +138,10 @@ struct load_info {
175 unsigned long len; 138 unsigned long len;
176 Elf_Shdr *sechdrs; 139 Elf_Shdr *sechdrs;
177 char *secstrings, *strtab; 140 char *secstrings, *strtab;
141 unsigned long *strmap;
178 unsigned long symoffs, stroffs; 142 unsigned long symoffs, stroffs;
179 struct _ddebug *debug; 143 struct _ddebug *debug;
180 unsigned int num_debug; 144 unsigned int num_debug;
181 bool sig_ok;
182 struct { 145 struct {
183 unsigned int sym, str, mod, vers, info, pcpu; 146 unsigned int sym, str, mod, vers, info, pcpu;
184 } index; 147 } index;
@@ -375,6 +338,9 @@ static bool check_symbol(const struct symsearch *syms,
375 printk(KERN_WARNING "Symbol %s is being used " 338 printk(KERN_WARNING "Symbol %s is being used "
376 "by a non-GPL module, which will not " 339 "by a non-GPL module, which will not "
377 "be allowed in the future\n", fsa->name); 340 "be allowed in the future\n", fsa->name);
341 printk(KERN_WARNING "Please see the file "
342 "Documentation/feature-removal-schedule.txt "
343 "in the kernel source tree for more details.\n");
378 } 344 }
379 } 345 }
380 346
@@ -444,7 +410,7 @@ const struct kernel_symbol *find_symbol(const char *name,
444 return fsa.sym; 410 return fsa.sym;
445 } 411 }
446 412
447 pr_debug("Failed to find symbol %s\n", name); 413 DEBUGP("Failed to find symbol %s\n", name);
448 return NULL; 414 return NULL;
449} 415}
450EXPORT_SYMBOL_GPL(find_symbol); 416EXPORT_SYMBOL_GPL(find_symbol);
@@ -634,11 +600,11 @@ static int already_uses(struct module *a, struct module *b)
634 600
635 list_for_each_entry(use, &b->source_list, source_list) { 601 list_for_each_entry(use, &b->source_list, source_list) {
636 if (use->source == a) { 602 if (use->source == a) {
637 pr_debug("%s uses %s!\n", a->name, b->name); 603 DEBUGP("%s uses %s!\n", a->name, b->name);
638 return 1; 604 return 1;
639 } 605 }
640 } 606 }
641 pr_debug("%s does not use %s!\n", a->name, b->name); 607 DEBUGP("%s does not use %s!\n", a->name, b->name);
642 return 0; 608 return 0;
643} 609}
644 610
@@ -653,7 +619,7 @@ static int add_module_usage(struct module *a, struct module *b)
653{ 619{
654 struct module_use *use; 620 struct module_use *use;
655 621
656 pr_debug("Allocating new usage for %s.\n", a->name); 622 DEBUGP("Allocating new usage for %s.\n", a->name);
657 use = kmalloc(sizeof(*use), GFP_ATOMIC); 623 use = kmalloc(sizeof(*use), GFP_ATOMIC);
658 if (!use) { 624 if (!use) {
659 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 625 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -697,7 +663,7 @@ static void module_unload_free(struct module *mod)
697 mutex_lock(&module_mutex); 663 mutex_lock(&module_mutex);
698 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
699 struct module *i = use->target; 665 struct module *i = use->target;
700 pr_debug("%s unusing %s\n", mod->name, i->name); 666 DEBUGP("%s unusing %s\n", mod->name, i->name);
701 module_put(i); 667 module_put(i);
702 list_del(&use->source_list); 668 list_del(&use->source_list);
703 list_del(&use->target_list); 669 list_del(&use->target_list);
@@ -760,9 +726,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
760 } 726 }
761} 727}
762 728
763unsigned long module_refcount(struct module *mod) 729unsigned int module_refcount(struct module *mod)
764{ 730{
765 unsigned long incs = 0, decs = 0; 731 unsigned int incs = 0, decs = 0;
766 int cpu; 732 int cpu;
767 733
768 for_each_possible_cpu(cpu) 734 for_each_possible_cpu(cpu)
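
The hunk above only changes the return type; the body, which the context lines cut off, just folds per-CPU counters. A sketch of the whole calculation, assuming the struct module_ref { incs, decs } layout this tree uses (the field names are an assumption, not quoted from the diff):

#include <linux/module.h>
#include <linux/percpu.h>

static unsigned int module_refcount_sketch(struct module *mod)
{
	unsigned int incs = 0, decs = 0;
	int cpu;

	/* Each CPU only ever increments its own pair, so no locking is
	 * needed here; the live reference count is the difference of the
	 * two sums. */
	for_each_possible_cpu(cpu) {
		incs += per_cpu_ptr(mod->refptr, cpu)->incs;
		decs += per_cpu_ptr(mod->refptr, cpu)->decs;
	}
	return incs - decs;
}

The real function also orders the two passes (decrements summed before increments, separated by a read barrier) so that a concurrent put/get pair cannot make the count appear negative; the sketch omits that detail.
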
@@ -795,7 +761,7 @@ static void wait_for_zero_refcount(struct module *mod)
795 /* Since we might sleep for some time, release the mutex first */ 761 /* Since we might sleep for some time, release the mutex first */
796 mutex_unlock(&module_mutex); 762 mutex_unlock(&module_mutex);
797 for (;;) { 763 for (;;) {
798 pr_debug("Looking at refcount...\n"); 764 DEBUGP("Looking at refcount...\n");
799 set_current_state(TASK_UNINTERRUPTIBLE); 765 set_current_state(TASK_UNINTERRUPTIBLE);
800 if (module_refcount(mod) == 0) 766 if (module_refcount(mod) == 0)
801 break; 767 break;
@@ -838,7 +804,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
838 if (mod->state != MODULE_STATE_LIVE) { 804 if (mod->state != MODULE_STATE_LIVE) {
839 /* FIXME: if (force), slam module count and wake up 805 /* FIXME: if (force), slam module count and wake up
840 waiter --RR */ 806 waiter --RR */
841 pr_debug("%s already dying\n", mod->name); 807 DEBUGP("%s already dying\n", mod->name);
842 ret = -EBUSY; 808 ret = -EBUSY;
843 goto out; 809 goto out;
844 } 810 }
@@ -888,7 +854,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
888 struct module_use *use; 854 struct module_use *use;
889 int printed_something = 0; 855 int printed_something = 0;
890 856
891 seq_printf(m, " %lu ", module_refcount(mod)); 857 seq_printf(m, " %u ", module_refcount(mod));
892 858
893 /* Always include a trailing , so userspace can differentiate 859 /* Always include a trailing , so userspace can differentiate
894 between this and the old multi-field proc format. */ 860 between this and the old multi-field proc format. */
@@ -938,41 +904,13 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
938static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
939 struct module_kobject *mk, char *buffer) 905 struct module_kobject *mk, char *buffer)
940{ 906{
941 return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
942}
943
944static struct module_attribute modinfo_refcnt =
945 __ATTR(refcnt, 0444, show_refcnt, NULL);
946
947void __module_get(struct module *module)
948{
949 if (module) {
950 preempt_disable();
951 __this_cpu_inc(module->refptr->incs);
952 trace_module_get(module, _RET_IP_);
953 preempt_enable();
954 }
955} 908}
956EXPORT_SYMBOL(__module_get);
957
958bool try_module_get(struct module *module)
959{
960 bool ret = true;
961
962 if (module) {
963 preempt_disable();
964
965 if (likely(module_is_live(module))) {
966 __this_cpu_inc(module->refptr->incs);
967 trace_module_get(module, _RET_IP_);
968 } else
969 ret = false;
970 909
971 preempt_enable(); 910static struct module_attribute refcnt = {
972 } 911 .attr = { .name = "refcnt", .mode = 0444 },
973 return ret; 912 .show = show_refcnt,
974} 913};
975EXPORT_SYMBOL(try_module_get);
976 914
977void module_put(struct module *module) 915void module_put(struct module *module)
978{ 916{
@@ -1013,26 +951,6 @@ static inline int module_unload_init(struct module *mod)
1013} 951}
1014#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
1015 953
1016static size_t module_flags_taint(struct module *mod, char *buf)
1017{
1018 size_t l = 0;
1019
1020 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
1021 buf[l++] = 'P';
1022 if (mod->taints & (1 << TAINT_OOT_MODULE))
1023 buf[l++] = 'O';
1024 if (mod->taints & (1 << TAINT_FORCED_MODULE))
1025 buf[l++] = 'F';
1026 if (mod->taints & (1 << TAINT_CRAP))
1027 buf[l++] = 'C';
1028 /*
1029 * TAINT_FORCED_RMMOD: could be added.
1030 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
1031 * apply to modules.
1032 */
1033 return l;
1034}
1035
1036static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
1037 struct module_kobject *mk, char *buffer) 955 struct module_kobject *mk, char *buffer)
1038{ 956{
@@ -1052,8 +970,10 @@ static ssize_t show_initstate(struct module_attribute *mattr,
1052 return sprintf(buffer, "%s\n", state); 970 return sprintf(buffer, "%s\n", state);
1053} 971}
1054 972
1055static struct module_attribute modinfo_initstate = 973static struct module_attribute initstate = {
1056 __ATTR(initstate, 0444, show_initstate, NULL); 974 .attr = { .name = "initstate", .mode = 0444 },
975 .show = show_initstate,
976};
1057 977
1058static ssize_t store_uevent(struct module_attribute *mattr, 978static ssize_t store_uevent(struct module_attribute *mattr,
1059 struct module_kobject *mk, 979 struct module_kobject *mk,
@@ -1066,50 +986,18 @@ static ssize_t store_uevent(struct module_attribute *mattr,
1066 return count; 986 return count;
1067} 987}
1068 988
1069struct module_attribute module_uevent = 989struct module_attribute module_uevent = {
1070 __ATTR(uevent, 0200, NULL, store_uevent); 990 .attr = { .name = "uevent", .mode = 0200 },
1071 991 .store = store_uevent,
1072static ssize_t show_coresize(struct module_attribute *mattr, 992};
1073 struct module_kobject *mk, char *buffer)
1074{
1075 return sprintf(buffer, "%u\n", mk->mod->core_size);
1076}
1077
1078static struct module_attribute modinfo_coresize =
1079 __ATTR(coresize, 0444, show_coresize, NULL);
1080
1081static ssize_t show_initsize(struct module_attribute *mattr,
1082 struct module_kobject *mk, char *buffer)
1083{
1084 return sprintf(buffer, "%u\n", mk->mod->init_size);
1085}
1086
1087static struct module_attribute modinfo_initsize =
1088 __ATTR(initsize, 0444, show_initsize, NULL);
1089
1090static ssize_t show_taint(struct module_attribute *mattr,
1091 struct module_kobject *mk, char *buffer)
1092{
1093 size_t l;
1094
1095 l = module_flags_taint(mk->mod, buffer);
1096 buffer[l++] = '\n';
1097 return l;
1098}
1099
1100static struct module_attribute modinfo_taint =
1101 __ATTR(taint, 0444, show_taint, NULL);
1102 993
1103static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
1104 &module_uevent,
1105 &modinfo_version, 995 &modinfo_version,
1106 &modinfo_srcversion, 996 &modinfo_srcversion,
1107 &modinfo_initstate, 997 &initstate,
1108 &modinfo_coresize, 998 &module_uevent,
1109 &modinfo_initsize,
1110 &modinfo_taint,
1111#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
1112 &modinfo_refcnt, 1000 &refcnt,
1113#endif 1001#endif
1114 NULL, 1002 NULL,
1115}; 1003};
@@ -1169,7 +1057,7 @@ static int check_version(Elf_Shdr *sechdrs,
1169 1057
1170 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1171 return 1; 1059 return 1;
1172 pr_debug("Found checksum %lX vs module %lX\n", 1060 DEBUGP("Found checksum %lX vs module %lX\n",
1173 maybe_relocated(*crc, crc_owner), versions[i].crc); 1061 maybe_relocated(*crc, crc_owner), versions[i].crc);
1174 goto bad_version; 1062 goto bad_version;
1175 } 1063 }
@@ -1946,7 +1834,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1946 case SHN_COMMON: 1834 case SHN_COMMON:
1947 /* We compiled with -fno-common. These are not 1835 /* We compiled with -fno-common. These are not
1948 supposed to happen. */ 1836 supposed to happen. */
1949 pr_debug("Common symbol: %s\n", name); 1837 DEBUGP("Common symbol: %s\n", name);
1950 printk("%s: please compile with -fno-common\n", 1838 printk("%s: please compile with -fno-common\n",
1951 mod->name); 1839 mod->name);
1952 ret = -ENOEXEC; 1840 ret = -ENOEXEC;
@@ -1954,7 +1842,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1954 1842
1955 case SHN_ABS: 1843 case SHN_ABS:
1956 /* Don't need to do anything */ 1844 /* Don't need to do anything */
1957 pr_debug("Absolute symbol: 0x%08lx\n", 1845 DEBUGP("Absolute symbol: 0x%08lx\n",
1958 (long)sym[i].st_value); 1846 (long)sym[i].st_value);
1959 break; 1847 break;
1960 1848
@@ -1989,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1989 return ret; 1877 return ret;
1990} 1878}
1991 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1992static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1993{ 1901{
1994 unsigned int i; 1902 unsigned int i;
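
The added apply_relocate()/apply_relocate_add() stubs rely on weak linkage: the generic -ENOEXEC version is linked in only when the architecture does not provide its own strong symbol. A tiny illustrative sketch of the pattern, using a hypothetical hook name (arch_frob() is made up for the example) and two separate source files:

#include <linux/errno.h>
#include <linux/compiler.h>

/* kernel/generic.c (hypothetical): fallback, used only when no
 * architecture defines the symbol itself. */
int __weak arch_frob(void)
{
	return -ENOSYS;
}

/* arch/foo/kernel/frob.c (hypothetical): a strong definition of the same
 * symbol; at link time it overrides the weak one with no Kconfig glue. */
int arch_frob(void)
{
	return 0;
}
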
@@ -2058,7 +1966,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2058 for (i = 0; i < info->hdr->e_shnum; i++) 1966 for (i = 0; i < info->hdr->e_shnum; i++)
2059 info->sechdrs[i].sh_entsize = ~0UL; 1967 info->sechdrs[i].sh_entsize = ~0UL;
2060 1968
2061 pr_debug("Core section allocation order:\n"); 1969 DEBUGP("Core section allocation order:\n");
2062 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2063 for (i = 0; i < info->hdr->e_shnum; ++i) { 1971 for (i = 0; i < info->hdr->e_shnum; ++i) {
2064 Elf_Shdr *s = &info->sechdrs[i]; 1972 Elf_Shdr *s = &info->sechdrs[i];
@@ -2070,7 +1978,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2070 || strstarts(sname, ".init")) 1978 || strstarts(sname, ".init"))
2071 continue; 1979 continue;
2072 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
2073 pr_debug("\t%s\n", sname); 1981 DEBUGP("\t%s\n", name);
2074 } 1982 }
2075 switch (m) { 1983 switch (m) {
2076 case 0: /* executable */ 1984 case 0: /* executable */
@@ -2087,7 +1995,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2087 } 1995 }
2088 } 1996 }
2089 1997
2090 pr_debug("Init section allocation order:\n"); 1998 DEBUGP("Init section allocation order:\n");
2091 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2092 for (i = 0; i < info->hdr->e_shnum; ++i) { 2000 for (i = 0; i < info->hdr->e_shnum; ++i) {
2093 Elf_Shdr *s = &info->sechdrs[i]; 2001 Elf_Shdr *s = &info->sechdrs[i];
@@ -2100,7 +2008,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2100 continue; 2008 continue;
2101 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2102 | INIT_OFFSET_MASK); 2010 | INIT_OFFSET_MASK);
2103 pr_debug("\t%s\n", sname); 2011 DEBUGP("\t%s\n", sname);
2104 } 2012 }
2105 switch (m) { 2013 switch (m) {
2106 case 0: /* executable */ 2014 case 0: /* executable */
@@ -2270,48 +2178,45 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2270 return true; 2178 return true;
2271} 2179}
2272 2180
2273/*
2274 * We only allocate and copy the strings needed by the parts of symtab
2275 * we keep. This is simple, but has the effect of making multiple
2276 * copies of duplicates. We could be more sophisticated, see
2277 * linux-kernel thread starting with
2278 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2279 */
2280static void layout_symtab(struct module *mod, struct load_info *info) 2181static void layout_symtab(struct module *mod, struct load_info *info)
2281{ 2182{
2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2283 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2184 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2284 const Elf_Sym *src; 2185 const Elf_Sym *src;
2285 unsigned int i, nsrc, ndst, strtab_size = 0; 2186 unsigned int i, nsrc, ndst;
2286 2187
2287 /* Put symbol section at end of init part of module. */ 2188 /* Put symbol section at end of init part of module. */
2288 symsect->sh_flags |= SHF_ALLOC; 2189 symsect->sh_flags |= SHF_ALLOC;
2289 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2290 info->index.sym) | INIT_OFFSET_MASK; 2191 info->index.sym) | INIT_OFFSET_MASK;
2291 pr_debug("\t%s\n", info->secstrings + symsect->sh_name); 2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
2292 2193
2293 src = (void *)info->hdr + symsect->sh_offset; 2194 src = (void *)info->hdr + symsect->sh_offset;
2294 nsrc = symsect->sh_size / sizeof(*src); 2195 nsrc = symsect->sh_size / sizeof(*src);
2295 2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2296 /* Compute total space required for the core symbols' strtab. */ 2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2297 for (ndst = i = 0; i < nsrc; i++) { 2198 unsigned int j = src->st_name;
2298 if (i == 0 || 2199
2299 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { 2200 while (!__test_and_set_bit(j, info->strmap)
2300 strtab_size += strlen(&info->strtab[src[i].st_name])+1; 2201 && info->strtab[j])
2301 ndst++; 2202 ++j;
2203 ++ndst;
2302 } 2204 }
2303 }
2304 2205
2305 /* Append room for core symbols at end of core part. */ 2206 /* Append room for core symbols at end of core part. */
2306 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2307 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2308 mod->core_size += strtab_size;
2309 2209
2310 /* Put string table section at end of init part of module. */ 2210 /* Put string table section at end of init part of module. */
2311 strsect->sh_flags |= SHF_ALLOC; 2211 strsect->sh_flags |= SHF_ALLOC;
2312 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2313 info->index.str) | INIT_OFFSET_MASK; 2213 info->index.str) | INIT_OFFSET_MASK;
2314 pr_debug("\t%s\n", info->secstrings + strsect->sh_name); 2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2315} 2220}
2316 2221
2317static void add_kallsyms(struct module *mod, const struct load_info *info) 2222static void add_kallsyms(struct module *mod, const struct load_info *info)
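
Both this hunk and the add_kallsyms() hunk below pivot on info->strmap, which the left-hand code drops in favour of an explicit strtab_size. The trick in the right-hand (restored) code is worth spelling out, as a sketch under the assumption that strmap carries one bit per byte of the original string table:

#include <linux/bitmap.h>

/*
 * layout_symtab() sets a bit for every byte (including the trailing NUL)
 * of each kept symbol's name.  The compacted table is then exactly
 * bitmap_weight(strmap, strtab_len) bytes long, and a name that started
 * at old_off starts in the compacted table at the number of set bits
 * strictly below old_off, which is what add_kallsyms() stores in st_name.
 */
static unsigned long compact_offset(const unsigned long *strmap,
				    unsigned long old_off)
{
	return bitmap_weight(strmap, old_off);
}
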
@@ -2332,18 +2237,22 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2332 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2333 2238
2334 mod->core_symtab = dst = mod->module_core + info->symoffs; 2239 mod->core_symtab = dst = mod->module_core + info->symoffs;
2335 mod->core_strtab = s = mod->module_core + info->stroffs;
2336 src = mod->symtab; 2240 src = mod->symtab;
2337 for (ndst = i = 0; i < mod->num_symtab; i++) { 2241 *dst = *src;
2338 if (i == 0 || 2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2339 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { 2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2340 dst[ndst] = src[i]; 2244 continue;
2341 dst[ndst++].st_name = s - mod->core_strtab; 2245 dst[ndst] = *src;
2342 s += strlcpy(s, &mod->strtab[src[i].st_name], 2246 dst[ndst].st_name = bitmap_weight(info->strmap,
2343 KSYM_NAME_LEN) + 1; 2247 dst[ndst].st_name);
2344 } 2248 ++ndst;
2345 } 2249 }
2346 mod->core_num_syms = ndst; 2250 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2347} 2256}
2348#else 2257#else
2349static inline void layout_symtab(struct module *mod, struct load_info *info) 2258static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2374,7 +2283,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2374 2283
2375void * __weak module_alloc(unsigned long size) 2284void * __weak module_alloc(unsigned long size)
2376{ 2285{
2377 return vmalloc_exec(size); 2286 return size == 0 ? NULL : vmalloc_exec(size);
2378} 2287}
2379 2288
2380static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
@@ -2420,136 +2329,48 @@ static inline void kmemleak_load_module(const struct module *mod,
2420} 2329}
2421#endif 2330#endif
2422 2331
2423#ifdef CONFIG_MODULE_SIG
2424static int module_sig_check(struct load_info *info)
2425{
2426 int err = -ENOKEY;
2427 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 const void *mod = info->hdr;
2429
2430 if (info->len > markerlen &&
2431 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */
2433 info->len -= markerlen;
2434 err = mod_verify_sig(mod, &info->len);
2435 }
2436
2437 if (!err) {
2438 info->sig_ok = true;
2439 return 0;
2440 }
2441
2442 /* Not having a signature is only an error if we're strict. */
2443 if (err < 0 && fips_enabled)
2444 panic("Module verification failed with error %d in FIPS mode\n",
2445 err);
2446 if (err == -ENOKEY && !sig_enforce)
2447 err = 0;
2448
2449 return err;
2450}
2451#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info)
2453{
2454 return 0;
2455}
2456#endif /* !CONFIG_MODULE_SIG */
2457
2458/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2459static int elf_header_check(struct load_info *info)
2460{
2461 if (info->len < sizeof(*(info->hdr)))
2462 return -ENOEXEC;
2463
2464 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2465 || info->hdr->e_type != ET_REL
2466 || !elf_check_arch(info->hdr)
2467 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2468 return -ENOEXEC;
2469
2470 if (info->hdr->e_shoff >= info->len
2471 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2472 info->len - info->hdr->e_shoff))
2473 return -ENOEXEC;
2474
2475 return 0;
2476}
2477
2478/* Sets info->hdr and info->len. */ 2332/* Sets info->hdr and info->len. */
2479static int copy_module_from_user(const void __user *umod, unsigned long len, 2333static int copy_and_check(struct load_info *info,
2480 struct load_info *info) 2334 const void __user *umod, unsigned long len,
2335 const char __user *uargs)
2481{ 2336{
2482 int err; 2337 int err;
2338 Elf_Ehdr *hdr;
2483 2339
2484 info->len = len; 2340 if (len < sizeof(*hdr))
2485 if (info->len < sizeof(*(info->hdr)))
2486 return -ENOEXEC; 2341 return -ENOEXEC;
2487 2342
2488 err = security_kernel_module_from_file(NULL);
2489 if (err)
2490 return err;
2491
2492 /* Suck in entire file: we'll want most of it. */ 2343 /* Suck in entire file: we'll want most of it. */
2493 info->hdr = vmalloc(info->len); 2344 /* vmalloc barfs on "unusual" numbers. Check here */
2494 if (!info->hdr) 2345 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2495 return -ENOMEM; 2346 return -ENOMEM;
2496 2347
2497 if (copy_from_user(info->hdr, umod, info->len) != 0) { 2348 if (copy_from_user(hdr, umod, len) != 0) {
2498 vfree(info->hdr); 2349 err = -EFAULT;
2499 return -EFAULT; 2350 goto free_hdr;
2500 } 2351 }
2501 2352
2502 return 0; 2353 /* Sanity checks against insmoding binaries or wrong arch,
2503} 2354 weird elf version */
2504 2355 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
2505/* Sets info->hdr and info->len. */ 2356 || hdr->e_type != ET_REL
2506static int copy_module_from_fd(int fd, struct load_info *info) 2357 || !elf_check_arch(hdr)
2507{ 2358 || hdr->e_shentsize != sizeof(Elf_Shdr)) {
2508 struct file *file; 2359 err = -ENOEXEC;
2509 int err; 2360 goto free_hdr;
2510 struct kstat stat;
2511 loff_t pos;
2512 ssize_t bytes = 0;
2513
2514 file = fget(fd);
2515 if (!file)
2516 return -ENOEXEC;
2517
2518 err = security_kernel_module_from_file(file);
2519 if (err)
2520 goto out;
2521
2522 err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
2523 if (err)
2524 goto out;
2525
2526 if (stat.size > INT_MAX) {
2527 err = -EFBIG;
2528 goto out;
2529 }
2530 info->hdr = vmalloc(stat.size);
2531 if (!info->hdr) {
2532 err = -ENOMEM;
2533 goto out;
2534 } 2361 }
2535 2362
2536 pos = 0; 2363 if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
2537 while (pos < stat.size) { 2364 err = -ENOEXEC;
2538 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, 2365 goto free_hdr;
2539 stat.size - pos);
2540 if (bytes < 0) {
2541 vfree(info->hdr);
2542 err = bytes;
2543 goto out;
2544 }
2545 if (bytes == 0)
2546 break;
2547 pos += bytes;
2548 } 2366 }
2549 info->len = pos;
2550 2367
2551out: 2368 info->hdr = hdr;
2552 fput(file); 2369 info->len = len;
2370 return 0;
2371
2372free_hdr:
2373 vfree(hdr);
2553 return err; 2374 return err;
2554} 2375}
2555 2376
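
Both sides of this hunk end up doing the same basic vetting of the user-supplied image before any section header is dereferenced; the left-hand code keeps it in a separate elf_header_check(), the right-hand code inlines it into copy_and_check(). A consolidated sketch of those checks, assuming kernel context, as a restatement of what the diff shows rather than new policy:

#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/string.h>

static int sketch_elf_header_check(const Elf_Ehdr *hdr, unsigned long len)
{
	if (len < sizeof(*hdr))
		return -ENOEXEC;			/* too short to even hold the header */

	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0	/* "\177ELF" magic */
	    || hdr->e_type != ET_REL			/* modules are relocatable objects */
	    || !elf_check_arch(hdr)			/* built for this architecture */
	    || hdr->e_shentsize != sizeof(Elf_Shdr))	/* section headers of the expected size */
		return -ENOEXEC;

	if (hdr->e_shoff >= len
	    || hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff)
		return -ENOEXEC;			/* section header table must fit in the image */

	return 0;
}
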
@@ -2558,7 +2379,7 @@ static void free_copy(struct load_info *info)
2558 vfree(info->hdr); 2379 vfree(info->hdr);
2559} 2380}
2560 2381
2561static int rewrite_section_headers(struct load_info *info, int flags) 2382static int rewrite_section_headers(struct load_info *info)
2562{ 2383{
2563 unsigned int i; 2384 unsigned int i;
2564 2385
@@ -2586,10 +2407,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2586 } 2407 }
2587 2408
2588 /* Track but don't keep modinfo and version sections. */ 2409 /* Track but don't keep modinfo and version sections. */
2589 if (flags & MODULE_INIT_IGNORE_MODVERSIONS) 2410 info->index.vers = find_sec(info, "__versions");
2590 info->index.vers = 0; /* Pretend no __versions section! */
2591 else
2592 info->index.vers = find_sec(info, "__versions");
2593 info->index.info = find_sec(info, ".modinfo"); 2411 info->index.info = find_sec(info, ".modinfo");
2594 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2412 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2595 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2413 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2604,7 +2422,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
2604 * Return the temporary module pointer (we'll replace it with the final 2422 * Return the temporary module pointer (we'll replace it with the final
2605 * one when we move the module sections around). 2423 * one when we move the module sections around).
2606 */ 2424 */
2607static struct module *setup_load_info(struct load_info *info, int flags) 2425static struct module *setup_load_info(struct load_info *info)
2608{ 2426{
2609 unsigned int i; 2427 unsigned int i;
2610 int err; 2428 int err;
@@ -2615,7 +2433,7 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2615 info->secstrings = (void *)info->hdr 2433 info->secstrings = (void *)info->hdr
2616 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2434 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2617 2435
2618 err = rewrite_section_headers(info, flags); 2436 err = rewrite_section_headers(info);
2619 if (err) 2437 if (err)
2620 return ERR_PTR(err); 2438 return ERR_PTR(err);
2621 2439
@@ -2653,14 +2471,11 @@ static struct module *setup_load_info(struct load_info *info, int flags)
2653 return mod; 2471 return mod;
2654} 2472}
2655 2473
2656static int check_modinfo(struct module *mod, struct load_info *info, int flags) 2474static int check_modinfo(struct module *mod, struct load_info *info)
2657{ 2475{
2658 const char *modmagic = get_modinfo(info, "vermagic"); 2476 const char *modmagic = get_modinfo(info, "vermagic");
2659 int err; 2477 int err;
2660 2478
2661 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2662 modmagic = NULL;
2663
2664 /* This is allowed: modprobe --force will invalidate it. */ 2479 /* This is allowed: modprobe --force will invalidate it. */
2665 if (!modmagic) { 2480 if (!modmagic) {
2666 err = try_to_force_load(mod, "bad vermagic"); 2481 err = try_to_force_load(mod, "bad vermagic");
@@ -2672,9 +2487,6 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2672 return -ENOEXEC; 2487 return -ENOEXEC;
2673 } 2488 }
2674 2489
2675 if (!get_modinfo(info, "intree"))
2676 add_taint_module(mod, TAINT_OOT_MODULE);
2677
2678 if (get_modinfo(info, "staging")) { 2490 if (get_modinfo(info, "staging")) {
2679 add_taint_module(mod, TAINT_CRAP); 2491 add_taint_module(mod, TAINT_CRAP);
2680 printk(KERN_WARNING "%s: module is from the staging directory," 2492 printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -2716,7 +2528,7 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2716 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); 2528 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2717#endif 2529#endif
2718#ifdef CONFIG_CONSTRUCTORS 2530#ifdef CONFIG_CONSTRUCTORS
2719 mod->ctors = section_objs(info, ".ctors", 2531 mod->ctors = section_objs(info, CONFIG_GCOV_CTORS,
2720 sizeof(*mod->ctors), &mod->num_ctors); 2532 sizeof(*mod->ctors), &mod->num_ctors);
2721#endif 2533#endif
2722 2534
@@ -2790,26 +2602,23 @@ static int move_module(struct module *mod, struct load_info *info)
2790 memset(ptr, 0, mod->core_size); 2602 memset(ptr, 0, mod->core_size);
2791 mod->module_core = ptr; 2603 mod->module_core = ptr;
2792 2604
2793 if (mod->init_size) { 2605 ptr = module_alloc_update_bounds(mod->init_size);
2794 ptr = module_alloc_update_bounds(mod->init_size); 2606 /*
2795 /* 2607 * The pointer to this block is stored in the module structure
2796 * The pointer to this block is stored in the module structure 2608 * which is inside the block. This block doesn't need to be
2797 * which is inside the block. This block doesn't need to be 2609 * scanned as it contains data and code that will be freed
2798 * scanned as it contains data and code that will be freed 2610 * after the module is initialized.
2799 * after the module is initialized. 2611 */
2800 */ 2612 kmemleak_ignore(ptr);
2801 kmemleak_ignore(ptr); 2613 if (!ptr && mod->init_size) {
2802 if (!ptr) { 2614 module_free(mod, mod->module_core);
2803 module_free(mod, mod->module_core); 2615 return -ENOMEM;
2804 return -ENOMEM; 2616 }
2805 } 2617 memset(ptr, 0, mod->init_size);
2806 memset(ptr, 0, mod->init_size); 2618 mod->module_init = ptr;
2807 mod->module_init = ptr;
2808 } else
2809 mod->module_init = NULL;
2810 2619
2811 /* Transfer each section which specifies SHF_ALLOC */ 2620 /* Transfer each section which specifies SHF_ALLOC */
2812 pr_debug("final section addresses:\n"); 2621 DEBUGP("final section addresses:\n");
2813 for (i = 0; i < info->hdr->e_shnum; i++) { 2622 for (i = 0; i < info->hdr->e_shnum; i++) {
2814 void *dest; 2623 void *dest;
2815 Elf_Shdr *shdr = &info->sechdrs[i]; 2624 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2827,8 +2636,8 @@ static int move_module(struct module *mod, struct load_info *info)
2827 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2636 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2828 /* Update sh_addr to point to copy in image. */ 2637 /* Update sh_addr to point to copy in image. */
2829 shdr->sh_addr = (unsigned long)dest; 2638 shdr->sh_addr = (unsigned long)dest;
2830 pr_debug("\t0x%lx %s\n", 2639 DEBUGP("\t0x%lx %s\n",
2831 (long)shdr->sh_addr, info->secstrings + shdr->sh_name); 2640 shdr->sh_addr, info->secstrings + shdr->sh_name);
2832 } 2641 }
2833 2642
2834 return 0; 2643 return 0;
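The two move_module() hunks above follow the usual two-pass pattern: layout_sections() first assigns each SHF_ALLOC section an offset inside one core (or init) region, the region is allocated once, and every allocatable section is then copied to its slot with sh_addr rewritten to point at the copy. A minimal userspace sketch of that pattern follows; the sections, sizes, data and 8-byte alignment are invented for the demo.

/* Userspace sketch of "lay out, allocate once, copy each allocatable
 * section" as done by layout_sections()/move_module() above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct section {
    const char *name;
    const unsigned char *data;
    size_t size;
    int alloc;      /* stand-in for SHF_ALLOC */
    size_t offset;  /* assigned in pass 1 */
};

int main(void)
{
    struct section secs[] = {
        { ".text",   (const unsigned char *)"\x90\x90\xc3", 3, 1, 0 },
        { ".rodata", (const unsigned char *)"hi",           2, 1, 0 },
        { ".debug",  (const unsigned char *)"dbg",          3, 0, 0 },
    };
    size_t i, core_size = 0;
    unsigned char *core;

    /* Pass 1: assign offsets inside one core region (8-byte aligned). */
    for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
        if (!secs[i].alloc)
            continue;
        core_size = (core_size + 7) & ~(size_t)7;
        secs[i].offset = core_size;
        core_size += secs[i].size;
    }

    /* Pass 2: allocate the region and copy each section to its slot. */
    core = calloc(1, core_size);
    if (!core)
        return 1;
    for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
        if (!secs[i].alloc)
            continue;
        memcpy(core + secs[i].offset, secs[i].data, secs[i].size);
        printf("%-8s -> core+%zu (%zu bytes)\n",
               secs[i].name, secs[i].offset, secs[i].size);
    }
    free(core);
    return 0;
}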
@@ -2848,10 +2657,6 @@ static int check_module_license_and_versions(struct module *mod)
2848 if (strcmp(mod->name, "driverloader") == 0) 2657 if (strcmp(mod->name, "driverloader") == 0)
2849 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2658 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2850 2659
2851 /* lve claims to be GPL but upstream won't provide source */
2852 if (strcmp(mod->name, "lve") == 0)
2853 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
2854
2855#ifdef CONFIG_MODVERSIONS 2660#ifdef CONFIG_MODVERSIONS
2856 if ((mod->num_syms && !mod->crcs) 2661 if ((mod->num_syms && !mod->crcs)
2857 || (mod->num_gpl_syms && !mod->gpl_crcs) 2662 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2899,18 +2704,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2899 return 0; 2704 return 0;
2900} 2705}
2901 2706
2902static struct module *layout_and_allocate(struct load_info *info, int flags) 2707static struct module *layout_and_allocate(struct load_info *info)
2903{ 2708{
2904 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
2905 struct module *mod; 2710 struct module *mod;
2906 Elf_Shdr *pcpusec; 2711 Elf_Shdr *pcpusec;
2907 int err; 2712 int err;
2908 2713
2909 mod = setup_load_info(info, flags); 2714 mod = setup_load_info(info);
2910 if (IS_ERR(mod)) 2715 if (IS_ERR(mod))
2911 return mod; 2716 return mod;
2912 2717
2913 err = check_modinfo(mod, info, flags); 2718 err = check_modinfo(mod, info);
2914 if (err) 2719 if (err)
2915 return ERR_PTR(err); 2720 return ERR_PTR(err);
2916 2721
@@ -2934,18 +2739,27 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 2934 this is done generically; there don't appear to be any 2739 this is done generically; there don't appear to be any
2935 special cases for the architectures. */ 2740 special cases for the architectures. */
2936 layout_sections(mod, info); 2741 layout_sections(mod, info);
2742
2743 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2744 * sizeof(long), GFP_KERNEL);
2745 if (!info->strmap) {
2746 err = -ENOMEM;
2747 goto free_percpu;
2748 }
2937 layout_symtab(mod, info); 2749 layout_symtab(mod, info);
2938 2750
2939 /* Allocate and move to the final place */ 2751 /* Allocate and move to the final place */
2940 err = move_module(mod, info); 2752 err = move_module(mod, info);
2941 if (err) 2753 if (err)
2942 goto free_percpu; 2754 goto free_strmap;
2943 2755
2944 /* Module has been copied to its final place now: return it. */ 2756 /* Module has been copied to its final place now: return it. */
2945 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2757 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2946 kmemleak_load_module(mod, info); 2758 kmemleak_load_module(mod, info);
2947 return mod; 2759 return mod;
2948 2760
2761free_strmap:
2762 kfree(info->strmap);
2949free_percpu: 2763free_percpu:
2950 percpu_modfree(mod); 2764 percpu_modfree(mod);
2951out: 2765out:
@@ -2955,6 +2769,7 @@ out:
2955/* mod is no longer valid after this! */ 2769/* mod is no longer valid after this! */
2956static void module_deallocate(struct module *mod, struct load_info *info) 2770static void module_deallocate(struct module *mod, struct load_info *info)
2957{ 2771{
2772 kfree(info->strmap);
2958 percpu_modfree(mod); 2773 percpu_modfree(mod);
2959 module_free(mod, mod->module_init); 2774 module_free(mod, mod->module_init);
2960 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
@@ -2983,142 +2798,31 @@ static int post_relocation(struct module *mod, const struct load_info *info)
2983 return module_finalize(info->hdr, info->sechdrs, mod); 2798 return module_finalize(info->hdr, info->sechdrs, mod);
2984} 2799}
2985 2800
2986/* Is this module of this name done loading? No locks held. */
2987static bool finished_loading(const char *name)
2988{
2989 struct module *mod;
2990 bool ret;
2991
2992 mutex_lock(&module_mutex);
2993 mod = find_module(name);
2994 ret = !mod || mod->state != MODULE_STATE_COMING;
2995 mutex_unlock(&module_mutex);
2996
2997 return ret;
2998}
2999
3000/* Call module constructors. */
3001static void do_mod_ctors(struct module *mod)
3002{
3003#ifdef CONFIG_CONSTRUCTORS
3004 unsigned long i;
3005
3006 for (i = 0; i < mod->num_ctors; i++)
3007 mod->ctors[i]();
3008#endif
3009}
3010
3011/* This is where the real work happens */
3012static int do_init_module(struct module *mod)
3013{
3014 int ret = 0;
3015
3016 blocking_notifier_call_chain(&module_notify_list,
3017 MODULE_STATE_COMING, mod);
3018
3019 /* Set RO and NX regions for core */
3020 set_section_ro_nx(mod->module_core,
3021 mod->core_text_size,
3022 mod->core_ro_size,
3023 mod->core_size);
3024
3025 /* Set RO and NX regions for init */
3026 set_section_ro_nx(mod->module_init,
3027 mod->init_text_size,
3028 mod->init_ro_size,
3029 mod->init_size);
3030
3031 do_mod_ctors(mod);
3032 /* Start the module */
3033 if (mod->init != NULL)
3034 ret = do_one_initcall(mod->init);
3035 if (ret < 0) {
3036 /* Init routine failed: abort. Try to protect us from
3037 buggy refcounters. */
3038 mod->state = MODULE_STATE_GOING;
3039 synchronize_sched();
3040 module_put(mod);
3041 blocking_notifier_call_chain(&module_notify_list,
3042 MODULE_STATE_GOING, mod);
3043 free_module(mod);
3044 wake_up_all(&module_wq);
3045 return ret;
3046 }
3047 if (ret > 0) {
3048 printk(KERN_WARNING
3049"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3050"%s: loading module anyway...\n",
3051 __func__, mod->name, ret,
3052 __func__);
3053 dump_stack();
3054 }
3055
3056 /* Now it's a first class citizen! */
3057 mod->state = MODULE_STATE_LIVE;
3058 blocking_notifier_call_chain(&module_notify_list,
3059 MODULE_STATE_LIVE, mod);
3060
3061 /* We need to finish all async code before the module init sequence is done */
3062 async_synchronize_full();
3063
3064 mutex_lock(&module_mutex);
3065 /* Drop initial reference. */
3066 module_put(mod);
3067 trim_init_extable(mod);
3068#ifdef CONFIG_KALLSYMS
3069 mod->num_symtab = mod->core_num_syms;
3070 mod->symtab = mod->core_symtab;
3071 mod->strtab = mod->core_strtab;
3072#endif
3073 unset_module_init_ro_nx(mod);
3074 module_free(mod, mod->module_init);
3075 mod->module_init = NULL;
3076 mod->init_size = 0;
3077 mod->init_ro_size = 0;
3078 mod->init_text_size = 0;
3079 mutex_unlock(&module_mutex);
3080 wake_up_all(&module_wq);
3081
3082 return 0;
3083}
3084
3085static int may_init_module(void)
3086{
3087 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3088 return -EPERM;
3089
3090 return 0;
3091}
3092
3093/* Allocate and load the module: note that size of section 0 is always 2801/* Allocate and load the module: note that size of section 0 is always
3094 zero, and we rely on this for optional sections. */ 2802 zero, and we rely on this for optional sections. */
3095static int load_module(struct load_info *info, const char __user *uargs, 2803static struct module *load_module(void __user *umod,
3096 int flags) 2804 unsigned long len,
2805 const char __user *uargs)
3097{ 2806{
3098 struct module *mod, *old; 2807 struct load_info info = { NULL, };
2808 struct module *mod;
3099 long err; 2809 long err;
3100 2810
3101 err = module_sig_check(info); 2811 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
3102 if (err) 2812 umod, len, uargs);
3103 goto free_copy;
3104 2813
3105 err = elf_header_check(info); 2814 /* Copy in the blobs from userspace, check they are vaguely sane. */
2815 err = copy_and_check(&info, umod, len, uargs);
3106 if (err) 2816 if (err)
3107 goto free_copy; 2817 return ERR_PTR(err);
3108 2818
3109 /* Figure out module layout, and allocate all the memory. */ 2819 /* Figure out module layout, and allocate all the memory. */
3110 mod = layout_and_allocate(info, flags); 2820 mod = layout_and_allocate(&info);
3111 if (IS_ERR(mod)) { 2821 if (IS_ERR(mod)) {
3112 err = PTR_ERR(mod); 2822 err = PTR_ERR(mod);
3113 goto free_copy; 2823 goto free_copy;
3114 } 2824 }
3115 2825
3116#ifdef CONFIG_MODULE_SIG
3117 mod->sig_ok = info->sig_ok;
3118 if (!mod->sig_ok)
3119 add_taint_module(mod, TAINT_FORCED_MODULE);
3120#endif
3121
3122 /* Now module is in final location, initialize linked lists, etc. */ 2826 /* Now module is in final location, initialize linked lists, etc. */
3123 err = module_unload_init(mod); 2827 err = module_unload_init(mod);
3124 if (err) 2828 if (err)
@@ -3126,25 +2830,25 @@ static int load_module(struct load_info *info, const char __user *uargs,
3126 2830
3127 /* Now we've got everything in the final locations, we can 2831 /* Now we've got everything in the final locations, we can
3128 * find optional sections. */ 2832 * find optional sections. */
3129 find_module_sections(mod, info); 2833 find_module_sections(mod, &info);
3130 2834
3131 err = check_module_license_and_versions(mod); 2835 err = check_module_license_and_versions(mod);
3132 if (err) 2836 if (err)
3133 goto free_unload; 2837 goto free_unload;
3134 2838
3135 /* Set up MODINFO_ATTR fields */ 2839 /* Set up MODINFO_ATTR fields */
3136 setup_modinfo(mod, info); 2840 setup_modinfo(mod, &info);
3137 2841
3138 /* Fix up syms, so that st_value is a pointer to location. */ 2842 /* Fix up syms, so that st_value is a pointer to location. */
3139 err = simplify_symbols(mod, info); 2843 err = simplify_symbols(mod, &info);
3140 if (err < 0) 2844 if (err < 0)
3141 goto free_modinfo; 2845 goto free_modinfo;
3142 2846
3143 err = apply_relocations(mod, info); 2847 err = apply_relocations(mod, &info);
3144 if (err < 0) 2848 if (err < 0)
3145 goto free_modinfo; 2849 goto free_modinfo;
3146 2850
3147 err = post_relocation(mod, info); 2851 err = post_relocation(mod, &info);
3148 if (err < 0) 2852 if (err < 0)
3149 goto free_modinfo; 2853 goto free_modinfo;
3150 2854
@@ -3167,61 +2871,52 @@ static int load_module(struct load_info *info, const char __user *uargs,
3167 * function to insert in a way safe to concurrent readers. 2871 * function to insert in a way safe to concurrent readers.
3168 * The mutex protects against concurrent writers. 2872 * The mutex protects against concurrent writers.
3169 */ 2873 */
3170again:
3171 mutex_lock(&module_mutex); 2874 mutex_lock(&module_mutex);
3172 if ((old = find_module(mod->name)) != NULL) { 2875 if (find_module(mod->name)) {
3173 if (old->state == MODULE_STATE_COMING) {
3174 /* Wait in case it fails to load. */
3175 mutex_unlock(&module_mutex);
3176 err = wait_event_interruptible(module_wq,
3177 finished_loading(mod->name));
3178 if (err)
3179 goto free_arch_cleanup;
3180 goto again;
3181 }
3182 err = -EEXIST; 2876 err = -EEXIST;
3183 goto unlock; 2877 goto unlock;
3184 } 2878 }
3185 2879
3186 /* This has to be done once we're sure module name is unique. */ 2880 /* This has to be done once we're sure module name is unique. */
3187 dynamic_debug_setup(info->debug, info->num_debug); 2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2882 dynamic_debug_setup(info.debug, info.num_debug);
3188 2883
3189 /* Find duplicate symbols */ 2884 /* Find duplicate symbols */
3190 err = verify_export_symbols(mod); 2885 err = verify_export_symbols(mod);
3191 if (err < 0) 2886 if (err < 0)
3192 goto ddebug; 2887 goto ddebug;
3193 2888
3194 module_bug_finalize(info->hdr, info->sechdrs, mod); 2889 module_bug_finalize(info.hdr, info.sechdrs, mod);
3195 list_add_rcu(&mod->list, &modules); 2890 list_add_rcu(&mod->list, &modules);
3196 mutex_unlock(&module_mutex); 2891 mutex_unlock(&module_mutex);
3197 2892
3198 /* Module is ready to execute: parsing args may do that. */ 2893 /* Module is ready to execute: parsing args may do that. */
3199 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 2894 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
3200 -32768, 32767, &ddebug_dyndbg_module_param_cb);
3201 if (err < 0) 2895 if (err < 0)
3202 goto unlink; 2896 goto unlink;
3203 2897
 3204 /* Link in to sysfs. */ 2898 /* Link in to sysfs. */
3205 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 2899 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
3206 if (err < 0) 2900 if (err < 0)
3207 goto unlink; 2901 goto unlink;
3208 2902
3209 /* Get rid of temporary copy. */ 2903 /* Get rid of temporary copy and strmap. */
3210 free_copy(info); 2904 kfree(info.strmap);
2905 free_copy(&info);
3211 2906
3212 /* Done! */ 2907 /* Done! */
3213 trace_module_load(mod); 2908 trace_module_load(mod);
3214 2909 return mod;
3215 return do_init_module(mod);
3216 2910
3217 unlink: 2911 unlink:
3218 mutex_lock(&module_mutex); 2912 mutex_lock(&module_mutex);
3219 /* Unlink carefully: kallsyms could be walking list. */ 2913 /* Unlink carefully: kallsyms could be walking list. */
3220 list_del_rcu(&mod->list); 2914 list_del_rcu(&mod->list);
3221 module_bug_cleanup(mod); 2915 module_bug_cleanup(mod);
3222 wake_up_all(&module_wq); 2916
3223 ddebug: 2917 ddebug:
3224 dynamic_debug_remove(info->debug); 2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2919 dynamic_debug_remove(info.debug);
3225 unlock: 2920 unlock:
3226 mutex_unlock(&module_mutex); 2921 mutex_unlock(&module_mutex);
3227 synchronize_sched(); 2922 synchronize_sched();
@@ -3233,52 +2928,106 @@ again:
3233 free_unload: 2928 free_unload:
3234 module_unload_free(mod); 2929 module_unload_free(mod);
3235 free_module: 2930 free_module:
3236 module_deallocate(mod, info); 2931 module_deallocate(mod, &info);
3237 free_copy: 2932 free_copy:
3238 free_copy(info); 2933 free_copy(&info);
3239 return err; 2934 return ERR_PTR(err);
3240} 2935}
3241 2936
2937/* Call module constructors. */
2938static void do_mod_ctors(struct module *mod)
2939{
2940#ifdef CONFIG_CONSTRUCTORS
2941 unsigned long i;
2942
2943 for (i = 0; i < mod->num_ctors; i++)
2944 mod->ctors[i]();
2945#endif
2946}
2947
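do_mod_ctors() above simply walks the module's constructor table and calls each entry. A standalone sketch of the same loop is shown below, with the constructor table built by hand instead of being taken from the module's constructor section.

/* Userspace sketch of do_mod_ctors(): call an array of constructor
 * function pointers in order.  The table here is hand-built. */
#include <stdio.h>

typedef void (*ctor_fn_t)(void);

static void ctor_a(void) { puts("ctor_a"); }
static void ctor_b(void) { puts("ctor_b"); }

static void run_ctors(ctor_fn_t *ctors, unsigned long num_ctors)
{
    unsigned long i;

    for (i = 0; i < num_ctors; i++)
        ctors[i]();
}

int main(void)
{
    ctor_fn_t ctors[] = { ctor_a, ctor_b };

    run_ctors(ctors, sizeof(ctors) / sizeof(ctors[0]));
    return 0;
}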
2948/* This is where the real work happens */
3242SYSCALL_DEFINE3(init_module, void __user *, umod, 2949SYSCALL_DEFINE3(init_module, void __user *, umod,
3243 unsigned long, len, const char __user *, uargs) 2950 unsigned long, len, const char __user *, uargs)
3244{ 2951{
3245 int err; 2952 struct module *mod;
3246 struct load_info info = { }; 2953 int ret = 0;
3247 2954
3248 err = may_init_module(); 2955 /* Must have permission */
3249 if (err) 2956 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3250 return err; 2957 return -EPERM;
3251 2958
3252 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n", 2959 /* Do all the hard work */
3253 umod, len, uargs); 2960 mod = load_module(umod, len, uargs);
2961 if (IS_ERR(mod))
2962 return PTR_ERR(mod);
3254 2963
3255 err = copy_module_from_user(umod, len, &info); 2964 blocking_notifier_call_chain(&module_notify_list,
3256 if (err) 2965 MODULE_STATE_COMING, mod);
3257 return err;
3258 2966
3259 return load_module(&info, uargs, 0); 2967 /* Set RO and NX regions for core */
3260} 2968 set_section_ro_nx(mod->module_core,
2969 mod->core_text_size,
2970 mod->core_ro_size,
2971 mod->core_size);
3261 2972
3262SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) 2973 /* Set RO and NX regions for init */
3263{ 2974 set_section_ro_nx(mod->module_init,
3264 int err; 2975 mod->init_text_size,
3265 struct load_info info = { }; 2976 mod->init_ro_size,
2977 mod->init_size);
3266 2978
3267 err = may_init_module(); 2979 do_mod_ctors(mod);
3268 if (err) 2980 /* Start the module */
3269 return err; 2981 if (mod->init != NULL)
2982 ret = do_one_initcall(mod->init);
2983 if (ret < 0) {
2984 /* Init routine failed: abort. Try to protect us from
2985 buggy refcounters. */
2986 mod->state = MODULE_STATE_GOING;
2987 synchronize_sched();
2988 module_put(mod);
2989 blocking_notifier_call_chain(&module_notify_list,
2990 MODULE_STATE_GOING, mod);
2991 free_module(mod);
2992 wake_up(&module_wq);
2993 return ret;
2994 }
2995 if (ret > 0) {
2996 printk(KERN_WARNING
2997"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2998"%s: loading module anyway...\n",
2999 __func__, mod->name, ret,
3000 __func__);
3001 dump_stack();
3002 }
3270 3003
3271 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); 3004 /* Now it's a first class citizen! Wake up anyone waiting for it. */
3005 mod->state = MODULE_STATE_LIVE;
3006 wake_up(&module_wq);
3007 blocking_notifier_call_chain(&module_notify_list,
3008 MODULE_STATE_LIVE, mod);
3272 3009
3273 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS 3010 /* We need to finish all async code before the module init sequence is done */
3274 |MODULE_INIT_IGNORE_VERMAGIC)) 3011 async_synchronize_full();
3275 return -EINVAL;
3276 3012
3277 err = copy_module_from_fd(fd, &info); 3013 mutex_lock(&module_mutex);
3278 if (err) 3014 /* Drop initial reference. */
3279 return err; 3015 module_put(mod);
3016 trim_init_extable(mod);
3017#ifdef CONFIG_KALLSYMS
3018 mod->num_symtab = mod->core_num_syms;
3019 mod->symtab = mod->core_symtab;
3020 mod->strtab = mod->core_strtab;
3021#endif
3022 unset_module_init_ro_nx(mod);
3023 module_free(mod, mod->module_init);
3024 mod->module_init = NULL;
3025 mod->init_size = 0;
3026 mod->init_ro_size = 0;
3027 mod->init_text_size = 0;
3028 mutex_unlock(&module_mutex);
3280 3029
3281 return load_module(&info, uargs, flags); 3030 return 0;
3282} 3031}
3283 3032
3284static inline int within(unsigned long addr, void *start, unsigned long size) 3033static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3506,7 +3255,18 @@ static char *module_flags(struct module *mod, char *buf)
3506 mod->state == MODULE_STATE_GOING || 3255 mod->state == MODULE_STATE_GOING ||
3507 mod->state == MODULE_STATE_COMING) { 3256 mod->state == MODULE_STATE_COMING) {
3508 buf[bx++] = '('; 3257 buf[bx++] = '(';
3509 bx += module_flags_taint(mod, buf + bx); 3258 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
3259 buf[bx++] = 'P';
3260 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3261 buf[bx++] = 'F';
3262 if (mod->taints & (1 << TAINT_CRAP))
3263 buf[bx++] = 'C';
3264 /*
3265 * TAINT_FORCED_RMMOD: could be added.
3266 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3267 * apply to modules.
3268 */
3269
3510 /* Show a - for module-is-being-unloaded */ 3270 /* Show a - for module-is-being-unloaded */
3511 if (mod->state == MODULE_STATE_GOING) 3271 if (mod->state == MODULE_STATE_GOING)
3512 buf[bx++] = '-'; 3272 buf[bx++] = '-';
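The hunk above trades the shared module_flags_taint() helper for open-coded taint characters ('P', 'F', 'C'). A userspace sketch of that formatting is below; the TAINT_* bit numbers are assumed to match this kernel generation and are hard-coded for the demo.

/* Sketch of the taint-flag formatting shown above, as plain C. */
#include <stdio.h>

#define TAINT_PROPRIETARY_MODULE 0
#define TAINT_FORCED_MODULE      1
#define TAINT_CRAP               10

static int format_module_flags(unsigned int taints, char *buf)
{
    int bx = 0;

    buf[bx++] = '(';
    if (taints & (1u << TAINT_PROPRIETARY_MODULE))
        buf[bx++] = 'P';
    if (taints & (1u << TAINT_FORCED_MODULE))
        buf[bx++] = 'F';
    if (taints & (1u << TAINT_CRAP))
        buf[bx++] = 'C';
    buf[bx++] = ')';
    buf[bx] = '\0';
    return bx;
}

int main(void)
{
    char buf[8];

    format_module_flags((1u << TAINT_PROPRIETARY_MODULE) |
                        (1u << TAINT_CRAP), buf);
    printf("%s\n", buf);    /* prints "(PC)" */
    return 0;
}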
@@ -3727,3 +3487,50 @@ void module_layout(struct module *mod,
3727} 3487}
3728EXPORT_SYMBOL(module_layout); 3488EXPORT_SYMBOL(module_layout);
3729#endif 3489#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index f2970bddc5e..00000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,249 +0,0 @@
1/* Module signature checker
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include "module-internal.h"
18
19/*
20 * Module signature information block.
21 *
22 * The constituents of the signature section are, in order:
23 *
24 * - Signer's name
25 * - Key identifier
26 * - Signature data
27 * - Information block
28 */
29struct module_signature {
30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
32 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */
37};
38
39/*
40 * Digest the module contents.
41 */
42static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
43 const void *mod,
44 unsigned long modlen)
45{
46 struct public_key_signature *pks;
47 struct crypto_shash *tfm;
48 struct shash_desc *desc;
49 size_t digest_size, desc_size;
50 int ret;
51
52 pr_devel("==>%s()\n", __func__);
53
54 /* Allocate the hashing algorithm we're going to need and find out how
55 * big the hash operational data will be.
56 */
57 tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
58 if (IS_ERR(tfm))
59 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
60
61 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
62 digest_size = crypto_shash_digestsize(tfm);
63
64 /* We allocate the hash operational data storage on the end of our
65 * context data and the digest output buffer on the end of that.
66 */
67 ret = -ENOMEM;
68 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
69 if (!pks)
70 goto error_no_pks;
71
72 pks->pkey_hash_algo = hash;
73 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
74 pks->digest_size = digest_size;
75
76 desc = (void *)pks + sizeof(*pks);
77 desc->tfm = tfm;
78 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
79
80 ret = crypto_shash_init(desc);
81 if (ret < 0)
82 goto error;
83
84 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
85 if (ret < 0)
86 goto error;
87
88 crypto_free_shash(tfm);
89 pr_devel("<==%s() = ok\n", __func__);
90 return pks;
91
92error:
93 kfree(pks);
94error_no_pks:
95 crypto_free_shash(tfm);
96 pr_devel("<==%s() = %d\n", __func__, ret);
97 return ERR_PTR(ret);
98}
99
100/*
101 * Extract an MPI array from the signature data. This represents the actual
102 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
103 * size of the MPI in bytes.
104 *
105 * RSA signatures only have one MPI, so currently we only read one.
106 */
107static int mod_extract_mpi_array(struct public_key_signature *pks,
108 const void *data, size_t len)
109{
110 size_t nbytes;
111 MPI mpi;
112
113 if (len < 3)
114 return -EBADMSG;
115 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
116 data += 2;
117 len -= 2;
118 if (len != nbytes)
119 return -EBADMSG;
120
121 mpi = mpi_read_raw_data(data, nbytes);
122 if (!mpi)
123 return -ENOMEM;
124 pks->mpi[0] = mpi;
125 pks->nr_mpi = 1;
126 return 0;
127}
128
129/*
130 * Request an asymmetric key.
131 */
132static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
133 const u8 *key_id, size_t key_id_len)
134{
135 key_ref_t key;
136 size_t i;
137 char *id, *q;
138
139 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
140
141 /* Construct an identifier. */
142 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
143 if (!id)
144 return ERR_PTR(-ENOKEY);
145
146 memcpy(id, signer, signer_len);
147
148 q = id + signer_len;
149 *q++ = ':';
150 *q++ = ' ';
151 for (i = 0; i < key_id_len; i++) {
152 *q++ = hex_asc[*key_id >> 4];
153 *q++ = hex_asc[*key_id++ & 0x0f];
154 }
155
156 *q = 0;
157
158 pr_debug("Look up: \"%s\"\n", id);
159
160 key = keyring_search(make_key_ref(modsign_keyring, 1),
161 &key_type_asymmetric, id);
162 if (IS_ERR(key))
163 pr_warn("Request for unknown module key '%s' err %ld\n",
164 id, PTR_ERR(key));
165 kfree(id);
166
167 if (IS_ERR(key)) {
168 switch (PTR_ERR(key)) {
169 /* Hide some search errors */
170 case -EACCES:
171 case -ENOTDIR:
172 case -EAGAIN:
173 return ERR_PTR(-ENOKEY);
174 default:
175 return ERR_CAST(key);
176 }
177 }
178
179 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
180 return key_ref_to_ptr(key);
181}
182
183/*
184 * Verify the signature on a module.
185 */
186int mod_verify_sig(const void *mod, unsigned long *_modlen)
187{
188 struct public_key_signature *pks;
189 struct module_signature ms;
190 struct key *key;
191 const void *sig;
192 size_t modlen = *_modlen, sig_len;
193 int ret;
194
195 pr_devel("==>%s(,%zu)\n", __func__, modlen);
196
197 if (modlen <= sizeof(ms))
198 return -EBADMSG;
199
200 memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
201 modlen -= sizeof(ms);
202
203 sig_len = be32_to_cpu(ms.sig_len);
204 if (sig_len >= modlen)
205 return -EBADMSG;
206 modlen -= sig_len;
207 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
208 return -EBADMSG;
209 modlen -= (size_t)ms.signer_len + ms.key_id_len;
210
211 *_modlen = modlen;
212 sig = mod + modlen;
213
214 /* For the moment, only support RSA and X.509 identifiers */
215 if (ms.algo != PKEY_ALGO_RSA ||
216 ms.id_type != PKEY_ID_X509)
217 return -ENOPKG;
218
219 if (ms.hash >= PKEY_HASH__LAST ||
220 !pkey_hash_algo[ms.hash])
221 return -ENOPKG;
222
223 key = request_asymmetric_key(sig, ms.signer_len,
224 sig + ms.signer_len, ms.key_id_len);
225 if (IS_ERR(key))
226 return PTR_ERR(key);
227
228 pks = mod_make_digest(ms.hash, mod, modlen);
229 if (IS_ERR(pks)) {
230 ret = PTR_ERR(pks);
231 goto error_put_key;
232 }
233
234 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
235 sig_len);
236 if (ret < 0)
237 goto error_free_pks;
238
239 ret = verify_signature(key, pks);
240 pr_devel("verify_signature() = %d\n", ret);
241
242error_free_pks:
243 mpi_free(pks->rsa.s);
244 kfree(pks);
245error_put_key:
246 key_put(key);
247 pr_devel("<==%s() = %d\n", __func__, ret);
248 return ret;
249}
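The arithmetic in mod_verify_sig() above strips the struct module_signature trailer off the end of a signed .ko and locates the signer name, key identifier and signature data immediately in front of it. Below is a userspace sketch of just that slicing, with the crypto verification omitted and the test buffer fabricated; the layout and field names follow the struct shown in the deleted file.

/* Userspace sketch of the signature-trailer parsing in mod_verify_sig(). */
#include <arpa/inet.h>  /* ntohl()/htonl() for the big-endian sig_len */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct module_signature {
    uint8_t  algo;        /* public-key algorithm */
    uint8_t  hash;        /* digest algorithm */
    uint8_t  id_type;     /* key identifier type */
    uint8_t  signer_len;  /* length of signer's name */
    uint8_t  key_id_len;  /* length of key identifier */
    uint8_t  pad[3];
    uint32_t sig_len;     /* big-endian length of signature data */
};

/* Shrink *modlen to the digested payload and return the offsets of the
 * signer name, key identifier and signature data. */
static int split_signed_module(const unsigned char *mod, size_t *modlen,
                               size_t *signer_off, size_t *key_id_off,
                               size_t *sig_off, size_t *sig_len)
{
    struct module_signature ms;
    size_t len = *modlen;

    if (len <= sizeof(ms))
        return -1;
    memcpy(&ms, mod + len - sizeof(ms), sizeof(ms));
    len -= sizeof(ms);

    *sig_len = ntohl(ms.sig_len);
    if (*sig_len >= len)
        return -1;
    len -= *sig_len;
    if ((size_t)ms.signer_len + ms.key_id_len >= len)
        return -1;
    len -= (size_t)ms.signer_len + ms.key_id_len;

    *modlen = len;                  /* data that was digested */
    *signer_off = len;
    *key_id_off = len + ms.signer_len;
    *sig_off = len + ms.signer_len + ms.key_id_len;
    return 0;
}

int main(void)
{
    /* Fabricated 4-byte "module" + trailer, just to exercise the parser. */
    unsigned char buf[4 + 6 + 2 + 8 + sizeof(struct module_signature)];
    struct module_signature ms = { .signer_len = 6, .key_id_len = 2,
                                   .sig_len = htonl(8) };
    size_t modlen = sizeof(buf), signer, key_id, sig, sig_len;

    memset(buf, 0, sizeof(buf));
    memcpy(buf + sizeof(buf) - sizeof(ms), &ms, sizeof(ms));
    if (split_signed_module(buf, &modlen, &signer, &key_id, &sig, &sig_len) == 0)
        printf("payload=%zu signer@%zu key_id@%zu sig@%zu (%zu bytes)\n",
               modlen, signer, key_id, sig, sig_len);
    return 0;
}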
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 7e3443fe1f4..73da83aff41 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c952..d607ed5dd44 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/export.h> 22#include <linux/module.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/debug_locks.h> 25#include <linux/debug_locks.h>
@@ -240,7 +240,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
240 240
241 /* didn't get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
242 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
243 schedule_preempt_disabled(); 243 preempt_enable_no_resched();
244 schedule();
245 preempt_disable();
244 spin_lock_mutex(&lock->wait_lock, flags); 246 spin_lock_mutex(&lock->wait_lock, flags);
245 } 247 }
246 248
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7..8d7b435806c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
1#include <linux/kdebug.h> 1#include <linux/kdebug.h>
2#include <linux/kprobes.h> 2#include <linux/kprobes.h>
3#include <linux/export.h> 3#include <linux/module.h>
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb2016..9aeab4b98c6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
@@ -57,8 +57,7 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct user_namespace *user_ns, 60 struct task_struct *tsk, struct fs_struct *new_fs)
61 struct fs_struct *new_fs)
62{ 61{
63 struct nsproxy *new_nsp; 62 struct nsproxy *new_nsp;
64 int err; 63 int err;
@@ -67,31 +66,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
67 if (!new_nsp) 66 if (!new_nsp)
68 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
69 68
70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); 69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
71 if (IS_ERR(new_nsp->mnt_ns)) { 70 if (IS_ERR(new_nsp->mnt_ns)) {
72 err = PTR_ERR(new_nsp->mnt_ns); 71 err = PTR_ERR(new_nsp->mnt_ns);
73 goto out_ns; 72 goto out_ns;
74 } 73 }
75 74
76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); 75 new_nsp->uts_ns = copy_utsname(flags, tsk);
77 if (IS_ERR(new_nsp->uts_ns)) { 76 if (IS_ERR(new_nsp->uts_ns)) {
78 err = PTR_ERR(new_nsp->uts_ns); 77 err = PTR_ERR(new_nsp->uts_ns);
79 goto out_uts; 78 goto out_uts;
80 } 79 }
81 80
82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); 81 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
83 if (IS_ERR(new_nsp->ipc_ns)) { 82 if (IS_ERR(new_nsp->ipc_ns)) {
84 err = PTR_ERR(new_nsp->ipc_ns); 83 err = PTR_ERR(new_nsp->ipc_ns);
85 goto out_ipc; 84 goto out_ipc;
86 } 85 }
87 86
88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
89 if (IS_ERR(new_nsp->pid_ns)) { 88 if (IS_ERR(new_nsp->pid_ns)) {
90 err = PTR_ERR(new_nsp->pid_ns); 89 err = PTR_ERR(new_nsp->pid_ns);
91 goto out_pid; 90 goto out_pid;
92 } 91 }
93 92
94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); 93 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
95 if (IS_ERR(new_nsp->net_ns)) { 94 if (IS_ERR(new_nsp->net_ns)) {
96 err = PTR_ERR(new_nsp->net_ns); 95 err = PTR_ERR(new_nsp->net_ns);
97 goto out_net; 96 goto out_net;
@@ -123,7 +122,6 @@ out_ns:
123int copy_namespaces(unsigned long flags, struct task_struct *tsk) 122int copy_namespaces(unsigned long flags, struct task_struct *tsk)
124{ 123{
125 struct nsproxy *old_ns = tsk->nsproxy; 124 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
127 struct nsproxy *new_ns; 125 struct nsproxy *new_ns;
128 int err = 0; 126 int err = 0;
129 127
@@ -136,7 +134,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
136 CLONE_NEWPID | CLONE_NEWNET))) 134 CLONE_NEWPID | CLONE_NEWNET)))
137 return 0; 135 return 0;
138 136
139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { 137 if (!capable(CAP_SYS_ADMIN)) {
140 err = -EPERM; 138 err = -EPERM;
141 goto out; 139 goto out;
142 } 140 }
@@ -153,8 +151,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
153 goto out; 151 goto out;
154 } 152 }
155 153
156 new_ns = create_new_namespaces(flags, tsk, 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
157 task_cred_xxx(tsk, user_ns), tsk->fs);
158 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
159 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
160 goto out; 157 goto out;
@@ -186,21 +183,19 @@ void free_nsproxy(struct nsproxy *ns)
186 * On success, returns the new nsproxy. 183 * On success, returns the new nsproxy.
187 */ 184 */
188int unshare_nsproxy_namespaces(unsigned long unshare_flags, 185int unshare_nsproxy_namespaces(unsigned long unshare_flags,
189 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) 186 struct nsproxy **new_nsp, struct fs_struct *new_fs)
190{ 187{
191 struct user_namespace *user_ns;
192 int err = 0; 188 int err = 0;
193 189
194 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
195 CLONE_NEWNET | CLONE_NEWPID))) 191 CLONE_NEWNET)))
196 return 0; 192 return 0;
197 193
198 user_ns = new_cred ? new_cred->user_ns : current_user_ns(); 194 if (!capable(CAP_SYS_ADMIN))
199 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
200 return -EPERM; 195 return -EPERM;
201 196
202 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, 197 *new_nsp = create_new_namespaces(unshare_flags, current,
203 new_fs ? new_fs : current->fs); 198 new_fs ? new_fs : current->fs);
204 if (IS_ERR(*new_nsp)) { 199 if (IS_ERR(*new_nsp)) {
205 err = PTR_ERR(*new_nsp); 200 err = PTR_ERR(*new_nsp);
206 goto out; 201 goto out;
@@ -246,6 +241,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
246 struct file *file; 241 struct file *file;
247 int err; 242 int err;
248 243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
249 file = proc_ns_fget(fd); 247 file = proc_ns_fget(fd);
250 if (IS_ERR(file)) 248 if (IS_ERR(file))
251 return PTR_ERR(file); 249 return PTR_ERR(file);
@@ -256,7 +254,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
256 if (nstype && (ops->type != nstype)) 254 if (nstype && (ops->type != nstype))
257 goto out; 255 goto out;
258 256
259 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
260 if (IS_ERR(new_nsproxy)) { 258 if (IS_ERR(new_nsproxy)) {
261 err = PTR_ERR(new_nsproxy); 259 err = PTR_ERR(new_nsproxy);
262 goto out; 260 goto out;
diff --git a/kernel/padata.c b/kernel/padata.c
index 072f4ee4eb8..b91941df5e6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * padata.c - generic interface to process data streams in parallel 2 * padata.c - generic interface to process data streams in parallel
3 * 3 *
4 * See Documentation/padata.txt for an api documentation.
5 *
6 * Copyright (C) 2008, 2009 secunet Security Networks AG 4 * Copyright (C) 2008, 2009 secunet Security Networks AG
7 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> 5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
8 * 6 *
@@ -20,7 +18,7 @@
20 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
21 */ 19 */
22 20
23#include <linux/export.h> 21#include <linux/module.h>
24#include <linux/cpumask.h> 22#include <linux/cpumask.h>
25#include <linux/err.h> 23#include <linux/err.h>
26#include <linux/cpu.h> 24#include <linux/cpu.h>
@@ -31,6 +29,7 @@
31#include <linux/sysfs.h> 29#include <linux/sysfs.h>
32#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
33 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
34#define MAX_OBJ_NUM 1000 33#define MAX_OBJ_NUM 1000
35 34
36static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -44,19 +43,18 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
44 return target_cpu; 43 return target_cpu;
45} 44}
46 45
47static int padata_cpu_hash(struct parallel_data *pd) 46static int padata_cpu_hash(struct padata_priv *padata)
48{ 47{
49 int cpu_index; 48 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
50 52
51 /* 53 /*
52 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
54 */ 56 */
55 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 spin_lock(&pd->seq_lock);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
60 58
61 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
62} 60}
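padata_cpu_hash() above spreads objects over the parallel-worker CPUs by taking the sequence number modulo the number of CPUs in the pcpu mask and mapping that index back to a real CPU id. A standalone sketch follows, with the cpumask replaced by a plain array and the CPU ids made up for the demo.

/* Userspace sketch of padata's seq_nr -> CPU hashing. */
#include <stdio.h>

static int index_to_cpu(const int *pcpu_mask, int cpu_index)
{
    return pcpu_mask[cpu_index];    /* nth set bit in the real cpumask */
}

static int cpu_hash(unsigned int seq_nr, const int *pcpu_mask, int num_cpus)
{
    return index_to_cpu(pcpu_mask, seq_nr % num_cpus);
}

int main(void)
{
    const int pcpu_mask[] = { 1, 2, 5, 7 };  /* hypothetical worker CPUs */
    unsigned int seq;

    for (seq = 0; seq < 8; seq++)
        printf("seq %u -> cpu %d\n", seq, cpu_hash(seq, pcpu_mask, 4));
    return 0;
}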
@@ -134,7 +132,12 @@ int padata_do_parallel(struct padata_instance *pinst,
134 padata->pd = pd; 132 padata->pd = pd;
135 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
136 134
137 target_cpu = padata_cpu_hash(pd); 135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
138 queue = per_cpu_ptr(pd->pqueue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
139 142
140 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
@@ -170,8 +173,8 @@ EXPORT_SYMBOL(padata_do_parallel);
170static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 174{
172 int cpu, num_cpus; 175 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 176 int next_nr, next_index;
174 struct padata_parallel_queue *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
175 struct padata_priv *padata; 178 struct padata_priv *padata;
176 struct padata_list *reorder; 179 struct padata_list *reorder;
177 180
@@ -186,6 +189,14 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
186 cpu = padata_index_to_cpu(pd, next_index); 189 cpu = padata_index_to_cpu(pd, next_index);
187 next_queue = per_cpu_ptr(pd->pqueue, cpu); 190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
188 191
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
189 padata = NULL; 200 padata = NULL;
190 201
191 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
@@ -194,6 +205,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
194 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
195 struct padata_priv, list); 206 struct padata_priv, list);
196 207
208 BUG_ON(next_nr != padata->seq_nr);
209
197 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
198 list_del_init(&padata->list); 211 list_del_init(&padata->list);
199 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
@@ -204,7 +217,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 217 goto out;
205 } 218 }
206 219
207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
221 if (queue->cpu_index == next_queue->cpu_index) {
208 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
209 goto out; 223 goto out;
210 } 224 }
@@ -216,7 +230,6 @@ out:
216 230
217static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
218{ 232{
219 int cb_cpu;
220 struct padata_priv *padata; 233 struct padata_priv *padata;
221 struct padata_serial_queue *squeue; 234 struct padata_serial_queue *squeue;
222 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
@@ -257,14 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
257 return; 270 return;
258 } 271 }
259 272
260 cb_cpu = padata->cb_cpu; 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
261 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
262 274
263 spin_lock(&squeue->serial.lock); 275 spin_lock(&squeue->serial.lock);
264 list_add_tail(&padata->list, &squeue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
265 spin_unlock(&squeue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
266 278
267 queue_work_on(cb_cpu, pinst->wq, &squeue->work); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
268 } 280 }
269 281
270 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
@@ -355,13 +367,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
355 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
356 return -ENOMEM; 368 return -ENOMEM;
357 369
358 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
359 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { 371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
360 free_cpumask_var(pd->cpumask.cbcpu); 372 free_cpumask_var(pd->cpumask.cbcpu);
361 return -ENOMEM; 373 return -ENOMEM;
362 } 374 }
363 375
364 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
365 return 0; 377 return 0;
366} 378}
367 379
@@ -388,7 +400,7 @@ static void padata_init_squeues(struct parallel_data *pd)
388/* Initialize all percpu queues used by parallel workers */ 400/* Initialize all percpu queues used by parallel workers */
389static void padata_init_pqueues(struct parallel_data *pd) 401static void padata_init_pqueues(struct parallel_data *pd)
390{ 402{
391 int cpu_index, cpu; 403 int cpu_index, num_cpus, cpu;
392 struct padata_parallel_queue *pqueue; 404 struct padata_parallel_queue *pqueue;
393 405
394 cpu_index = 0; 406 cpu_index = 0;
@@ -403,6 +415,9 @@ static void padata_init_pqueues(struct parallel_data *pd)
403 INIT_WORK(&pqueue->work, padata_parallel_worker); 415 INIT_WORK(&pqueue->work, padata_parallel_worker);
404 atomic_set(&pqueue->num_obj, 0); 416 atomic_set(&pqueue->num_obj, 0);
405 } 417 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
406} 421}
407 422
 408/* Allocate and initialize the internal cpumask dependent resources. */ 423
@@ -429,7 +444,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
429 padata_init_pqueues(pd); 444 padata_init_pqueues(pd);
430 padata_init_squeues(pd); 445 padata_init_squeues(pd);
431 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
432 pd->seq_nr = 0; 447 atomic_set(&pd->seq_nr, -1);
433 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
434 atomic_set(&pd->refcnt, 0); 449 atomic_set(&pd->refcnt, 0);
435 pd->pinst = pinst; 450 pd->pinst = pinst;
@@ -565,7 +580,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
565static bool padata_validate_cpumask(struct padata_instance *pinst, 580static bool padata_validate_cpumask(struct padata_instance *pinst,
566 const struct cpumask *cpumask) 581 const struct cpumask *cpumask)
567{ 582{
568 if (!cpumask_intersects(cpumask, cpu_online_mask)) { 583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
569 pinst->flags |= PADATA_INVALID; 584 pinst->flags |= PADATA_INVALID;
570 return false; 585 return false;
571 } 586 }
@@ -679,7 +694,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
679{ 694{
680 struct parallel_data *pd; 695 struct parallel_data *pd;
681 696
682 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
683 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
684 pinst->cpumask.cbcpu); 699 pinst->cpumask.cbcpu);
685 if (!pd) 700 if (!pd)
@@ -747,9 +762,6 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
747 return -ENOMEM; 762 return -ENOMEM;
748 763
749 padata_replace(pinst, pd); 764 padata_replace(pinst, pd);
750
751 cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
752 cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
753 } 765 }
754 766
755 return 0; 767 return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff9..41fc78ea3db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,13 +27,19 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; 30/* Machine specific panic information string */
31char *mach_panic_string;
32
33int panic_on_oops;
31static unsigned long tainted_mask; 34static unsigned long tainted_mask;
32static int pause_on_oops; 35static int pause_on_oops;
33static int pause_on_oops_flag; 36static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 37static DEFINE_SPINLOCK(pause_on_oops_lock);
35 38
36int panic_timeout; 39#ifndef CONFIG_PANIC_TIMEOUT
40#define CONFIG_PANIC_TIMEOUT 0
41#endif
42int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 43EXPORT_SYMBOL_GPL(panic_timeout);
38 44
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 45ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -49,15 +55,6 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 55long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 56EXPORT_SYMBOL(panic_blink);
51 57
52/*
53 * Stop ourself in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
61/** 58/**
62 * panic - halt the system 59 * panic - halt the system
63 * @fmt: The text string to print 60 * @fmt: The text string to print
@@ -66,34 +63,19 @@ void __weak panic_smp_self_stop(void)
66 * 63 *
67 * This function never returns. 64 * This function never returns.
68 */ 65 */
69void panic(const char *fmt, ...) 66NORET_TYPE void panic(const char * fmt, ...)
70{ 67{
71 static DEFINE_SPINLOCK(panic_lock);
72 static char buf[1024]; 68 static char buf[1024];
73 va_list args; 69 va_list args;
74 long i, i_next = 0; 70 long i, i_next = 0;
75 int state = 0; 71 int state = 0;
76 72
77 /* 73 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
86 * It's possible to come here directly from a panic-assertion and 74 * It's possible to come here directly from a panic-assertion and
87 * not have preempt disabled. Some functions called from here want 75 * not have preempt disabled. Some functions called from here want
88 * preempt to be disabled. No point enabling it later though... 76 * preempt to be disabled. No point enabling it later though...
89 *
90 * Only one CPU is allowed to execute the panic code from here. For
91 * multiple parallel invocations of panic, all other CPUs either
92 * stop themself or will wait until they are stopped by the 1st CPU
93 * with smp_send_stop().
94 */ 77 */
95 if (!spin_trylock(&panic_lock)) 78 preempt_disable();
96 panic_smp_self_stop();
97 79
98 console_verbose(); 80 console_verbose();
99 bust_spinlocks(1); 81 bust_spinlocks(1);
@@ -102,11 +84,7 @@ void panic(const char *fmt, ...)
102 va_end(args); 84 va_end(args);
103 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 85 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
104#ifdef CONFIG_DEBUG_BUGVERBOSE 86#ifdef CONFIG_DEBUG_BUGVERBOSE
105 /* 87 dump_stack();
106 * Avoid nested stack-dumping if a panic occurs during oops processing
107 */
108 if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
109 dump_stack();
110#endif 88#endif
111 89
112 /* 90 /*
@@ -116,6 +94,8 @@ void panic(const char *fmt, ...)
116 */ 94 */
117 crash_kexec(NULL); 95 crash_kexec(NULL);
118 96
97 kmsg_dump(KMSG_DUMP_PANIC);
98
119 /* 99 /*
120 * Note smp_send_stop is the usual smp shutdown function, which 100 * Note smp_send_stop is the usual smp shutdown function, which
121 * unfortunately means it may not be hardened to work in a panic 101 * unfortunately means it may not be hardened to work in a panic
@@ -123,8 +103,6 @@ void panic(const char *fmt, ...)
123 */ 103 */
124 smp_send_stop(); 104 smp_send_stop();
125 105
126 kmsg_dump(KMSG_DUMP_PANIC);
127
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 106 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 107
130 bust_spinlocks(0); 108 bust_spinlocks(0);
@@ -205,7 +183,6 @@ static const struct tnt tnts[] = {
205 { TAINT_WARN, 'W', ' ' }, 183 { TAINT_WARN, 'W', ' ' },
206 { TAINT_CRAP, 'C', ' ' }, 184 { TAINT_CRAP, 'C', ' ' },
207 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 185 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
208 { TAINT_OOT_MODULE, 'O', ' ' },
209}; 186};
210 187
211/** 188/**
@@ -223,7 +200,6 @@ static const struct tnt tnts[] = {
223 * 'W' - Taint on warning. 200 * 'W' - Taint on warning.
224 * 'C' - modules from drivers/staging are loaded. 201 * 'C' - modules from drivers/staging are loaded.
225 * 'I' - Working around severe firmware bug. 202 * 'I' - Working around severe firmware bug.
226 * 'O' - Out-of-tree module has been loaded.
227 * 203 *
228 * The string is overwritten by the next call to print_tainted(). 204 * The string is overwritten by the next call to print_tainted().
229 */ 205 */
@@ -265,20 +241,11 @@ void add_taint(unsigned flag)
265 * Can't trust the integrity of the kernel anymore. 241 * Can't trust the integrity of the kernel anymore.
266 * We don't call directly debug_locks_off() because the issue 242 * We don't call directly debug_locks_off() because the issue
267 * is not necessarily serious enough to set oops_in_progress to 1 243 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree 244 * Also we want to keep up lockdep for staging development and
269 * development and post-warning case. 245 * post-warning case.
270 */ 246 */
271 switch (flag) { 247 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
272 case TAINT_CRAP: 248 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 249
283 set_bit(flag, &tainted_mask); 250 set_bit(flag, &tainted_mask);
284} 251}
@@ -383,6 +350,11 @@ late_initcall(init_oops_id);
383void print_oops_end_marker(void) 350void print_oops_end_marker(void)
384{ 351{
385 init_oops_id(); 352 init_oops_id();
353
354 if (mach_panic_string)
355 printk(KERN_WARNING "Board Information: %s\n",
356 mach_panic_string);
357
386 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 358 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
387 (unsigned long long)oops_id); 359 (unsigned long long)oops_id);
388} 360}
diff --git a/kernel/params.c b/kernel/params.c
index ed35345be53..22df3e0d142 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,6 +15,7 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/moduleparam.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/string.h> 20#include <linux/string.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
@@ -24,6 +25,12 @@
24#include <linux/slab.h> 25#include <linux/slab.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
27/* Protects all parameters, and incidentally kmalloced_param list. */ 34/* Protects all parameters, and incidentally kmalloced_param list. */
28static DEFINE_MUTEX(param_lock); 35static DEFINE_MUTEX(param_lock);
29 36
@@ -60,38 +67,27 @@ static void maybe_kfree_parameter(void *param)
60 } 67 }
61} 68}
62 69
63static char dash2underscore(char c) 70static inline char dash2underscore(char c)
64{ 71{
65 if (c == '-') 72 if (c == '-')
66 return '_'; 73 return '_';
67 return c; 74 return c;
68} 75}
69 76
70bool parameqn(const char *a, const char *b, size_t n) 77static inline int parameq(const char *input, const char *paramname)
71{ 78{
72 size_t i; 79 unsigned int i;
73 80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
74 for (i = 0; i < n; i++) { 81 if (input[i] == '\0')
75 if (dash2underscore(a[i]) != dash2underscore(b[i])) 82 return 1;
76 return false; 83 return 0;
77 }
78 return true;
79}
80
81bool parameq(const char *a, const char *b)
82{
83 return parameqn(a, b, strlen(a)+1);
84} 84}
85 85
86static int parse_one(char *param, 86static int parse_one(char *param,
87 char *val, 87 char *val,
88 const char *doing,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
91 s16 min_level, 90 int (*handle_unknown)(char *param, char *val))
92 s16 max_level,
93 int (*handle_unknown)(char *param, char *val,
94 const char *doing))
95{ 91{
96 unsigned int i; 92 unsigned int i;
97 int err; 93 int err;
@@ -99,15 +95,11 @@ static int parse_one(char *param,
99 /* Find parameter */ 95 /* Find parameter */
100 for (i = 0; i < num_params; i++) { 96 for (i = 0; i < num_params; i++) {
101 if (parameq(param, params[i].name)) { 97 if (parameq(param, params[i].name)) {
102 if (params[i].level < min_level
103 || params[i].level > max_level)
104 return 0;
105 /* No one handled NULL, so do it here. */ 98 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool 99 if (!val && params[i].ops->set != param_set_bool)
107 && params[i].ops->set != param_set_bint)
108 return -EINVAL; 100 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 101 DEBUGP("They are equal! Calling %p\n",
110 params[i].ops->set); 102 params[i].ops->set);
111 mutex_lock(&param_lock); 103 mutex_lock(&param_lock);
112 err = params[i].ops->set(val, &params[i]); 104 err = params[i].ops->set(val, &params[i]);
113 mutex_unlock(&param_lock); 105 mutex_unlock(&param_lock);
@@ -116,11 +108,11 @@ static int parse_one(char *param,
116 } 108 }
117 109
118 if (handle_unknown) { 110 if (handle_unknown) {
119 pr_debug("doing %s: %s='%s'\n", doing, param, val); 111 DEBUGP("Unknown argument: calling %p\n", handle_unknown);
120 return handle_unknown(param, val, doing); 112 return handle_unknown(param, val);
121 } 113 }
122 114
123 pr_debug("Unknown argument '%s'\n", param); 115 DEBUGP("Unknown argument `%s'\n", param);
124 return -ENOENT; 116 return -ENOENT;
125} 117}
126 118
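
The parameq()/dash2underscore() pair restored earlier in this file's diff decides whether a user-supplied option name matches a registered parameter, folding '-' in the input to '_' so "my-param=1" matches a parameter declared as my_param. A self-contained userspace sketch of that comparison (re-implementation for illustration only):

#include <stdio.h>

static char dash2underscore(char c)
{
	return c == '-' ? '_' : c;
}

static int parameq(const char *input, const char *paramname)
{
	unsigned int i;

	for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
		if (input[i] == '\0')
			return 1;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       parameq("my-param", "my_param"),   /* 1: dash folded to underscore */
	       parameq("my_param", "my_param"),   /* 1: exact match               */
	       parameq("my_param", "my-param"));  /* 0: only the input is folded  */
	return 0;
}
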
@@ -177,47 +169,46 @@ static char *next_arg(char *args, char **param, char **val)
177} 169}
178 170
179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 171/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
180int parse_args(const char *doing, 172int parse_args(const char *name,
181 char *args, 173 char *args,
182 const struct kernel_param *params, 174 const struct kernel_param *params,
183 unsigned num, 175 unsigned num,
184 s16 min_level, 176 int (*unknown)(char *param, char *val))
185 s16 max_level,
186 int (*unknown)(char *param, char *val, const char *doing))
187{ 177{
188 char *param, *val; 178 char *param, *val;
189 179
180 DEBUGP("Parsing ARGS: %s\n", args);
181
190 /* Chew leading spaces */ 182 /* Chew leading spaces */
191 args = skip_spaces(args); 183 args = skip_spaces(args);
192 184
193 if (*args)
194 pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
195
196 while (*args) { 185 while (*args) {
197 int ret; 186 int ret;
198 int irq_was_disabled; 187 int irq_was_disabled;
199 188
200 args = next_arg(args, &param, &val); 189 args = next_arg(args, &param, &val);
201 irq_was_disabled = irqs_disabled(); 190 irq_was_disabled = irqs_disabled();
202 ret = parse_one(param, val, doing, params, num, 191 ret = parse_one(param, val, params, num, unknown);
203 min_level, max_level, unknown); 192 if (irq_was_disabled && !irqs_disabled()) {
204 if (irq_was_disabled && !irqs_disabled()) 193 printk(KERN_WARNING "parse_args(): option '%s' enabled "
205 pr_warn("%s: option '%s' enabled irq's!\n", 194 "irq's!\n", param);
206 doing, param); 195 }
207
208 switch (ret) { 196 switch (ret) {
209 case -ENOENT: 197 case -ENOENT:
210 pr_err("%s: Unknown parameter `%s'\n", doing, param); 198 printk(KERN_ERR "%s: Unknown parameter `%s'\n",
199 name, param);
211 return ret; 200 return ret;
212 case -ENOSPC: 201 case -ENOSPC:
213 pr_err("%s: `%s' too large for parameter `%s'\n", 202 printk(KERN_ERR
214 doing, val ?: "", param); 203 "%s: `%s' too large for parameter `%s'\n",
204 name, val ?: "", param);
215 return ret; 205 return ret;
216 case 0: 206 case 0:
217 break; 207 break;
218 default: 208 default:
219 pr_err("%s: `%s' invalid for parameter `%s'\n", 209 printk(KERN_ERR
220 doing, val ?: "", param); 210 "%s: `%s' invalid for parameter `%s'\n",
211 name, val ?: "", param);
221 return ret; 212 return ret;
222 } 213 }
223 } 214 }
@@ -263,7 +254,8 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
263int param_set_charp(const char *val, const struct kernel_param *kp) 254int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 255{
265 if (strlen(val) > 1024) { 256 if (strlen(val) > 1024) {
266 pr_err("%s: string parameter too long\n", kp->name); 257 printk(KERN_ERR "%s: string parameter too long\n",
258 kp->name);
267 return -ENOSPC; 259 return -ENOSPC;
268 } 260 }
269 261
@@ -304,18 +296,35 @@ EXPORT_SYMBOL(param_ops_charp);
304/* Actually could be a bool or an int, for historical reasons. */ 296/* Actually could be a bool or an int, for historical reasons. */
305int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
306{ 298{
299 bool v;
300 int ret;
301
307 /* No equals means "set"... */ 302 /* No equals means "set"... */
308 if (!val) val = "1"; 303 if (!val) val = "1";
309 304
310 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
311 return strtobool(val, kp->arg); 306 ret = strtobool(val, &v);
307 if (ret)
308 return ret;
309
310 if (kp->flags & KPARAM_ISBOOL)
311 *(bool *)kp->arg = v;
312 else
313 *(int *)kp->arg = v;
314 return 0;
312} 315}
313EXPORT_SYMBOL(param_set_bool); 316EXPORT_SYMBOL(param_set_bool);
314 317
315int param_get_bool(char *buffer, const struct kernel_param *kp) 318int param_get_bool(char *buffer, const struct kernel_param *kp)
316{ 319{
320 bool val;
321 if (kp->flags & KPARAM_ISBOOL)
322 val = *(bool *)kp->arg;
323 else
324 val = *(int *)kp->arg;
325
317 /* Y and N chosen as being relatively non-coder friendly */ 326 /* Y and N chosen as being relatively non-coder friendly */
318 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); 327 return sprintf(buffer, "%c", val ? 'Y' : 'N');
319} 328}
320EXPORT_SYMBOL(param_get_bool); 329EXPORT_SYMBOL(param_get_bool);
321 330
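
The param_set_bool()/param_get_bool() hunk above returns to the flag-driven storage model: KPARAM_ISBOOL decides whether kp->arg points at a real bool or at a historical int-backed bool. The sketch below is a hedged userspace model of that idea; struct kparam, the KPARAM_ISBOOL value and strtobool_sketch() are simplified stand-ins, not the kernel's types.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define KPARAM_ISBOOL 2          /* illustrative flag value */

struct kparam { unsigned flags; void *arg; };

static int strtobool_sketch(const char *s, bool *res)
{
	if (!strcmp(s, "1") || !strcmp(s, "y") || !strcmp(s, "Y"))
		*res = true;
	else if (!strcmp(s, "0") || !strcmp(s, "n") || !strcmp(s, "N"))
		*res = false;
	else
		return -1;
	return 0;
}

static int set_bool(const char *val, struct kparam *kp)
{
	bool v;

	if (!val)
		val = "1";               /* "no equals" means "set" */
	if (strtobool_sketch(val, &v))
		return -1;
	if (kp->flags & KPARAM_ISBOOL)
		*(bool *)kp->arg = v;    /* genuine bool parameter   */
	else
		*(int *)kp->arg = v;     /* historical int-backed bool */
	return 0;
}

int main(void)
{
	bool b = false;
	int i = 0;
	struct kparam kb = { KPARAM_ISBOOL, &b }, ki = { 0, &i };

	set_bool("y", &kb);
	set_bool(NULL, &ki);
	printf("b=%d i=%d\n", b, i);     /* prints "b=1 i=1" */
	return 0;
}
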
@@ -333,6 +342,7 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
333 struct kernel_param dummy; 342 struct kernel_param dummy;
334 343
335 dummy.arg = &boolval; 344 dummy.arg = &boolval;
345 dummy.flags = KPARAM_ISBOOL;
336 ret = param_set_bool(val, &dummy); 346 ret = param_set_bool(val, &dummy);
337 if (ret == 0) 347 if (ret == 0)
338 *(bool *)kp->arg = !boolval; 348 *(bool *)kp->arg = !boolval;
@@ -352,36 +362,13 @@ struct kernel_param_ops param_ops_invbool = {
352}; 362};
353EXPORT_SYMBOL(param_ops_invbool); 363EXPORT_SYMBOL(param_ops_invbool);
354 364
355int param_set_bint(const char *val, const struct kernel_param *kp)
356{
357 struct kernel_param boolkp;
358 bool v;
359 int ret;
360
361 /* Match bool exactly, by re-using it. */
362 boolkp = *kp;
363 boolkp.arg = &v;
364
365 ret = param_set_bool(val, &boolkp);
366 if (ret == 0)
367 *(int *)kp->arg = v;
368 return ret;
369}
370EXPORT_SYMBOL(param_set_bint);
371
372struct kernel_param_ops param_ops_bint = {
373 .set = param_set_bint,
374 .get = param_get_int,
375};
376EXPORT_SYMBOL(param_ops_bint);
377
378/* We break the rule and mangle the string. */ 365/* We break the rule and mangle the string. */
379static int param_array(const char *name, 366static int param_array(const char *name,
380 const char *val, 367 const char *val,
381 unsigned int min, unsigned int max, 368 unsigned int min, unsigned int max,
382 void *elem, int elemsize, 369 void *elem, int elemsize,
383 int (*set)(const char *, const struct kernel_param *kp), 370 int (*set)(const char *, const struct kernel_param *kp),
384 s16 level, 371 u16 flags,
385 unsigned int *num) 372 unsigned int *num)
386{ 373{
387 int ret; 374 int ret;
@@ -391,7 +378,7 @@ static int param_array(const char *name,
391 /* Get the name right for errors. */ 378 /* Get the name right for errors. */
392 kp.name = name; 379 kp.name = name;
393 kp.arg = elem; 380 kp.arg = elem;
394 kp.level = level; 381 kp.flags = flags;
395 382
396 *num = 0; 383 *num = 0;
397 /* We expect a comma-separated list of values. */ 384 /* We expect a comma-separated list of values. */
@@ -399,7 +386,8 @@ static int param_array(const char *name,
399 int len; 386 int len;
400 387
401 if (*num == max) { 388 if (*num == max) {
402 pr_err("%s: can only take %i arguments\n", name, max); 389 printk(KERN_ERR "%s: can only take %i arguments\n",
390 name, max);
403 return -EINVAL; 391 return -EINVAL;
404 } 392 }
405 len = strcspn(val, ","); 393 len = strcspn(val, ",");
@@ -418,7 +406,8 @@ static int param_array(const char *name,
418 } while (save == ','); 406 } while (save == ',');
419 407
420 if (*num < min) { 408 if (*num < min) {
421 pr_err("%s: needs at least %i arguments\n", name, min); 409 printk(KERN_ERR "%s: needs at least %i arguments\n",
410 name, min);
422 return -EINVAL; 411 return -EINVAL;
423 } 412 }
424 return 0; 413 return 0;
@@ -430,7 +419,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
430 unsigned int temp_num; 419 unsigned int temp_num;
431 420
432 return param_array(kp->name, val, 1, arr->max, arr->elem, 421 return param_array(kp->name, val, 1, arr->max, arr->elem,
433 arr->elemsize, arr->ops->set, kp->level, 422 arr->elemsize, arr->ops->set, kp->flags,
434 arr->num ?: &temp_num); 423 arr->num ?: &temp_num);
435} 424}
436 425
@@ -477,7 +466,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
477 const struct kparam_string *kps = kp->str; 466 const struct kparam_string *kps = kp->str;
478 467
479 if (strlen(val)+1 > kps->maxlen) { 468 if (strlen(val)+1 > kps->maxlen) {
480 pr_err("%s: string doesn't fit in %u chars.\n", 469 printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
481 kp->name, kps->maxlen-1); 470 kp->name, kps->maxlen-1);
482 return -ENOSPC; 471 return -ENOSPC;
483 } 472 }
@@ -747,8 +736,11 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
747#endif 736#endif
748 if (err) { 737 if (err) {
749 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
750 pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", 739 printk(KERN_ERR
740 "Module '%s' failed add to sysfs, error number %d\n",
751 name, err); 741 name, err);
742 printk(KERN_ERR
743 "The system will be unstable now.\n");
752 return NULL; 744 return NULL;
753 } 745 }
754 746
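
As a reading aid for the param_array() hunks in this file: the comma-separated value string is walked with strcspn() and each element is handed to the per-type setter, with too-long and too-short lists rejected. A rough standalone sketch follows; the real code mangles the string in place and reports errors with printk, and set_elem() here is a placeholder for ops->set().

#include <stdio.h>
#include <string.h>

static int set_elem(const char *val, size_t len)
{
	printf("element: %.*s\n", (int)len, val);
	return 0;
}

static int parse_array(const char *val, unsigned min, unsigned max)
{
	unsigned num = 0;
	char save;

	do {
		size_t len = strcspn(val, ",");

		if (num == max)
			return -1;       /* "can only take %i arguments" */
		save = val[len];
		if (set_elem(val, len))
			return -1;
		num++;
		val += len + 1;
	} while (save == ',');

	if (num < min)
		return -1;               /* "needs at least %i arguments" */
	return 0;
}

int main(void)
{
	return parse_array("1,2,3", 1, 4);   /* prints three elements */
}
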
diff --git a/kernel/pid.c b/kernel/pid.c
index de9af600006..e432057f3b2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 Nadia Yvette Chambers, IBM 4 * (C) 2002-2003 William Irwin, IBM
5 * (C) 2004 Nadia Yvette Chambers, Oracle 5 * (C) 2004 William Irwin, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/export.h> 30#include <linux/module.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h> 33#include <linux/rculist.h>
@@ -36,7 +36,6 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
40 39
41#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -79,11 +78,24 @@ struct pid_namespace init_pid_ns = {
79 .last_pid = 0, 78 .last_pid = 0,
80 .level = 0, 79 .level = 0,
81 .child_reaper = &init_task, 80 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
84}; 81};
85EXPORT_SYMBOL_GPL(init_pid_ns); 82EXPORT_SYMBOL_GPL(init_pid_ns);
86 83
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
87/* 99/*
88 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
89 * interrupt might come in and do read_lock(&tasklist_lock). 101 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -125,9 +137,7 @@ static int pid_before(int base, int a, int b)
125} 137}
126 138
127/* 139/*
128 * We might be racing with someone else trying to set pid_ns->last_pid 140 * We might be racing with someone else trying to set pid_ns->last_pid.
129 * at the pid allocation time (there's also a sysctl for this, but racing
130 * with this one is OK, see comment in kernel/pid_namespace.c about it).
131 * We want the winner to have the "later" value, because if the 141 * We want the winner to have the "later" value, because if the
132 * "earlier" value prevails, then a pid may get reused immediately. 142 * "earlier" value prevails, then a pid may get reused immediately.
133 * 143 *
@@ -257,23 +267,8 @@ void free_pid(struct pid *pid)
257 unsigned long flags; 267 unsigned long flags;
258 268
259 spin_lock_irqsave(&pidmap_lock, flags); 269 spin_lock_irqsave(&pidmap_lock, flags);
260 for (i = 0; i <= pid->level; i++) { 270 for (i = 0; i <= pid->level; i++)
261 struct upid *upid = pid->numbers + i; 271 hlist_del_rcu(&pid->numbers[i].pid_chain);
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
266 /* When all that is left in the pid namespace
267 * is the reaper wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 schedule_work(&ns->proc_work);
274 break;
275 }
276 }
277 spin_unlock_irqrestore(&pidmap_lock, flags); 272 spin_unlock_irqrestore(&pidmap_lock, flags);
278 273
279 for (i = 0; i <= pid->level; i++) 274 for (i = 0; i <= pid->level; i++)
@@ -295,7 +290,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
295 goto out; 290 goto out;
296 291
297 tmp = ns; 292 tmp = ns;
298 pid->level = ns->level;
299 for (i = ns->level; i >= 0; i--) { 293 for (i = ns->level; i >= 0; i--) {
300 nr = alloc_pidmap(tmp); 294 nr = alloc_pidmap(tmp);
301 if (nr < 0) 295 if (nr < 0)
@@ -306,32 +300,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
306 tmp = tmp->parent; 300 tmp = tmp->parent;
307 } 301 }
308 302
309 if (unlikely(is_child_reaper(pid))) {
310 if (pid_ns_prepare_proc(ns))
311 goto out_free;
312 }
313
314 get_pid_ns(ns); 303 get_pid_ns(ns);
304 pid->level = ns->level;
315 atomic_set(&pid->count, 1); 305 atomic_set(&pid->count, 1);
316 for (type = 0; type < PIDTYPE_MAX; ++type) 306 for (type = 0; type < PIDTYPE_MAX; ++type)
317 INIT_HLIST_HEAD(&pid->tasks[type]); 307 INIT_HLIST_HEAD(&pid->tasks[type]);
318 308
319 upid = pid->numbers + ns->level; 309 upid = pid->numbers + ns->level;
320 spin_lock_irq(&pidmap_lock); 310 spin_lock_irq(&pidmap_lock);
321 if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) 311 for ( ; upid >= pid->numbers; --upid)
322 goto out_unlock;
323 for ( ; upid >= pid->numbers; --upid) {
324 hlist_add_head_rcu(&upid->pid_chain, 312 hlist_add_head_rcu(&upid->pid_chain,
325 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 313 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
326 upid->ns->nr_hashed++;
327 }
328 spin_unlock_irq(&pidmap_lock); 314 spin_unlock_irq(&pidmap_lock);
329 315
330out: 316out:
331 return pid; 317 return pid;
332 318
333out_unlock:
334 spin_unlock(&pidmap_lock);
335out_free: 319out_free:
336 while (++i <= ns->level) 320 while (++i <= ns->level)
337 free_pidmap(pid->numbers + i); 321 free_pidmap(pid->numbers + i);
@@ -341,13 +325,6 @@ out_free:
341 goto out; 325 goto out;
342} 326}
343 327
344void disable_pid_allocation(struct pid_namespace *ns)
345{
346 spin_lock_irq(&pidmap_lock);
347 ns->nr_hashed &= ~PIDNS_HASH_ADDING;
348 spin_unlock_irq(&pidmap_lock);
349}
350
351struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 328struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
352{ 329{
353 struct hlist_node *elem; 330 struct hlist_node *elem;
@@ -365,7 +342,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
365 342
366struct pid *find_vpid(int nr) 343struct pid *find_vpid(int nr)
367{ 344{
368 return find_pid_ns(nr, task_active_pid_ns(current)); 345 return find_pid_ns(nr, current->nsproxy->pid_ns);
369} 346}
370EXPORT_SYMBOL_GPL(find_vpid); 347EXPORT_SYMBOL_GPL(find_vpid);
371 348
@@ -441,15 +418,13 @@ EXPORT_SYMBOL(pid_task);
441 */ 418 */
442struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
443{ 420{
444 rcu_lockdep_assert(rcu_read_lock_held(), 421 rcu_lockdep_assert(rcu_read_lock_held());
445 "find_task_by_pid_ns() needs rcu_read_lock()"
446 " protection");
447 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
448} 423}
449 424
450struct task_struct *find_task_by_vpid(pid_t vnr) 425struct task_struct *find_task_by_vpid(pid_t vnr)
451{ 426{
452 return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); 427 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
453} 428}
454 429
455struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 430struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -500,11 +475,10 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
500 } 475 }
501 return nr; 476 return nr;
502} 477}
503EXPORT_SYMBOL_GPL(pid_nr_ns);
504 478
505pid_t pid_vnr(struct pid *pid) 479pid_t pid_vnr(struct pid *pid)
506{ 480{
507 return pid_nr_ns(pid, task_active_pid_ns(current)); 481 return pid_nr_ns(pid, current->nsproxy->pid_ns);
508} 482}
509EXPORT_SYMBOL_GPL(pid_vnr); 483EXPORT_SYMBOL_GPL(pid_vnr);
510 484
@@ -515,7 +489,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
515 489
516 rcu_read_lock(); 490 rcu_read_lock();
517 if (!ns) 491 if (!ns)
518 ns = task_active_pid_ns(current); 492 ns = current->nsproxy->pid_ns;
519 if (likely(pid_alive(task))) { 493 if (likely(pid_alive(task))) {
520 if (type != PIDTYPE_PID) 494 if (type != PIDTYPE_PID)
521 task = task->group_leader; 495 task = task->group_leader;
@@ -565,13 +539,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
565 */ 539 */
566void __init pidhash_init(void) 540void __init pidhash_init(void)
567{ 541{
568 unsigned int i, pidhash_size; 542 int i, pidhash_size;
569 543
570 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 544 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
571 HASH_EARLY | HASH_SMALL, 545 HASH_EARLY | HASH_SMALL,
572 &pidhash_shift, NULL, 546 &pidhash_shift, NULL, 4096);
573 0, 4096); 547 pidhash_size = 1 << pidhash_shift;
574 pidhash_size = 1U << pidhash_shift;
575 548
576 for (i = 0; i < pidhash_size; i++) 549 for (i = 0; i < pidhash_size; i++)
577 INIT_HLIST_HEAD(&pid_hash[i]); 550 INIT_HLIST_HEAD(&pid_hash[i]);
@@ -579,9 +552,6 @@ void __init pidhash_init(void)
579 552
580void __init pidmap_init(void) 553void __init pidmap_init(void)
581{ 554{
582 /* Veryify no one has done anything silly */
583 BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
584
585 /* bump default and minimum pid_max based on number of cpus */ 555 /* bump default and minimum pid_max based on number of cpus */
586 pid_max = min(pid_max_max, max_t(int, pid_max, 556 pid_max = min(pid_max_max, max_t(int, pid_max,
587 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 557 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -593,7 +563,6 @@ void __init pidmap_init(void)
593 /* Reserve PID 0. We never call free_pidmap(0) */ 563 /* Reserve PID 0. We never call free_pidmap(0) */
594 set_bit(0, init_pid_ns.pidmap[0].page); 564 set_bit(0, init_pid_ns.pidmap[0].page);
595 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 565 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
596 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
597 566
598 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 567 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
599 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 568 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
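
For orientation while reading the pid.c hunks: pid_hashfn() (unchanged by this diff) buckets a (pid number, namespace pointer) pair into the table sized in pidhash_init(). The sketch below models that with a generic Fibonacci hash; hash_sketch() and its constant are illustrative and are not the kernel's hash_long() implementation.

#include <stdint.h>
#include <stdio.h>

static unsigned long long hash_sketch(unsigned long long val, unsigned bits)
{
	/* Fibonacci hashing: multiply by ~2^64/phi, keep the top 'bits' bits. */
	return (val * 0x9E3779B97F4A7C15ULL) >> (64 - bits);
}

static unsigned long long pid_hashfn_sketch(unsigned long nr, const void *ns,
					    unsigned pidhash_shift)
{
	return hash_sketch(nr + (uintptr_t)ns, pidhash_shift);
}

int main(void)
{
	static int fake_ns;              /* stand-in for &init_pid_ns       */
	unsigned pidhash_shift = 4;      /* 1 << 4 = 16 buckets for the demo */

	for (unsigned long nr = 1; nr <= 4; nr++)
		printf("pid %lu -> bucket %llu\n", nr,
		       pid_hashfn_sketch(nr, &fake_ns, pidhash_shift));
	return 0;
}
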
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c602..e9c9adc84ca 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,14 +10,11 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
14#include <linux/syscalls.h> 13#include <linux/syscalls.h>
15#include <linux/err.h> 14#include <linux/err.h>
16#include <linux/acct.h> 15#include <linux/acct.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
19#include <linux/reboot.h>
20#include <linux/export.h>
21 18
22#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
23 20
@@ -72,29 +69,12 @@ err_alloc:
72 return NULL; 69 return NULL;
73} 70}
74 71
75static void proc_cleanup_work(struct work_struct *work) 72static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
82#define MAX_PID_NS_LEVEL 32
83
84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
86{ 73{
87 struct pid_namespace *ns; 74 struct pid_namespace *ns;
88 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
89 int i; 76 int i, err = -ENOMEM;
90 int err;
91 77
92 if (level > MAX_PID_NS_LEVEL) {
93 err = -EINVAL;
94 goto out;
95 }
96
97 err = -ENOMEM;
98 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
99 if (ns == NULL) 79 if (ns == NULL)
100 goto out; 80 goto out;
@@ -107,16 +87,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
107 if (ns->pid_cachep == NULL) 87 if (ns->pid_cachep == NULL)
108 goto out_free_map; 88 goto out_free_map;
109 89
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
114 kref_init(&ns->kref); 90 kref_init(&ns->kref);
115 ns->level = level; 91 ns->level = level;
116 ns->parent = get_pid_ns(parent_pid_ns); 92 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 ns->nr_hashed = PIDNS_HASH_ADDING;
119 INIT_WORK(&ns->proc_work, proc_cleanup_work);
120 93
121 set_bit(0, ns->pidmap[0].page); 94 set_bit(0, ns->pidmap[0].page);
122 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 95 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -124,8 +97,14 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
124 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
125 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
126 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
127 return ns; 104 return ns;
128 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
129out_free_map: 108out_free_map:
130 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
131out_free: 110out_free:
@@ -138,57 +117,38 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
138{ 117{
139 int i; 118 int i;
140 119
141 proc_free_inum(ns->proc_inum);
142 for (i = 0; i < PIDMAP_ENTRIES; i++) 120 for (i = 0; i < PIDMAP_ENTRIES; i++)
143 kfree(ns->pidmap[i].page); 121 kfree(ns->pidmap[i].page);
144 put_user_ns(ns->user_ns);
145 kmem_cache_free(pid_ns_cachep, ns); 122 kmem_cache_free(pid_ns_cachep, ns);
146} 123}
147 124
148struct pid_namespace *copy_pid_ns(unsigned long flags, 125struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
149 struct user_namespace *user_ns, struct pid_namespace *old_ns)
150{ 126{
151 if (!(flags & CLONE_NEWPID)) 127 if (!(flags & CLONE_NEWPID))
152 return get_pid_ns(old_ns); 128 return get_pid_ns(old_ns);
153 if (task_active_pid_ns(current) != old_ns) 129 if (flags & (CLONE_THREAD|CLONE_PARENT))
154 return ERR_PTR(-EINVAL); 130 return ERR_PTR(-EINVAL);
155 return create_pid_namespace(user_ns, old_ns); 131 return create_pid_namespace(old_ns);
156} 132}
157 133
158static void free_pid_ns(struct kref *kref) 134void free_pid_ns(struct kref *kref)
159{ 135{
160 struct pid_namespace *ns; 136 struct pid_namespace *ns, *parent;
161 137
162 ns = container_of(kref, struct pid_namespace, kref); 138 ns = container_of(kref, struct pid_namespace, kref);
163 destroy_pid_namespace(ns);
164}
165 139
166void put_pid_ns(struct pid_namespace *ns) 140 parent = ns->parent;
167{ 141 destroy_pid_namespace(ns);
168 struct pid_namespace *parent;
169 142
170 while (ns != &init_pid_ns) { 143 if (parent != NULL)
171 parent = ns->parent; 144 put_pid_ns(parent);
172 if (!kref_put(&ns->kref, free_pid_ns))
173 break;
174 ns = parent;
175 }
176} 145}
177EXPORT_SYMBOL_GPL(put_pid_ns);
178 146
179void zap_pid_ns_processes(struct pid_namespace *pid_ns) 147void zap_pid_ns_processes(struct pid_namespace *pid_ns)
180{ 148{
181 int nr; 149 int nr;
182 int rc; 150 int rc;
183 struct task_struct *task, *me = current; 151 struct task_struct *task;
184
185 /* Don't allow any more processes into the pid namespace */
186 disable_pid_allocation(pid_ns);
187
188 /* Ignore SIGCHLD causing any terminated children to autoreap */
189 spin_lock_irq(&me->sighand->siglock);
190 me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
191 spin_unlock_irq(&me->sighand->siglock);
192 152
193 /* 153 /*
194 * The last thread in the cgroup-init thread group is terminating. 154 * The last thread in the cgroup-init thread group is terminating.
@@ -208,9 +168,13 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
208 while (nr > 0) { 168 while (nr > 0) {
209 rcu_read_lock(); 169 rcu_read_lock();
210 170
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
211 task = pid_task(find_vpid(nr), PIDTYPE_PID); 175 task = pid_task(find_vpid(nr), PIDTYPE_PID);
212 if (task && !__fatal_signal_pending(task)) 176 if (task)
213 send_sig_info(SIGKILL, SEND_SIG_FORCED, task); 177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
214 178
215 rcu_read_unlock(); 179 rcu_read_unlock();
216 180
@@ -218,165 +182,18 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
218 } 182 }
219 read_unlock(&tasklist_lock); 183 read_unlock(&tasklist_lock);
220 184
221 /* Firstly reap the EXIT_ZOMBIE children we may have. */
222 do { 185 do {
223 clear_thread_flag(TIF_SIGPENDING); 186 clear_thread_flag(TIF_SIGPENDING);
224 rc = sys_wait4(-1, NULL, __WALL, NULL); 187 rc = sys_wait4(-1, NULL, __WALL, NULL);
225 } while (rc != -ECHILD); 188 } while (rc != -ECHILD);
226 189
227 /*
228 * sys_wait4() above can't reap the TASK_DEAD children.
229 * Make sure they all go away, see free_pid().
230 */
231 for (;;) {
232 set_current_state(TASK_UNINTERRUPTIBLE);
233 if (pid_ns->nr_hashed == 1)
234 break;
235 schedule();
236 }
237 __set_current_state(TASK_RUNNING);
238
239 if (pid_ns->reboot)
240 current->signal->group_exit_code = pid_ns->reboot;
241
242 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
243 return; 191 return;
244} 192}
245 193
246#ifdef CONFIG_CHECKPOINT_RESTORE
247static int pid_ns_ctl_handler(struct ctl_table *table, int write,
248 void __user *buffer, size_t *lenp, loff_t *ppos)
249{
250 struct pid_namespace *pid_ns = task_active_pid_ns(current);
251 struct ctl_table tmp = *table;
252
253 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
254 return -EPERM;
255
256 /*
257 * Writing directly to ns' last_pid field is OK, since this field
258 * is volatile in a living namespace anyway and a code writing to
259 * it should synchronize its usage with external means.
260 */
261
262 tmp.data = &pid_ns->last_pid;
263 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
264}
265
266extern int pid_max;
267static int zero = 0;
268static struct ctl_table pid_ns_ctl_table[] = {
269 {
270 .procname = "ns_last_pid",
271 .maxlen = sizeof(int),
272 .mode = 0666, /* permissions are checked in the handler */
273 .proc_handler = pid_ns_ctl_handler,
274 .extra1 = &zero,
275 .extra2 = &pid_max,
276 },
277 { }
278};
279static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
280#endif /* CONFIG_CHECKPOINT_RESTORE */
281
282int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
283{
284 if (pid_ns == &init_pid_ns)
285 return 0;
286
287 switch (cmd) {
288 case LINUX_REBOOT_CMD_RESTART2:
289 case LINUX_REBOOT_CMD_RESTART:
290 pid_ns->reboot = SIGHUP;
291 break;
292
293 case LINUX_REBOOT_CMD_POWER_OFF:
294 case LINUX_REBOOT_CMD_HALT:
295 pid_ns->reboot = SIGINT;
296 break;
297 default:
298 return -EINVAL;
299 }
300
301 read_lock(&tasklist_lock);
302 force_sig(SIGKILL, pid_ns->child_reaper);
303 read_unlock(&tasklist_lock);
304
305 do_exit(0);
306
307 /* Not reached */
308 return 0;
309}
310
311static void *pidns_get(struct task_struct *task)
312{
313 struct pid_namespace *ns;
314
315 rcu_read_lock();
316 ns = get_pid_ns(task_active_pid_ns(task));
317 rcu_read_unlock();
318
319 return ns;
320}
321
322static void pidns_put(void *ns)
323{
324 put_pid_ns(ns);
325}
326
327static int pidns_install(struct nsproxy *nsproxy, void *ns)
328{
329 struct pid_namespace *active = task_active_pid_ns(current);
330 struct pid_namespace *ancestor, *new = ns;
331
332 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
333 !nsown_capable(CAP_SYS_ADMIN))
334 return -EPERM;
335
336 /*
337 * Only allow entering the current active pid namespace
338 * or a child of the current active pid namespace.
339 *
340 * This is required for fork to return a usable pid value and
341 * this maintains the property that processes and their
342 * children can not escape their current pid namespace.
343 */
344 if (new->level < active->level)
345 return -EINVAL;
346
347 ancestor = new;
348 while (ancestor->level > active->level)
349 ancestor = ancestor->parent;
350 if (ancestor != active)
351 return -EINVAL;
352
353 put_pid_ns(nsproxy->pid_ns);
354 nsproxy->pid_ns = get_pid_ns(new);
355 return 0;
356}
357
358static unsigned int pidns_inum(void *ns)
359{
360 struct pid_namespace *pid_ns = ns;
361 return pid_ns->proc_inum;
362}
363
364const struct proc_ns_operations pidns_operations = {
365 .name = "pid",
366 .type = CLONE_NEWPID,
367 .get = pidns_get,
368 .put = pidns_put,
369 .install = pidns_install,
370 .inum = pidns_inum,
371};
372
373static __init int pid_namespaces_init(void) 194static __init int pid_namespaces_init(void)
374{ 195{
375 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
376
377#ifdef CONFIG_CHECKPOINT_RESTORE
378 register_sysctl_paths(kern_path, pid_ns_ctl_table);
379#endif
380 return 0; 197 return 0;
381} 198}
382 199
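
Both sides of the free_pid_ns()/put_pid_ns() hunk above implement the same ownership rule: dropping the last reference to a pid namespace also drops the reference it holds on its parent, so destruction can ripple up the chain. Below is a simplified userspace model of that pattern, with plain integer refcounts and a made-up struct ns instead of the kernel's kref API.

#include <stdio.h>
#include <stdlib.h>

struct ns {
	int refcount;
	int level;
	struct ns *parent;
};

static void put_ns(struct ns *ns)
{
	while (ns && --ns->refcount == 0) {
		struct ns *parent = ns->parent;

		printf("destroying namespace at level %d\n", ns->level);
		free(ns);
		ns = parent;            /* drop the reference the child held */
	}
}

static struct ns *make_child(struct ns *parent)
{
	struct ns *ns = malloc(sizeof(*ns));

	ns->refcount = 1;
	ns->level = parent ? parent->level + 1 : 0;
	ns->parent = parent;
	if (parent)
		parent->refcount++;     /* a child pins its parent */
	return ns;
}

int main(void)
{
	struct ns *root = make_child(NULL);
	struct ns *child = make_child(root);
	struct ns *grandchild = make_child(child);

	put_ns(grandchild);   /* frees level 2, drops child to refcount 1 */
	put_ns(child);        /* frees level 1, drops root to refcount 1  */
	put_ns(root);         /* frees level 0                            */
	return 0;
}
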
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d..640ded8f5c4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,7 +9,6 @@
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h>
13 12
14/* 13/*
15 * Called after updating RLIMIT_CPU to run cpu timer and update 14 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -79,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
79 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
80 return now.sched < then.sched; 79 return now.sched < then.sched;
81 } else { 80 } else {
82 return now.cpu < then.cpu; 81 return cputime_lt(now.cpu, then.cpu);
83 } 82 }
84} 83}
85static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -89,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
89 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
90 acc->sched += val.sched; 89 acc->sched += val.sched;
91 } else { 90 } else {
92 acc->cpu += val.cpu; 91 acc->cpu = cputime_add(acc->cpu, val.cpu);
93 } 92 }
94} 93}
95static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -99,12 +98,25 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
99 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
100 a.sched -= b.sched; 99 a.sched -= b.sched;
101 } else { 100 } else {
102 a.cpu -= b.cpu; 101 a.cpu = cputime_sub(a.cpu, b.cpu);
103 } 102 }
104 return a; 103 return a;
105} 104}
106 105
107/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
108 * Update expiry time from increment, and increase overrun count, 120 * Update expiry time from increment, and increase overrun count,
109 * given the current clock sample. 121 * given the current clock sample.
110 */ 122 */
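
cputime_div_non_zero(), added in the hunk above, is simply an integer division clamped to a minimum of 1, so that repeatedly splitting a time budget can never round down to zero and starve signal delivery. A tiny sketch with plain integers standing in for cputime_t:

#include <stdio.h>

static unsigned long div_non_zero(unsigned long time, unsigned long div)
{
	unsigned long res = time / div;

	return res ? res : 1;           /* never return 0 */
}

int main(void)
{
	printf("%lu %lu\n", div_non_zero(100, 7), div_non_zero(3, 7));
	/* prints "14 1": the second division would otherwise round to 0 */
	return 0;
}
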
@@ -136,26 +148,28 @@ static void bump_cpu_timer(struct k_itimer *timer,
136 } else { 148 } else {
137 cputime_t delta, incr; 149 cputime_t delta, incr;
138 150
139 if (now.cpu < timer->it.cpu.expires.cpu) 151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
140 return; 152 return;
141 incr = timer->it.cpu.incr.cpu; 153 incr = timer->it.cpu.incr.cpu;
142 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 154 delta = cputime_sub(cputime_add(now.cpu, incr),
155 timer->it.cpu.expires.cpu);
143 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 156 /* Don't use (incr*2 < delta), incr*2 might overflow. */
144 for (i = 0; incr < delta - incr; i++) 157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
145 incr += incr; 158 incr = cputime_add(incr, incr);
146 for (; i >= 0; incr = incr >> 1, i--) { 159 for (; i >= 0; incr = cputime_halve(incr), i--) {
147 if (delta < incr) 160 if (cputime_lt(delta, incr))
148 continue; 161 continue;
149 timer->it.cpu.expires.cpu += incr; 162 timer->it.cpu.expires.cpu =
163 cputime_add(timer->it.cpu.expires.cpu, incr);
150 timer->it_overrun += 1 << i; 164 timer->it_overrun += 1 << i;
151 delta -= incr; 165 delta = cputime_sub(delta, incr);
152 } 166 }
153 } 167 }
154} 168}
155 169
156static inline cputime_t prof_ticks(struct task_struct *p) 170static inline cputime_t prof_ticks(struct task_struct *p)
157{ 171{
158 return p->utime + p->stime; 172 return cputime_add(p->utime, p->stime);
159} 173}
160static inline cputime_t virt_ticks(struct task_struct *p) 174static inline cputime_t virt_ticks(struct task_struct *p)
161{ 175{
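
The heart of the bump_cpu_timer() hunk above is the overrun arithmetic: advance a periodic timer's expiry past "now" in power-of-two multiples of the increment, accumulating the overrun count, without a multiplication that could overflow (hence the "Don't use (incr*2 < delta)" comment). A standalone sketch of that algorithm using plain integers instead of the cputime_t helpers:

#include <stdio.h>

static void bump_timer(unsigned long long *expires, unsigned long long incr,
		       unsigned long long now, int *overrun)
{
	unsigned long long delta;
	int i;

	if (now < *expires)
		return;                          /* not expired yet */

	delta = now + incr - *expires;
	/* Find the largest power-of-two multiple of incr not exceeding delta. */
	for (i = 0; incr < delta - incr; i++)
		incr += incr;
	/* Walk back down, consuming delta in big chunks first. */
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		*expires += incr;
		*overrun += 1 << i;
		delta -= incr;
	}
}

int main(void)
{
	unsigned long long expires = 10, incr = 3, now = 20;
	int overrun = 0;

	bump_timer(&expires, incr, now, &overrun);
	printf("expires=%llu overrun=%d\n", expires, overrun);
	/* prints "expires=22 overrun=4": firings at 10, 13, 16, 19 were missed */
	return 0;
}
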
@@ -218,12 +232,36 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
218 return 0; 232 return 0;
219} 233}
220 234
235void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
236{
237 struct signal_struct *sig = tsk->signal;
238 struct task_struct *t;
239
240 times->utime = sig->utime;
241 times->stime = sig->stime;
242 times->sum_exec_runtime = sig->sum_sched_runtime;
243
244 rcu_read_lock();
245 /* make sure we can trust tsk->thread_group list */
246 if (!likely(pid_alive(tsk)))
247 goto out;
248
249 t = tsk;
250 do {
251 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t);
255out:
256 rcu_read_unlock();
257}
258
221static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
222{ 260{
223 if (b->utime > a->utime) 261 if (cputime_gt(b->utime, a->utime))
224 a->utime = b->utime; 262 a->utime = b->utime;
225 263
226 if (b->stime > a->stime) 264 if (cputime_gt(b->stime, a->stime))
227 a->stime = b->stime; 265 a->stime = b->stime;
228 266
229 if (b->sum_exec_runtime > a->sum_exec_runtime) 267 if (b->sum_exec_runtime > a->sum_exec_runtime)
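
thread_group_cputime(), added back in the hunk above, starts from the totals already accumulated for exited threads in signal_struct and adds the live threads' counters. The sketch below is a simplified model: a plain array stands in for the RCU-protected while_each_thread() walk, and the field names are illustrative.

#include <stdio.h>

struct times { unsigned long long utime, stime, sum_exec_runtime; };

static void group_cputime(const struct times *dead_threads,
			  const struct times *live, int nr_live,
			  struct times *out)
{
	*out = *dead_threads;                /* sig->utime, sig->stime, ... */
	for (int i = 0; i < nr_live; i++) {
		out->utime += live[i].utime;
		out->stime += live[i].stime;
		out->sum_exec_runtime += live[i].sum_exec_runtime;
	}
}

int main(void)
{
	struct times dead = { 100, 40, 500 };
	struct times live[2] = { { 10, 5, 50 }, { 20, 8, 70 } };
	struct times total;

	group_cputime(&dead, live, 2, &total);
	printf("utime=%llu stime=%llu runtime=%llu\n",
	       total.utime, total.stime, total.sum_exec_runtime);
	/* prints "utime=130 stime=53 runtime=620" */
	return 0;
}
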
@@ -244,13 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
244 * it. 282 * it.
245 */ 283 */
246 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
247 raw_spin_lock_irqsave(&cputimer->lock, flags); 285 spin_lock_irqsave(&cputimer->lock, flags);
248 cputimer->running = 1; 286 cputimer->running = 1;
249 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
250 } else 288 } else
251 raw_spin_lock_irqsave(&cputimer->lock, flags); 289 spin_lock_irqsave(&cputimer->lock, flags);
252 *times = cputimer->cputime; 290 *times = cputimer->cputime;
253 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 291 spin_unlock_irqrestore(&cputimer->lock, flags);
254} 292}
255 293
256/* 294/*
@@ -268,7 +306,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
268 return -EINVAL; 306 return -EINVAL;
269 case CPUCLOCK_PROF: 307 case CPUCLOCK_PROF:
270 thread_group_cputime(p, &cputime); 308 thread_group_cputime(p, &cputime);
271 cpu->cpu = cputime.utime + cputime.stime; 309 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
272 break; 310 break;
273 case CPUCLOCK_VIRT: 311 case CPUCLOCK_VIRT:
274 thread_group_cputime(p, &cputime); 312 thread_group_cputime(p, &cputime);
@@ -432,24 +470,26 @@ static void cleanup_timers(struct list_head *head,
432 unsigned long long sum_exec_runtime) 470 unsigned long long sum_exec_runtime)
433{ 471{
434 struct cpu_timer_list *timer, *next; 472 struct cpu_timer_list *timer, *next;
435 cputime_t ptime = utime + stime; 473 cputime_t ptime = cputime_add(utime, stime);
436 474
437 list_for_each_entry_safe(timer, next, head, entry) { 475 list_for_each_entry_safe(timer, next, head, entry) {
438 list_del_init(&timer->entry); 476 list_del_init(&timer->entry);
439 if (timer->expires.cpu < ptime) { 477 if (cputime_lt(timer->expires.cpu, ptime)) {
440 timer->expires.cpu = 0; 478 timer->expires.cpu = cputime_zero;
441 } else { 479 } else {
442 timer->expires.cpu -= ptime; 480 timer->expires.cpu = cputime_sub(timer->expires.cpu,
481 ptime);
443 } 482 }
444 } 483 }
445 484
446 ++head; 485 ++head;
447 list_for_each_entry_safe(timer, next, head, entry) { 486 list_for_each_entry_safe(timer, next, head, entry) {
448 list_del_init(&timer->entry); 487 list_del_init(&timer->entry);
449 if (timer->expires.cpu < utime) { 488 if (cputime_lt(timer->expires.cpu, utime)) {
450 timer->expires.cpu = 0; 489 timer->expires.cpu = cputime_zero;
451 } else { 490 } else {
452 timer->expires.cpu -= utime; 491 timer->expires.cpu = cputime_sub(timer->expires.cpu,
492 utime);
453 } 493 }
454 } 494 }
455 495
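
The cleanup_timers() hunk above rebases each remaining timer by the CPU time already consumed, clamping at zero rather than letting the subtraction underflow. A one-function sketch of that clamp, with plain integers in place of cputime_t:

#include <stdio.h>

static unsigned long long rebase(unsigned long long expires,
				 unsigned long long consumed)
{
	return expires < consumed ? 0 : expires - consumed;
}

int main(void)
{
	printf("%llu %llu\n", rebase(100, 30), rebase(20, 30));  /* "70 0" */
	return 0;
}
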
@@ -471,8 +511,6 @@ static void cleanup_timers(struct list_head *head,
471 */ 511 */
472void posix_cpu_timers_exit(struct task_struct *tsk) 512void posix_cpu_timers_exit(struct task_struct *tsk)
473{ 513{
474 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
475 sizeof(unsigned long long));
476 cleanup_timers(tsk->cpu_timers, 514 cleanup_timers(tsk->cpu_timers,
477 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 515 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
478 516
@@ -482,7 +520,8 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
482 struct signal_struct *const sig = tsk->signal; 520 struct signal_struct *const sig = tsk->signal;
483 521
484 cleanup_timers(tsk->signal->cpu_timers, 522 cleanup_timers(tsk->signal->cpu_timers,
485 tsk->utime + sig->utime, tsk->stime + sig->stime, 523 cputime_add(tsk->utime, sig->utime),
524 cputime_add(tsk->stime, sig->stime),
486 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
487} 526}
488 527
@@ -501,7 +540,8 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
501 540
502static inline int expires_gt(cputime_t expires, cputime_t new_exp) 541static inline int expires_gt(cputime_t expires, cputime_t new_exp)
503{ 542{
504 return expires == 0 || expires > new_exp; 543 return cputime_eq(expires, cputime_zero) ||
544 cputime_gt(expires, new_exp);
505} 545}
506 546
507/* 547/*
@@ -611,7 +651,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
611 default: 651 default:
612 return -EINVAL; 652 return -EINVAL;
613 case CPUCLOCK_PROF: 653 case CPUCLOCK_PROF:
614 cpu->cpu = cputime.utime + cputime.stime; 654 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
615 break; 655 break;
616 case CPUCLOCK_VIRT: 656 case CPUCLOCK_VIRT:
617 cpu->cpu = cputime.utime; 657 cpu->cpu = cputime.utime;
@@ -878,12 +918,12 @@ static void check_thread_timers(struct task_struct *tsk,
878 unsigned long soft; 918 unsigned long soft;
879 919
880 maxfire = 20; 920 maxfire = 20;
881 tsk->cputime_expires.prof_exp = 0; 921 tsk->cputime_expires.prof_exp = cputime_zero;
882 while (!list_empty(timers)) { 922 while (!list_empty(timers)) {
883 struct cpu_timer_list *t = list_first_entry(timers, 923 struct cpu_timer_list *t = list_first_entry(timers,
884 struct cpu_timer_list, 924 struct cpu_timer_list,
885 entry); 925 entry);
886 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { 926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
887 tsk->cputime_expires.prof_exp = t->expires.cpu; 927 tsk->cputime_expires.prof_exp = t->expires.cpu;
888 break; 928 break;
889 } 929 }
@@ -893,12 +933,12 @@ static void check_thread_timers(struct task_struct *tsk,
893 933
894 ++timers; 934 ++timers;
895 maxfire = 20; 935 maxfire = 20;
896 tsk->cputime_expires.virt_exp = 0; 936 tsk->cputime_expires.virt_exp = cputime_zero;
897 while (!list_empty(timers)) { 937 while (!list_empty(timers)) {
898 struct cpu_timer_list *t = list_first_entry(timers, 938 struct cpu_timer_list *t = list_first_entry(timers,
899 struct cpu_timer_list, 939 struct cpu_timer_list,
900 entry); 940 entry);
901 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { 941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
902 tsk->cputime_expires.virt_exp = t->expires.cpu; 942 tsk->cputime_expires.virt_exp = t->expires.cpu;
903 break; 943 break;
904 } 944 }
@@ -959,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig)
959 struct thread_group_cputimer *cputimer = &sig->cputimer; 999 struct thread_group_cputimer *cputimer = &sig->cputimer;
960 unsigned long flags; 1000 unsigned long flags;
961 1001
962 raw_spin_lock_irqsave(&cputimer->lock, flags); 1002 spin_lock_irqsave(&cputimer->lock, flags);
963 cputimer->running = 0; 1003 cputimer->running = 0;
964 raw_spin_unlock_irqrestore(&cputimer->lock, flags); 1004 spin_unlock_irqrestore(&cputimer->lock, flags);
965} 1005}
966 1006
967static u32 onecputick; 1007static u32 onecputick;
@@ -969,19 +1009,20 @@ static u32 onecputick;
969static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
970 cputime_t *expires, cputime_t cur_time, int signo) 1010 cputime_t *expires, cputime_t cur_time, int signo)
971{ 1011{
972 if (!it->expires) 1012 if (cputime_eq(it->expires, cputime_zero))
973 return; 1013 return;
974 1014
975 if (cur_time >= it->expires) { 1015 if (cputime_ge(cur_time, it->expires)) {
976 if (it->incr) { 1016 if (!cputime_eq(it->incr, cputime_zero)) {
977 it->expires += it->incr; 1017 it->expires = cputime_add(it->expires, it->incr);
978 it->error += it->incr_error; 1018 it->error += it->incr_error;
979 if (it->error >= onecputick) { 1019 if (it->error >= onecputick) {
980 it->expires -= cputime_one_jiffy; 1020 it->expires = cputime_sub(it->expires,
1021 cputime_one_jiffy);
981 it->error -= onecputick; 1022 it->error -= onecputick;
982 } 1023 }
983 } else { 1024 } else {
984 it->expires = 0; 1025 it->expires = cputime_zero;
985 } 1026 }
986 1027
987 trace_itimer_expire(signo == SIGPROF ? 1028 trace_itimer_expire(signo == SIGPROF ?
@@ -990,7 +1031,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
990 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
991 } 1032 }
992 1033
993 if (it->expires && (!*expires || it->expires < *expires)) { 1034 if (!cputime_eq(it->expires, cputime_zero) &&
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
994 *expires = it->expires; 1037 *expires = it->expires;
995 } 1038 }
996} 1039}
@@ -1005,7 +1048,9 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1005 */ 1048 */
1006static inline int task_cputime_zero(const struct task_cputime *cputime) 1049static inline int task_cputime_zero(const struct task_cputime *cputime)
1007{ 1050{
1008 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) 1051 if (cputime_eq(cputime->utime, cputime_zero) &&
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1009 return 1; 1054 return 1;
1010 return 0; 1055 return 0;
1011} 1056}
@@ -1031,15 +1076,15 @@ static void check_process_timers(struct task_struct *tsk,
1031 */ 1076 */
1032 thread_group_cputimer(tsk, &cputime); 1077 thread_group_cputimer(tsk, &cputime);
1033 utime = cputime.utime; 1078 utime = cputime.utime;
1034 ptime = utime + cputime.stime; 1079 ptime = cputime_add(utime, cputime.stime);
1035 sum_sched_runtime = cputime.sum_exec_runtime; 1080 sum_sched_runtime = cputime.sum_exec_runtime;
1036 maxfire = 20; 1081 maxfire = 20;
1037 prof_expires = 0; 1082 prof_expires = cputime_zero;
1038 while (!list_empty(timers)) { 1083 while (!list_empty(timers)) {
1039 struct cpu_timer_list *tl = list_first_entry(timers, 1084 struct cpu_timer_list *tl = list_first_entry(timers,
1040 struct cpu_timer_list, 1085 struct cpu_timer_list,
1041 entry); 1086 entry);
1042 if (!--maxfire || ptime < tl->expires.cpu) { 1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
1043 prof_expires = tl->expires.cpu; 1088 prof_expires = tl->expires.cpu;
1044 break; 1089 break;
1045 } 1090 }
@@ -1049,12 +1094,12 @@ static void check_process_timers(struct task_struct *tsk,
1049 1094
1050 ++timers; 1095 ++timers;
1051 maxfire = 20; 1096 maxfire = 20;
1052 virt_expires = 0; 1097 virt_expires = cputime_zero;
1053 while (!list_empty(timers)) { 1098 while (!list_empty(timers)) {
1054 struct cpu_timer_list *tl = list_first_entry(timers, 1099 struct cpu_timer_list *tl = list_first_entry(timers,
1055 struct cpu_timer_list, 1100 struct cpu_timer_list,
1056 entry); 1101 entry);
1057 if (!--maxfire || utime < tl->expires.cpu) { 1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
1058 virt_expires = tl->expires.cpu; 1103 virt_expires = tl->expires.cpu;
1059 break; 1104 break;
1060 } 1105 }
@@ -1109,7 +1154,8 @@ static void check_process_timers(struct task_struct *tsk,
1109 } 1154 }
1110 } 1155 }
1111 x = secs_to_cputime(soft); 1156 x = secs_to_cputime(soft);
1112 if (!prof_expires || x < prof_expires) { 1157 if (cputime_eq(prof_expires, cputime_zero) ||
1158 cputime_lt(x, prof_expires)) {
1113 prof_expires = x; 1159 prof_expires = x;
1114 } 1160 }
1115 } 1161 }
@@ -1203,9 +1249,12 @@ out:
1203static inline int task_cputime_expired(const struct task_cputime *sample, 1249static inline int task_cputime_expired(const struct task_cputime *sample,
1204 const struct task_cputime *expires) 1250 const struct task_cputime *expires)
1205{ 1251{
1206 if (expires->utime && sample->utime >= expires->utime) 1252 if (!cputime_eq(expires->utime, cputime_zero) &&
1253 cputime_ge(sample->utime, expires->utime))
1207 return 1; 1254 return 1;
1208 if (expires->stime && sample->utime + sample->stime >= expires->stime) 1255 if (!cputime_eq(expires->stime, cputime_zero) &&
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1209 return 1; 1258 return 1;
1210 if (expires->sum_exec_runtime != 0 && 1259 if (expires->sum_exec_runtime != 0 &&
1211 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1260 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1242,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1242 if (sig->cputimer.running) { 1291 if (sig->cputimer.running) {
1243 struct task_cputime group_sample; 1292 struct task_cputime group_sample;
1244 1293
1245 raw_spin_lock(&sig->cputimer.lock); 1294 spin_lock(&sig->cputimer.lock);
1246 group_sample = sig->cputimer.cputime; 1295 group_sample = sig->cputimer.cputime;
1247 raw_spin_unlock(&sig->cputimer.lock); 1296 spin_unlock(&sig->cputimer.lock);
1248 1297
1249 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1250 return 1; 1299 return 1;
@@ -1340,18 +1389,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1340 * it to be relative, *newval argument is relative and we update 1389 * it to be relative, *newval argument is relative and we update
1341 * it to be absolute. 1390 * it to be absolute.
1342 */ 1391 */
1343 if (*oldval) { 1392 if (!cputime_eq(*oldval, cputime_zero)) {
1344 if (*oldval <= now.cpu) { 1393 if (cputime_le(*oldval, now.cpu)) {
1345 /* Just about to fire. */ 1394 /* Just about to fire. */
1346 *oldval = cputime_one_jiffy; 1395 *oldval = cputime_one_jiffy;
1347 } else { 1396 } else {
1348 *oldval -= now.cpu; 1397 *oldval = cputime_sub(*oldval, now.cpu);
1349 } 1398 }
1350 } 1399 }
1351 1400
1352 if (!*newval) 1401 if (cputime_eq(*newval, cputime_zero))
1353 return; 1402 return;
1354 *newval += now.cpu; 1403 *newval = cputime_add(*newval, now.cpu);
1355 } 1404 }
1356 1405
1357 /* 1406 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b70..4556182527f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/export.h> 49#include <linux/module.h>
50 50
51/* 51/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180..fcf5a834c4e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,73 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config HAS_WAKELOCK
22 bool
23
24config HAS_EARLYSUSPEND
25 bool
26
27config WAKELOCK
28 bool "Wake lock"
29 depends on PM && RTC_CLASS
30 default n
31 select HAS_WAKELOCK
32 ---help---
33 Enable wakelocks. When user space requests a sleep state, the
34 sleep request will be delayed until no wake locks are held.
35
36config WAKELOCK_STAT
37 bool "Wake lock stats"
38 depends on WAKELOCK
39 default y
40 ---help---
41 Report wake lock stats in /proc/wakelocks
42
43config USER_WAKELOCK
44 bool "Userspace wake locks"
45 depends on WAKELOCK
46 default y
47 ---help---
48 User-space wake lock API. Write "lockname" or "lockname timeout"
49 to /sys/power/wake_lock to take and, if needed, create a wake lock.
50 Write "lockname" to /sys/power/wake_unlock to release a user wake
51 lock.
52
53config EARLYSUSPEND
54 bool "Early suspend"
55 depends on WAKELOCK
56 default y
57 select HAS_EARLYSUSPEND
58 ---help---
59 Call early suspend handlers when the user requested sleep state
60 changes.
61
62choice
63 prompt "User-space screen access"
64 default FB_EARLYSUSPEND if !FRAMEBUFFER_CONSOLE
65 default CONSOLE_EARLYSUSPEND
66 depends on HAS_EARLYSUSPEND
67
68 config NO_USER_SPACE_SCREEN_ACCESS_CONTROL
69 bool "None"
70
71 config CONSOLE_EARLYSUSPEND
72 bool "Console switch on early-suspend"
73 depends on HAS_EARLYSUSPEND && VT
74 ---help---
75 Register an early suspend handler to perform a console switch
76 when user-space should stop drawing to the screen, and a switch
77 back when it should resume.
78
79 config FB_EARLYSUSPEND
80 bool "Sysfs interface"
81 depends on HAS_EARLYSUSPEND
82 ---help---
83 Register early suspend handler that notifies and waits for
84 user-space through sysfs when user-space should stop drawing
85 to the screen and notifies user-space when it should resume.
86endchoice
87
21config HIBERNATE_CALLBACKS 88config HIBERNATE_CALLBACKS
22 bool 89 bool
23 90
@@ -27,7 +94,6 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 94 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 95 select LZO_COMPRESS
29 select LZO_DECOMPRESS 96 select LZO_DECOMPRESS
30 select CRC32
31 ---help--- 97 ---help---
32 Enable the suspend to disk (STD) functionality, which is usually 98 Enable the suspend to disk (STD) functionality, which is usually
33 called "hibernation" in user interfaces. STD checkpoints the 99 called "hibernation" in user interfaces. STD checkpoints the
@@ -66,9 +132,6 @@ config HIBERNATION
66 132
67 For more information take a look at <file:Documentation/power/swsusp.txt>. 133 For more information take a look at <file:Documentation/power/swsusp.txt>.
68 134
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
72config PM_STD_PARTITION 135config PM_STD_PARTITION
73 string "Default resume partition" 136 string "Default resume partition"
74 depends on HIBERNATION 137 depends on HIBERNATION
@@ -103,33 +166,6 @@ config PM_SLEEP_SMP
103 select HOTPLUG 166 select HOTPLUG
104 select HOTPLUG_CPU 167 select HOTPLUG_CPU
105 168
106config PM_AUTOSLEEP
107 bool "Opportunistic sleep"
108 depends on PM_SLEEP
109 default n
110 ---help---
111 Allow the kernel to trigger a system transition into a global sleep
112 state automatically whenever there are no active wakeup sources.
113
114config PM_WAKELOCKS
115 bool "User space wakeup sources interface"
116 depends on PM_SLEEP
117 default n
118 ---help---
119 Allow user space to create, activate and deactivate wakeup source
120 objects with the help of a sysfs-based interface.
121
122config PM_WAKELOCKS_LIMIT
123 int "Maximum number of user space wakeup sources (0 = no limit)"
124 range 0 100000
125 default 100
126 depends on PM_WAKELOCKS
127
128config PM_WAKELOCKS_GC
129 bool "Garbage collector for user space wakeup sources"
130 depends on PM_WAKELOCKS
131 default y
132
133config PM_RUNTIME 169config PM_RUNTIME
134 bool "Run-time PM core functionality" 170 bool "Run-time PM core functionality"
135 depends on !IA64_HP_SIM 171 depends on !IA64_HP_SIM
@@ -175,7 +211,7 @@ config PM_TEST_SUSPEND
175 You probably want to have your system's RTC driver statically 211 You probably want to have your system's RTC driver statically
176 linked, ensuring that it's available when this test runs. 212 linked, ensuring that it's available when this test runs.
177 213
178config PM_SLEEP_DEBUG 214config CAN_PM_TRACE
179 def_bool y 215 def_bool y
180 depends on PM_DEBUG && PM_SLEEP 216 depends on PM_DEBUG && PM_SLEEP
181 217
@@ -196,7 +232,7 @@ config PM_TRACE
196 232
197config PM_TRACE_RTC 233config PM_TRACE_RTC
198 bool "Suspend/resume event tracing" 234 bool "Suspend/resume event tracing"
199 depends on PM_SLEEP_DEBUG 235 depends on CAN_PM_TRACE
200 depends on X86 236 depends on X86
201 select PM_TRACE 237 select PM_TRACE
202 ---help--- 238 ---help---
@@ -263,14 +299,13 @@ config PM_GENERIC_DOMAINS
263 bool 299 bool
264 depends on PM 300 depends on PM
265 301
266config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS
269
270config PM_GENERIC_DOMAINS_RUNTIME 302config PM_GENERIC_DOMAINS_RUNTIME
271 def_bool y 303 def_bool y
272 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 304 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
273 305
274config CPU_PM 306config SUSPEND_TIME
275 bool 307 bool "Log time spent in suspend"
276 depends on SUSPEND || CPU_IDLE 308 ---help---
309 Prints the time spent in suspend in the kernel log, and
310 keeps statistics on the time spent in suspend in
311 /sys/kernel/debug/suspend_time
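
The USER_WAKELOCK help text near the top of this hunk describes a purely sysfs-driven interface: writing "lockname" (optionally followed by a timeout) to /sys/power/wake_lock takes the lock, and writing the same name to /sys/power/wake_unlock drops it. A minimal user-space sketch, not part of the patch; the nanosecond timeout unit is an assumption (Android userwakelock convention) and should be checked against the userwakelock.c added further down:

/* Illustrative sketch only -- not part of the patch.
 * Holds a user-space wake lock named "mylock" for up to 30 s, then drops it.
 * The nanosecond timeout unit is an assumption (Android userwakelock convention).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	sysfs_write("/sys/power/wake_lock", "mylock 30000000000");	/* name + timeout in ns */
	sleep(5);				/* work that must not race with suspend */
	sysfs_write("/sys/power/wake_unlock", "mylock");		/* release early */
	return 0;
}
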
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11e..9b224e16b19 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,15 +1,18 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
7obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
8obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
9obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
10obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
11 block_io.o 10 block_io.o
12obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o 11obj-$(CONFIG_WAKELOCK) += wakelock.o
13obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o 12obj-$(CONFIG_USER_WAKELOCK) += userwakelock.o
13obj-$(CONFIG_EARLYSUSPEND) += earlysuspend.o
14obj-$(CONFIG_CONSOLE_EARLYSUSPEND) += consoleearlysuspend.o
15obj-$(CONFIG_FB_EARLYSUSPEND) += fbearlysuspend.o
16obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o
14 17
15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 18obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
deleted file mode 100644
index ca304046d9e..00000000000
--- a/kernel/power/autosleep.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * kernel/power/autosleep.c
3 *
4 * Opportunistic sleep support.
5 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
7 */
8
9#include <linux/device.h>
10#include <linux/mutex.h>
11#include <linux/pm_wakeup.h>
12
13#include "power.h"
14
15static suspend_state_t autosleep_state;
16static struct workqueue_struct *autosleep_wq;
17/*
18 * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
19 * is active, otherwise a deadlock with try_to_suspend() is possible.
20 * Alternatively mutex_lock_interruptible() can be used. This will then fail
21 * if an auto_sleep cycle tries to freeze processes.
22 */
23static DEFINE_MUTEX(autosleep_lock);
24static struct wakeup_source *autosleep_ws;
25
26static void try_to_suspend(struct work_struct *work)
27{
28 unsigned int initial_count, final_count;
29
30 if (!pm_get_wakeup_count(&initial_count, true))
31 goto out;
32
33 mutex_lock(&autosleep_lock);
34
35 if (!pm_save_wakeup_count(initial_count)) {
36 mutex_unlock(&autosleep_lock);
37 goto out;
38 }
39
40 if (autosleep_state == PM_SUSPEND_ON) {
41 mutex_unlock(&autosleep_lock);
42 return;
43 }
44 if (autosleep_state >= PM_SUSPEND_MAX)
45 hibernate();
46 else
47 pm_suspend(autosleep_state);
48
49 mutex_unlock(&autosleep_lock);
50
51 if (!pm_get_wakeup_count(&final_count, false))
52 goto out;
53
54 /*
55 * If the wakeup occurred for an unknown reason, wait to prevent the
56 * system from trying to suspend and waking up in a tight loop.
57 */
58 if (final_count == initial_count)
59 schedule_timeout_uninterruptible(HZ / 2);
60
61 out:
62 queue_up_suspend_work();
63}
64
65static DECLARE_WORK(suspend_work, try_to_suspend);
66
67void queue_up_suspend_work(void)
68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work);
71}
72
73suspend_state_t pm_autosleep_state(void)
74{
75 return autosleep_state;
76}
77
78int pm_autosleep_lock(void)
79{
80 return mutex_lock_interruptible(&autosleep_lock);
81}
82
83void pm_autosleep_unlock(void)
84{
85 mutex_unlock(&autosleep_lock);
86}
87
88int pm_autosleep_set_state(suspend_state_t state)
89{
90
91#ifndef CONFIG_HIBERNATION
92 if (state >= PM_SUSPEND_MAX)
93 return -EINVAL;
94#endif
95
96 __pm_stay_awake(autosleep_ws);
97
98 mutex_lock(&autosleep_lock);
99
100 autosleep_state = state;
101
102 __pm_relax(autosleep_ws);
103
104 if (state > PM_SUSPEND_ON) {
105 pm_wakep_autosleep_enabled(true);
106 queue_up_suspend_work();
107 } else {
108 pm_wakep_autosleep_enabled(false);
109 }
110
111 mutex_unlock(&autosleep_lock);
112 return 0;
113}
114
115int __init pm_autosleep_init(void)
116{
117 autosleep_ws = wakeup_source_register("autosleep");
118 if (!autosleep_ws)
119 return -ENOMEM;
120
121 autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
122 if (autosleep_wq)
123 return 0;
124
125 wakeup_source_unregister(autosleep_ws);
126 return -ENOMEM;
127}
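
For context, the autosleep.c removed above implemented opportunistic sleep behind a single sysfs file: user space wrote a sleep state to /sys/power/autosleep (handled by the autosleep_store() also deleted from kernel/power/main.c further down), and try_to_suspend() then re-queued itself whenever no wakeup source was active. A rough sketch of how that interface was driven, assuming a kernel still built with CONFIG_PM_AUTOSLEEP:

/* Illustrative sketch only: drive the (removed) autosleep interface.
 * "mem" or "disk" arms opportunistic suspend/hibernate, "off" disarms it.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void autosleep_set(const char *state)
{
	int fd = open("/sys/power/autosleep", O_WRONLY);

	if (fd < 0) {
		perror("/sys/power/autosleep");
		return;
	}
	if (write(fd, state, strlen(state)) < 0)
		perror("/sys/power/autosleep");
	close(fd);
}

int main(void)
{
	autosleep_set("mem");	/* suspend whenever no wakeup source is active */
	/* ... */
	autosleep_set("off");	/* back to explicit suspend only */
	return 0;
}
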
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b..218e5af9015 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Functions for saving/restoring console. 2 * kernel/power/console.c - Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
@@ -31,3 +32,4 @@ void pm_restore_console(void)
31 vt_kmsg_redirect(orig_kmsg); 32 vt_kmsg_redirect(orig_kmsg);
32 } 33 }
33} 34}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773..8f7b1db1ece 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,18 +5,16 @@
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
9 * 8 *
10 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
11 */ 10 */
12 11
13#include <linux/export.h>
14#include <linux/suspend.h> 12#include <linux/suspend.h>
15#include <linux/syscalls.h> 13#include <linux/syscalls.h>
16#include <linux/reboot.h> 14#include <linux/reboot.h>
17#include <linux/string.h> 15#include <linux/string.h>
18#include <linux/device.h> 16#include <linux/device.h>
19#include <linux/async.h> 17#include <linux/kmod.h>
20#include <linux/delay.h> 18#include <linux/delay.h>
21#include <linux/fs.h> 19#include <linux/fs.h>
22#include <linux/mount.h> 20#include <linux/mount.h>
@@ -26,29 +24,25 @@
26#include <linux/freezer.h> 24#include <linux/freezer.h>
27#include <linux/gfp.h> 25#include <linux/gfp.h>
28#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 27#include <scsi/scsi_scan.h>
30#include <linux/genhd.h>
31 28
32#include "power.h" 29#include "power.h"
33 30
34 31
35static int nocompress; 32static int nocompress = 0;
36static int noresume; 33static int noresume = 0;
37static int resume_wait;
38static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 34static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 35dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 36sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 37int in_suspend __nosavedata = 0;
43 38
44enum { 39enum {
45 HIBERNATION_INVALID, 40 HIBERNATION_INVALID,
46 HIBERNATION_PLATFORM, 41 HIBERNATION_PLATFORM,
42 HIBERNATION_TEST,
43 HIBERNATION_TESTPROC,
47 HIBERNATION_SHUTDOWN, 44 HIBERNATION_SHUTDOWN,
48 HIBERNATION_REBOOT, 45 HIBERNATION_REBOOT,
49#ifdef CONFIG_SUSPEND
50 HIBERNATION_SUSPEND,
51#endif
52 /* keep last */ 46 /* keep last */
53 __HIBERNATION_AFTER_LAST 47 __HIBERNATION_AFTER_LAST
54}; 48};
@@ -57,8 +51,6 @@ enum {
57 51
58static int hibernation_mode = HIBERNATION_SHUTDOWN; 52static int hibernation_mode = HIBERNATION_SHUTDOWN;
59 53
60bool freezer_test_done;
61
62static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
63 55
64/** 56/**
@@ -73,14 +65,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
73 WARN_ON(1); 65 WARN_ON(1);
74 return; 66 return;
75 } 67 }
76 lock_system_sleep(); 68 mutex_lock(&pm_mutex);
77 hibernation_ops = ops; 69 hibernation_ops = ops;
78 if (ops) 70 if (ops)
79 hibernation_mode = HIBERNATION_PLATFORM; 71 hibernation_mode = HIBERNATION_PLATFORM;
80 else if (hibernation_mode == HIBERNATION_PLATFORM) 72 else if (hibernation_mode == HIBERNATION_PLATFORM)
81 hibernation_mode = HIBERNATION_SHUTDOWN; 73 hibernation_mode = HIBERNATION_SHUTDOWN;
82 74
83 unlock_system_sleep(); 75 mutex_unlock(&pm_mutex);
84} 76}
85 77
86static bool entering_platform_hibernation; 78static bool entering_platform_hibernation;
@@ -98,6 +90,15 @@ static void hibernation_debug_sleep(void)
98 mdelay(5000); 90 mdelay(5000);
99} 91}
100 92
93static int hibernation_testmode(int mode)
94{
95 if (hibernation_mode == mode) {
96 hibernation_debug_sleep();
97 return 1;
98 }
99 return 0;
100}
101
101static int hibernation_test(int level) 102static int hibernation_test(int level)
102{ 103{
103 if (pm_test_level == level) { 104 if (pm_test_level == level) {
@@ -107,6 +108,7 @@ static int hibernation_test(int level)
107 return 0; 108 return 0;
108} 109}
109#else /* !CONFIG_PM_DEBUG */ 110#else /* !CONFIG_PM_DEBUG */
111static int hibernation_testmode(int mode) { return 0; }
110static int hibernation_test(int level) { return 0; } 112static int hibernation_test(int level) { return 0; }
111#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
112 114
@@ -249,8 +251,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
249 * create_image - Create a hibernation image. 251 * create_image - Create a hibernation image.
250 * @platform_mode: Whether or not to use the platform driver. 252 * @platform_mode: Whether or not to use the platform driver.
251 * 253 *
252 * Execute device drivers' "late" and "noirq" freeze callbacks, create a 254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
253 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. 255 * and execute the drivers' .thaw_noirq() callbacks.
254 * 256 *
255 * Control reappears in this routine after the subsequent restore. 257 * Control reappears in this routine after the subsequent restore.
256 */ 258 */
@@ -258,7 +260,7 @@ static int create_image(int platform_mode)
258{ 260{
259 int error; 261 int error;
260 262
261 error = dpm_suspend_end(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
262 if (error) { 264 if (error) {
263 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
264 "aborting hibernation\n"); 266 "aborting hibernation\n");
@@ -270,7 +272,8 @@ static int create_image(int platform_mode)
270 goto Platform_finish; 272 goto Platform_finish;
271 273
272 error = disable_nonboot_cpus(); 274 error = disable_nonboot_cpus();
273 if (error || hibernation_test(TEST_CPUS)) 275 if (error || hibernation_test(TEST_CPUS)
276 || hibernation_testmode(HIBERNATION_TEST))
274 goto Enable_cpus; 277 goto Enable_cpus;
275 278
276 local_irq_disable(); 279 local_irq_disable();
@@ -310,7 +313,7 @@ static int create_image(int platform_mode)
310 Platform_finish: 313 Platform_finish:
311 platform_finish(platform_mode); 314 platform_finish(platform_mode);
312 315
313 dpm_resume_start(in_suspend ? 316 dpm_resume_noirq(in_suspend ?
314 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 317 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
315 318
316 return error; 319 return error;
@@ -324,55 +327,38 @@ static int create_image(int platform_mode)
324 */ 327 */
325int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
326{ 329{
327 pm_message_t msg; 330 pm_message_t msg = PMSG_RECOVER;
328 int error; 331 int error;
329 332
330 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
331 if (error) 334 if (error)
332 goto Close; 335 goto Close;
333 336
334 /* Preallocate image memory before shutting down devices. */ 337 error = dpm_prepare(PMSG_FREEZE);
335 error = hibernate_preallocate_memory();
336 if (error) 338 if (error)
337 goto Close; 339 goto Complete_devices;
338 340
339 error = freeze_kernel_threads(); 341 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory();
340 if (error) 343 if (error)
341 goto Cleanup; 344 goto Complete_devices;
342
343 if (hibernation_test(TEST_FREEZER)) {
344
345 /*
346 * Indicate to the caller that we are returning due to a
347 * successful freezer test.
348 */
349 freezer_test_done = true;
350 goto Thaw;
351 }
352
353 error = dpm_prepare(PMSG_FREEZE);
354 if (error) {
355 dpm_complete(PMSG_RECOVER);
356 goto Thaw;
357 }
358 345
359 suspend_console(); 346 suspend_console();
360 ftrace_stop();
361 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
362
363 error = dpm_suspend(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
349 if (error)
350 goto Recover_platform;
364 351
365 if (error || hibernation_test(TEST_DEVICES)) 352 if (hibernation_test(TEST_DEVICES))
366 platform_recover(platform_mode); 353 goto Recover_platform;
367 else
368 error = create_image(platform_mode);
369 354
355 error = create_image(platform_mode);
370 /* 356 /*
371 * In the case that we call create_image() above, the control 357 * Control returns here (1) after the image has been created or the
372 * returns here (1) after the image has been created or the
373 * image creation has failed and (2) after a successful restore. 358 * image creation has failed and (2) after a successful restore.
374 */ 359 */
375 360
361 Resume_devices:
376 /* We may need to release the preallocated image pages here. */ 362 /* We may need to release the preallocated image pages here. */
377 if (error || !in_suspend) 363 if (error || !in_suspend)
378 swsusp_free(); 364 swsusp_free();
@@ -383,35 +369,34 @@ int hibernation_snapshot(int platform_mode)
383 if (error || !in_suspend) 369 if (error || !in_suspend)
384 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
385 371
386 ftrace_start();
387 resume_console(); 372 resume_console();
373
374 Complete_devices:
388 dpm_complete(msg); 375 dpm_complete(msg);
389 376
390 Close: 377 Close:
391 platform_end(platform_mode); 378 platform_end(platform_mode);
392 return error; 379 return error;
393 380
394 Thaw: 381 Recover_platform:
395 thaw_kernel_threads(); 382 platform_recover(platform_mode);
396 Cleanup: 383 goto Resume_devices;
397 swsusp_free();
398 goto Close;
399} 384}
400 385
401/** 386/**
402 * resume_target_kernel - Restore system state from a hibernation image. 387 * resume_target_kernel - Restore system state from a hibernation image.
403 * @platform_mode: Whether or not to use the platform driver. 388 * @platform_mode: Whether or not to use the platform driver.
404 * 389 *
405 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
406 * contents of highmem that have not been restored yet from the image and run 391 * highmem that have not been restored yet from the image and run the low-level
407 * the low-level code that will restore the remaining contents of memory and 392 * code that will restore the remaining contents of memory and switch to the
408 * switch to the just restored target kernel. 393 * just restored target kernel.
409 */ 394 */
410static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
411{ 396{
412 int error; 397 int error;
413 398
414 error = dpm_suspend_end(PMSG_QUIESCE); 399 error = dpm_suspend_noirq(PMSG_QUIESCE);
415 if (error) { 400 if (error) {
416 printk(KERN_ERR "PM: Some devices failed to power down, " 401 printk(KERN_ERR "PM: Some devices failed to power down, "
417 "aborting resume\n"); 402 "aborting resume\n");
@@ -468,7 +453,7 @@ static int resume_target_kernel(bool platform_mode)
468 Cleanup: 453 Cleanup:
469 platform_restore_cleanup(platform_mode); 454 platform_restore_cleanup(platform_mode);
470 455
471 dpm_resume_start(PMSG_RECOVER); 456 dpm_resume_noirq(PMSG_RECOVER);
472 457
473 return error; 458 return error;
474} 459}
@@ -478,7 +463,7 @@ static int resume_target_kernel(bool platform_mode)
478 * @platform_mode: If set, use platform driver to prepare for the transition. 463 * @platform_mode: If set, use platform driver to prepare for the transition.
479 * 464 *
480 * This routine must be called with pm_mutex held. If it is successful, control 465 * This routine must be called with pm_mutex held. If it is successful, control
481 * reappears in the restored target kernel in hibernation_snapshot(). 466 * reappears in the restored target kernel in hibernation_snapshot().
482 */ 467 */
483int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
484{ 469{
@@ -486,7 +471,6 @@ int hibernation_restore(int platform_mode)
486 471
487 pm_prepare_console(); 472 pm_prepare_console();
488 suspend_console(); 473 suspend_console();
489 ftrace_stop();
490 pm_restrict_gfp_mask(); 474 pm_restrict_gfp_mask();
491 error = dpm_suspend_start(PMSG_QUIESCE); 475 error = dpm_suspend_start(PMSG_QUIESCE);
492 if (!error) { 476 if (!error) {
@@ -494,7 +478,6 @@ int hibernation_restore(int platform_mode)
494 dpm_resume_end(PMSG_RECOVER); 478 dpm_resume_end(PMSG_RECOVER);
495 } 479 }
496 pm_restore_gfp_mask(); 480 pm_restore_gfp_mask();
497 ftrace_start();
498 resume_console(); 481 resume_console();
499 pm_restore_console(); 482 pm_restore_console();
500 return error; 483 return error;
@@ -521,7 +504,6 @@ int hibernation_platform_enter(void)
521 504
522 entering_platform_hibernation = true; 505 entering_platform_hibernation = true;
523 suspend_console(); 506 suspend_console();
524 ftrace_stop();
525 error = dpm_suspend_start(PMSG_HIBERNATE); 507 error = dpm_suspend_start(PMSG_HIBERNATE);
526 if (error) { 508 if (error) {
527 if (hibernation_ops->recover) 509 if (hibernation_ops->recover)
@@ -529,7 +511,7 @@ int hibernation_platform_enter(void)
529 goto Resume_devices; 511 goto Resume_devices;
530 } 512 }
531 513
532 error = dpm_suspend_end(PMSG_HIBERNATE); 514 error = dpm_suspend_noirq(PMSG_HIBERNATE);
533 if (error) 515 if (error)
534 goto Resume_devices; 516 goto Resume_devices;
535 517
@@ -560,12 +542,11 @@ int hibernation_platform_enter(void)
560 Platform_finish: 542 Platform_finish:
561 hibernation_ops->finish(); 543 hibernation_ops->finish();
562 544
563 dpm_resume_start(PMSG_RESTORE); 545 dpm_resume_noirq(PMSG_RESTORE);
564 546
565 Resume_devices: 547 Resume_devices:
566 entering_platform_hibernation = false; 548 entering_platform_hibernation = false;
567 dpm_resume_end(PMSG_RESTORE); 549 dpm_resume_end(PMSG_RESTORE);
568 ftrace_start();
569 resume_console(); 550 resume_console();
570 551
571 Close: 552 Close:
@@ -583,11 +564,10 @@ int hibernation_platform_enter(void)
583 */ 564 */
584static void power_down(void) 565static void power_down(void)
585{ 566{
586#ifdef CONFIG_SUSPEND
587 int error;
588#endif
589
590 switch (hibernation_mode) { 567 switch (hibernation_mode) {
568 case HIBERNATION_TEST:
569 case HIBERNATION_TESTPROC:
570 break;
591 case HIBERNATION_REBOOT: 571 case HIBERNATION_REBOOT:
592 kernel_restart(NULL); 572 kernel_restart(NULL);
593 break; 573 break;
@@ -596,25 +576,6 @@ static void power_down(void)
596 case HIBERNATION_SHUTDOWN: 576 case HIBERNATION_SHUTDOWN:
597 kernel_power_off(); 577 kernel_power_off();
598 break; 578 break;
599#ifdef CONFIG_SUSPEND
600 case HIBERNATION_SUSPEND:
601 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
602 if (error) {
603 if (hibernation_ops)
604 hibernation_mode = HIBERNATION_PLATFORM;
605 else
606 hibernation_mode = HIBERNATION_SHUTDOWN;
607 power_down();
608 }
609 /*
610 * Restore swap signature.
611 */
612 error = swsusp_unmark();
613 if (error)
614 printk(KERN_ERR "PM: Swap will be unusable! "
615 "Try swapon -a.\n");
616 return;
617#endif
618 } 579 }
619 kernel_halt(); 580 kernel_halt();
620 /* 581 /*
@@ -625,6 +586,17 @@ static void power_down(void)
625 while(1); 586 while(1);
626} 587}
627 588
589static int prepare_processes(void)
590{
591 int error = 0;
592
593 if (freeze_processes()) {
594 error = -EBUSY;
595 thaw_processes();
596 }
597 return error;
598}
599
628/** 600/**
629 * hibernate - Carry out system hibernation, including saving the image. 601 * hibernate - Carry out system hibernation, including saving the image.
630 */ 602 */
@@ -632,7 +604,7 @@ int hibernate(void)
632{ 604{
633 int error; 605 int error;
634 606
635 lock_system_sleep(); 607 mutex_lock(&pm_mutex);
636 /* The snapshot device should not be opened while we're running */ 608 /* The snapshot device should not be opened while we're running */
637 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 609 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
638 error = -EBUSY; 610 error = -EBUSY;
@@ -644,6 +616,10 @@ int hibernate(void)
644 if (error) 616 if (error)
645 goto Exit; 617 goto Exit;
646 618
619 error = usermodehelper_disable();
620 if (error)
621 goto Exit;
622
647 /* Allocate memory management structures */ 623 /* Allocate memory management structures */
648 error = create_basic_memory_bitmaps(); 624 error = create_basic_memory_bitmaps();
649 if (error) 625 if (error)
@@ -653,12 +629,18 @@ int hibernate(void)
653 sys_sync(); 629 sys_sync();
654 printk("done.\n"); 630 printk("done.\n");
655 631
656 error = freeze_processes(); 632 error = prepare_processes();
657 if (error) 633 if (error)
658 goto Free_bitmaps; 634 goto Finish;
635
636 if (hibernation_test(TEST_FREEZER))
637 goto Thaw;
638
639 if (hibernation_testmode(HIBERNATION_TESTPROC))
640 goto Thaw;
659 641
660 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 642 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
661 if (error || freezer_test_done) 643 if (error)
662 goto Thaw; 644 goto Thaw;
663 645
664 if (in_suspend) { 646 if (in_suspend) {
@@ -668,9 +650,6 @@ int hibernate(void)
668 flags |= SF_PLATFORM_MODE; 650 flags |= SF_PLATFORM_MODE;
669 if (nocompress) 651 if (nocompress)
670 flags |= SF_NOCOMPRESS_MODE; 652 flags |= SF_NOCOMPRESS_MODE;
671 else
672 flags |= SF_CRC32_MODE;
673
674 pr_debug("PM: writing image.\n"); 653 pr_debug("PM: writing image.\n");
675 error = swsusp_write(flags); 654 error = swsusp_write(flags);
676 swsusp_free(); 655 swsusp_free();
@@ -684,18 +663,15 @@ int hibernate(void)
684 663
685 Thaw: 664 Thaw:
686 thaw_processes(); 665 thaw_processes();
687 666 Finish:
688 /* Don't bother checking whether freezer_test_done is true */
689 freezer_test_done = false;
690
691 Free_bitmaps:
692 free_basic_memory_bitmaps(); 667 free_basic_memory_bitmaps();
668 usermodehelper_enable();
693 Exit: 669 Exit:
694 pm_notifier_call_chain(PM_POST_HIBERNATION); 670 pm_notifier_call_chain(PM_POST_HIBERNATION);
695 pm_restore_console(); 671 pm_restore_console();
696 atomic_inc(&snapshot_device_available); 672 atomic_inc(&snapshot_device_available);
697 Unlock: 673 Unlock:
698 unlock_system_sleep(); 674 mutex_unlock(&pm_mutex);
699 return error; 675 return error;
700} 676}
701 677
@@ -748,37 +724,20 @@ static int software_resume(void)
748 724
749 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
750 726
751 if (resume_delay) {
752 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
753 resume_delay);
754 ssleep(resume_delay);
755 }
756
757 /* Check if the device is there */ 727 /* Check if the device is there */
758 swsusp_resume_device = name_to_dev_t(resume_file); 728 swsusp_resume_device = name_to_dev_t(resume_file);
759
760 /*
761 * name_to_dev_t is ineffective to verify partition if resume_file is in
762 * integer format. (e.g. major:minor)
763 */
764 if (isdigit(resume_file[0]) && resume_wait) {
765 int partno;
766 while (!get_gendisk(swsusp_resume_device, &partno))
767 msleep(10);
768 }
769
770 if (!swsusp_resume_device) { 729 if (!swsusp_resume_device) {
771 /* 730 /*
772 * Some device discovery might still be in progress; we need 731 * Some device discovery might still be in progress; we need
773 * to wait for this to finish. 732 * to wait for this to finish.
774 */ 733 */
775 wait_for_device_probe(); 734 wait_for_device_probe();
776 735 /*
777 if (resume_wait) { 736 * We can't depend on SCSI devices being available after loading
778 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) 737 * one of their modules until scsi_complete_async_scans() is
779 msleep(10); 738 * called and the resume device usually is a SCSI one.
780 async_synchronize_full(); 739 */
781 } 740 scsi_complete_async_scans();
782 741
783 swsusp_resume_device = name_to_dev_t(resume_file); 742 swsusp_resume_device = name_to_dev_t(resume_file);
784 if (!swsusp_resume_device) { 743 if (!swsusp_resume_device) {
@@ -808,12 +767,16 @@ static int software_resume(void)
808 if (error) 767 if (error)
809 goto close_finish; 768 goto close_finish;
810 769
770 error = usermodehelper_disable();
771 if (error)
772 goto close_finish;
773
811 error = create_basic_memory_bitmaps(); 774 error = create_basic_memory_bitmaps();
812 if (error) 775 if (error)
813 goto close_finish; 776 goto close_finish;
814 777
815 pr_debug("PM: Preparing processes for restore.\n"); 778 pr_debug("PM: Preparing processes for restore.\n");
816 error = freeze_processes(); 779 error = prepare_processes();
817 if (error) { 780 if (error) {
818 swsusp_close(FMODE_READ); 781 swsusp_close(FMODE_READ);
819 goto Done; 782 goto Done;
@@ -831,6 +794,7 @@ static int software_resume(void)
831 thaw_processes(); 794 thaw_processes();
832 Done: 795 Done:
833 free_basic_memory_bitmaps(); 796 free_basic_memory_bitmaps();
797 usermodehelper_enable();
834 Finish: 798 Finish:
835 pm_notifier_call_chain(PM_POST_RESTORE); 799 pm_notifier_call_chain(PM_POST_RESTORE);
836 pm_restore_console(); 800 pm_restore_console();
@@ -852,9 +816,8 @@ static const char * const hibernation_modes[] = {
852 [HIBERNATION_PLATFORM] = "platform", 816 [HIBERNATION_PLATFORM] = "platform",
853 [HIBERNATION_SHUTDOWN] = "shutdown", 817 [HIBERNATION_SHUTDOWN] = "shutdown",
854 [HIBERNATION_REBOOT] = "reboot", 818 [HIBERNATION_REBOOT] = "reboot",
855#ifdef CONFIG_SUSPEND 819 [HIBERNATION_TEST] = "test",
856 [HIBERNATION_SUSPEND] = "suspend", 820 [HIBERNATION_TESTPROC] = "testproc",
857#endif
858}; 821};
859 822
860/* 823/*
@@ -863,15 +826,17 @@ static const char * const hibernation_modes[] = {
863 * Hibernation can be handled in several ways. There are a few different ways 826 * Hibernation can be handled in several ways. There are a few different ways
864 * to put the system into the sleep state: using the platform driver (e.g. ACPI 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
865 * or other hibernation_ops), powering it off or rebooting it (for testing 828 * or other hibernation_ops), powering it off or rebooting it (for testing
866 * mostly). 829 * mostly), or using one of the two available test modes.
867 * 830 *
868 * The sysfs file /sys/power/disk provides an interface for selecting the 831 * The sysfs file /sys/power/disk provides an interface for selecting the
869 * hibernation mode to use. Reading from this file causes the available modes 832 * hibernation mode to use. Reading from this file causes the available modes
870 * to be printed. There are 3 modes that can be supported: 833 * to be printed. There are 5 modes that can be supported:
871 * 834 *
872 * 'platform' 835 * 'platform'
873 * 'shutdown' 836 * 'shutdown'
874 * 'reboot' 837 * 'reboot'
838 * 'test'
839 * 'testproc'
875 * 840 *
876 * If a platform hibernation driver is in use, 'platform' will be supported 841 * If a platform hibernation driver is in use, 'platform' will be supported
877 * and will be used by default. Otherwise, 'shutdown' will be used by default. 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -895,9 +860,8 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
895 switch (i) { 860 switch (i) {
896 case HIBERNATION_SHUTDOWN: 861 case HIBERNATION_SHUTDOWN:
897 case HIBERNATION_REBOOT: 862 case HIBERNATION_REBOOT:
898#ifdef CONFIG_SUSPEND 863 case HIBERNATION_TEST:
899 case HIBERNATION_SUSPEND: 864 case HIBERNATION_TESTPROC:
900#endif
901 break; 865 break;
902 case HIBERNATION_PLATFORM: 866 case HIBERNATION_PLATFORM:
903 if (hibernation_ops) 867 if (hibernation_ops)
@@ -926,7 +890,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
926 p = memchr(buf, '\n', n); 890 p = memchr(buf, '\n', n);
927 len = p ? p - buf : n; 891 len = p ? p - buf : n;
928 892
929 lock_system_sleep(); 893 mutex_lock(&pm_mutex);
930 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 894 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
931 if (len == strlen(hibernation_modes[i]) 895 if (len == strlen(hibernation_modes[i])
932 && !strncmp(buf, hibernation_modes[i], len)) { 896 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -938,9 +902,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
938 switch (mode) { 902 switch (mode) {
939 case HIBERNATION_SHUTDOWN: 903 case HIBERNATION_SHUTDOWN:
940 case HIBERNATION_REBOOT: 904 case HIBERNATION_REBOOT:
941#ifdef CONFIG_SUSPEND 905 case HIBERNATION_TEST:
942 case HIBERNATION_SUSPEND: 906 case HIBERNATION_TESTPROC:
943#endif
944 hibernation_mode = mode; 907 hibernation_mode = mode;
945 break; 908 break;
946 case HIBERNATION_PLATFORM: 909 case HIBERNATION_PLATFORM:
@@ -955,7 +918,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
955 if (!error) 918 if (!error)
956 pr_debug("PM: Hibernation mode set to '%s'\n", 919 pr_debug("PM: Hibernation mode set to '%s'\n",
957 hibernation_modes[mode]); 920 hibernation_modes[mode]);
958 unlock_system_sleep(); 921 mutex_unlock(&pm_mutex);
959 return error ? error : n; 922 return error ? error : n;
960} 923}
961 924
@@ -982,9 +945,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
982 if (maj != MAJOR(res) || min != MINOR(res)) 945 if (maj != MAJOR(res) || min != MINOR(res))
983 goto out; 946 goto out;
984 947
985 lock_system_sleep(); 948 mutex_lock(&pm_mutex);
986 swsusp_resume_device = res; 949 swsusp_resume_device = res;
987 unlock_system_sleep(); 950 mutex_unlock(&pm_mutex);
988 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 951 printk(KERN_INFO "PM: Starting manual resume from disk\n");
989 noresume = 0; 952 noresume = 0;
990 software_resume(); 953 software_resume();
@@ -1097,21 +1060,7 @@ static int __init noresume_setup(char *str)
1097 return 1; 1060 return 1;
1098} 1061}
1099 1062
1100static int __init resumewait_setup(char *str)
1101{
1102 resume_wait = 1;
1103 return 1;
1104}
1105
1106static int __init resumedelay_setup(char *str)
1107{
1108 resume_delay = simple_strtoul(str, NULL, 0);
1109 return 1;
1110}
1111
1112__setup("noresume", noresume_setup); 1063__setup("noresume", noresume_setup);
1113__setup("resume_offset=", resume_offset_setup); 1064__setup("resume_offset=", resume_offset_setup);
1114__setup("resume=", resume_setup); 1065__setup("resume=", resume_setup);
1115__setup("hibernate=", hibernate_setup); 1066__setup("hibernate=", hibernate_setup);
1116__setup("resumewait", resumewait_setup);
1117__setup("resumedelay=", resumedelay_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de..3304594553c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,18 +3,15 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
10 10
11#include <linux/export.h>
12#include <linux/kobject.h> 11#include <linux/kobject.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
15#include <linux/workqueue.h> 14#include <linux/workqueue.h>
16#include <linux/debugfs.h>
17#include <linux/seq_file.h>
18 15
19#include "power.h" 16#include "power.h"
20 17
@@ -59,7 +56,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
59{ 56{
60 unsigned long val; 57 unsigned long val;
61 58
62 if (kstrtoul(buf, 10, &val)) 59 if (strict_strtoul(buf, 10, &val))
63 return -EINVAL; 60 return -EINVAL;
64 61
65 if (val > 1) 62 if (val > 1)
@@ -116,7 +113,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
116 p = memchr(buf, '\n', n); 113 p = memchr(buf, '\n', n);
117 len = p ? p - buf : n; 114 len = p ? p - buf : n;
118 115
119 lock_system_sleep(); 116 mutex_lock(&pm_mutex);
120 117
121 level = TEST_FIRST; 118 level = TEST_FIRST;
122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 119 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +123,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
126 break; 123 break;
127 } 124 }
128 125
129 unlock_system_sleep(); 126 mutex_unlock(&pm_mutex);
130 127
131 return error ? error : n; 128 return error ? error : n;
132} 129}
@@ -134,148 +131,8 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
134power_attr(pm_test); 131power_attr(pm_test);
135#endif /* CONFIG_PM_DEBUG */ 132#endif /* CONFIG_PM_DEBUG */
136 133
137#ifdef CONFIG_DEBUG_FS
138static char *suspend_step_name(enum suspend_stat_step step)
139{
140 switch (step) {
141 case SUSPEND_FREEZE:
142 return "freeze";
143 case SUSPEND_PREPARE:
144 return "prepare";
145 case SUSPEND_SUSPEND:
146 return "suspend";
147 case SUSPEND_SUSPEND_NOIRQ:
148 return "suspend_noirq";
149 case SUSPEND_RESUME_NOIRQ:
150 return "resume_noirq";
151 case SUSPEND_RESUME:
152 return "resume";
153 default:
154 return "";
155 }
156}
157
158static int suspend_stats_show(struct seq_file *s, void *unused)
159{
160 int i, index, last_dev, last_errno, last_step;
161
162 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
163 last_dev %= REC_FAILED_NUM;
164 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
177 "failed_suspend_noirq",
178 suspend_stats.failed_suspend_noirq,
179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
182 "failed_resume_noirq",
183 suspend_stats.failed_resume_noirq);
184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
185 suspend_stats.failed_devs[last_dev]);
186 for (i = 1; i < REC_FAILED_NUM; i++) {
187 index = last_dev + REC_FAILED_NUM - i;
188 index %= REC_FAILED_NUM;
189 seq_printf(s, "\t\t\t%-s\n",
190 suspend_stats.failed_devs[index]);
191 }
192 seq_printf(s, " last_failed_errno:\t%-d\n",
193 suspend_stats.errno[last_errno]);
194 for (i = 1; i < REC_FAILED_NUM; i++) {
195 index = last_errno + REC_FAILED_NUM - i;
196 index %= REC_FAILED_NUM;
197 seq_printf(s, "\t\t\t%-d\n",
198 suspend_stats.errno[index]);
199 }
200 seq_printf(s, " last_failed_step:\t%-s\n",
201 suspend_step_name(
202 suspend_stats.failed_steps[last_step]));
203 for (i = 1; i < REC_FAILED_NUM; i++) {
204 index = last_step + REC_FAILED_NUM - i;
205 index %= REC_FAILED_NUM;
206 seq_printf(s, "\t\t\t%-s\n",
207 suspend_step_name(
208 suspend_stats.failed_steps[index]));
209 }
210
211 return 0;
212}
213
214static int suspend_stats_open(struct inode *inode, struct file *file)
215{
216 return single_open(file, suspend_stats_show, NULL);
217}
218
219static const struct file_operations suspend_stats_operations = {
220 .open = suspend_stats_open,
221 .read = seq_read,
222 .llseek = seq_lseek,
223 .release = single_release,
224};
225
226static int __init pm_debugfs_init(void)
227{
228 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
229 NULL, NULL, &suspend_stats_operations);
230 return 0;
231}
232
233late_initcall(pm_debugfs_init);
234#endif /* CONFIG_DEBUG_FS */
235
236#endif /* CONFIG_PM_SLEEP */ 134#endif /* CONFIG_PM_SLEEP */
237 135
238#ifdef CONFIG_PM_SLEEP_DEBUG
239/*
240 * pm_print_times: print time taken by devices to suspend and resume.
241 *
242 * show() returns whether printing of suspend and resume times is enabled.
243 * store() accepts 0 or 1. 0 disables printing and 1 enables it.
244 */
245bool pm_print_times_enabled;
246
247static ssize_t pm_print_times_show(struct kobject *kobj,
248 struct kobj_attribute *attr, char *buf)
249{
250 return sprintf(buf, "%d\n", pm_print_times_enabled);
251}
252
253static ssize_t pm_print_times_store(struct kobject *kobj,
254 struct kobj_attribute *attr,
255 const char *buf, size_t n)
256{
257 unsigned long val;
258
259 if (kstrtoul(buf, 10, &val))
260 return -EINVAL;
261
262 if (val > 1)
263 return -EINVAL;
264
265 pm_print_times_enabled = !!val;
266 return n;
267}
268
269power_attr(pm_print_times);
270
271static inline void pm_print_times_init(void)
272{
273 pm_print_times_enabled = !!initcall_debug;
274}
275#else /* !CONFIG_PM_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */
278
279struct kobject *power_kobj; 136struct kobject *power_kobj;
280 137
281/** 138/**
@@ -285,7 +142,7 @@ struct kobject *power_kobj;
285 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 142 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
286 * 'disk' (Suspend-to-Disk). 143 * 'disk' (Suspend-to-Disk).
287 * 144 *
288 * store() accepts one of those strings, translates it into the 145 * store() accepts one of those strings, translates it into the
289 * proper enumerated value, and initiates a suspend transition. 146 * proper enumerated value, and initiates a suspend transition.
290 */ 147 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 148static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -310,56 +167,47 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
310 return (s - buf); 167 return (s - buf);
311} 168}
312 169
313static suspend_state_t decode_state(const char *buf, size_t n) 170static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
171 const char *buf, size_t n)
314{ 172{
315#ifdef CONFIG_SUSPEND 173#ifdef CONFIG_SUSPEND
174#ifdef CONFIG_EARLYSUSPEND
175 suspend_state_t state = PM_SUSPEND_ON;
176#else
316 suspend_state_t state = PM_SUSPEND_STANDBY; 177 suspend_state_t state = PM_SUSPEND_STANDBY;
178#endif
317 const char * const *s; 179 const char * const *s;
318#endif 180#endif
319 char *p; 181 char *p;
320 int len; 182 int len;
183 int error = -EINVAL;
321 184
322 p = memchr(buf, '\n', n); 185 p = memchr(buf, '\n', n);
323 len = p ? p - buf : n; 186 len = p ? p - buf : n;
324 187
325 /* Check hibernation first. */ 188 /* First, check if we are requested to hibernate */
326 if (len == 4 && !strncmp(buf, "disk", len)) 189 if (len == 4 && !strncmp(buf, "disk", len)) {
327 return PM_SUSPEND_MAX; 190 error = hibernate();
191 goto Exit;
192 }
328 193
329#ifdef CONFIG_SUSPEND 194#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 195 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 196 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
332 return state; 197 break;
333#endif
334
335 return PM_SUSPEND_ON;
336}
337
338static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
339 const char *buf, size_t n)
340{
341 suspend_state_t state;
342 int error;
343
344 error = pm_autosleep_lock();
345 if (error)
346 return error;
347
348 if (pm_autosleep_state() > PM_SUSPEND_ON) {
349 error = -EBUSY;
350 goto out;
351 } 198 }
199 if (state < PM_SUSPEND_MAX && *s)
200#ifdef CONFIG_EARLYSUSPEND
201 if (state == PM_SUSPEND_ON || valid_state(state)) {
202 error = 0;
203 request_suspend_state(state);
204 }
205#else
206 error = enter_state(state);
207#endif
208#endif
352 209
353 state = decode_state(buf, n); 210 Exit:
354 if (state < PM_SUSPEND_MAX)
355 error = pm_suspend(state);
356 else if (state == PM_SUSPEND_MAX)
357 error = hibernate();
358 else
359 error = -EINVAL;
360
361 out:
362 pm_autosleep_unlock();
363 return error ? error : n; 211 return error ? error : n;
364} 212}
365 213
@@ -400,8 +248,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
400{ 248{
401 unsigned int val; 249 unsigned int val;
402 250
403 return pm_get_wakeup_count(&val, true) ? 251 return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
404 sprintf(buf, "%u\n", val) : -EINTR;
405} 252}
406 253
407static ssize_t wakeup_count_store(struct kobject *kobj, 254static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -409,106 +256,15 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
409 const char *buf, size_t n) 256 const char *buf, size_t n)
410{ 257{
411 unsigned int val; 258 unsigned int val;
412 int error;
413
414 error = pm_autosleep_lock();
415 if (error)
416 return error;
417 259
418 if (pm_autosleep_state() > PM_SUSPEND_ON) {
419 error = -EBUSY;
420 goto out;
421 }
422
423 error = -EINVAL;
424 if (sscanf(buf, "%u", &val) == 1) { 260 if (sscanf(buf, "%u", &val) == 1) {
425 if (pm_save_wakeup_count(val)) 261 if (pm_save_wakeup_count(val))
426 error = n; 262 return n;
427 } 263 }
428 264 return -EINVAL;
429 out:
430 pm_autosleep_unlock();
431 return error;
432} 265}
433 266
434power_attr(wakeup_count); 267power_attr(wakeup_count);
435
436#ifdef CONFIG_PM_AUTOSLEEP
437static ssize_t autosleep_show(struct kobject *kobj,
438 struct kobj_attribute *attr,
439 char *buf)
440{
441 suspend_state_t state = pm_autosleep_state();
442
443 if (state == PM_SUSPEND_ON)
444 return sprintf(buf, "off\n");
445
446#ifdef CONFIG_SUSPEND
447 if (state < PM_SUSPEND_MAX)
448 return sprintf(buf, "%s\n", valid_state(state) ?
449 pm_states[state] : "error");
450#endif
451#ifdef CONFIG_HIBERNATION
452 return sprintf(buf, "disk\n");
453#else
454 return sprintf(buf, "error");
455#endif
456}
457
458static ssize_t autosleep_store(struct kobject *kobj,
459 struct kobj_attribute *attr,
460 const char *buf, size_t n)
461{
462 suspend_state_t state = decode_state(buf, n);
463 int error;
464
465 if (state == PM_SUSPEND_ON
466 && strcmp(buf, "off") && strcmp(buf, "off\n"))
467 return -EINVAL;
468
469 error = pm_autosleep_set_state(state);
470 return error ? error : n;
471}
472
473power_attr(autosleep);
474#endif /* CONFIG_PM_AUTOSLEEP */
475
476#ifdef CONFIG_PM_WAKELOCKS
477static ssize_t wake_lock_show(struct kobject *kobj,
478 struct kobj_attribute *attr,
479 char *buf)
480{
481 return pm_show_wakelocks(buf, true);
482}
483
484static ssize_t wake_lock_store(struct kobject *kobj,
485 struct kobj_attribute *attr,
486 const char *buf, size_t n)
487{
488 int error = pm_wake_lock(buf);
489 return error ? error : n;
490}
491
492power_attr(wake_lock);
493
494static ssize_t wake_unlock_show(struct kobject *kobj,
495 struct kobj_attribute *attr,
496 char *buf)
497{
498 return pm_show_wakelocks(buf, false);
499}
500
501static ssize_t wake_unlock_store(struct kobject *kobj,
502 struct kobj_attribute *attr,
503 const char *buf, size_t n)
504{
505 int error = pm_wake_unlock(buf);
506 return error ? error : n;
507}
508
509power_attr(wake_unlock);
510
511#endif /* CONFIG_PM_WAKELOCKS */
512#endif /* CONFIG_PM_SLEEP */ 268#endif /* CONFIG_PM_SLEEP */
513 269
514#ifdef CONFIG_PM_TRACE 270#ifdef CONFIG_PM_TRACE
@@ -553,6 +309,11 @@ power_attr(pm_trace_dev_match);
553 309
554#endif /* CONFIG_PM_TRACE */ 310#endif /* CONFIG_PM_TRACE */
555 311
312#ifdef CONFIG_USER_WAKELOCK
313power_attr(wake_lock);
314power_attr(wake_unlock);
315#endif
316
556static struct attribute * g[] = { 317static struct attribute * g[] = {
557 &state_attr.attr, 318 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 319#ifdef CONFIG_PM_TRACE
@@ -562,18 +323,12 @@ static struct attribute * g[] = {
562#ifdef CONFIG_PM_SLEEP 323#ifdef CONFIG_PM_SLEEP
563 &pm_async_attr.attr, 324 &pm_async_attr.attr,
564 &wakeup_count_attr.attr, 325 &wakeup_count_attr.attr,
565#ifdef CONFIG_PM_AUTOSLEEP
566 &autosleep_attr.attr,
567#endif
568#ifdef CONFIG_PM_WAKELOCKS
569 &wake_lock_attr.attr,
570 &wake_unlock_attr.attr,
571#endif
572#ifdef CONFIG_PM_DEBUG 326#ifdef CONFIG_PM_DEBUG
573 &pm_test_attr.attr, 327 &pm_test_attr.attr,
574#endif 328#endif
575#ifdef CONFIG_PM_SLEEP_DEBUG 329#ifdef CONFIG_USER_WAKELOCK
576 &pm_print_times_attr.attr, 330 &wake_lock_attr.attr,
331 &wake_unlock_attr.attr,
577#endif 332#endif
578#endif 333#endif
579 NULL, 334 NULL,
@@ -607,11 +362,7 @@ static int __init pm_init(void)
607 power_kobj = kobject_create_and_add("power", NULL); 362 power_kobj = kobject_create_and_add("power", NULL);
608 if (!power_kobj) 363 if (!power_kobj)
609 return -ENOMEM; 364 return -ENOMEM;
610 error = sysfs_create_group(power_kobj, &attr_group); 365 return sysfs_create_group(power_kobj, &attr_group);
611 if (error)
612 return error;
613 pm_print_times_init();
614 return pm_autosleep_init();
615} 366}
616 367
617core_initcall(pm_init); 368core_initcall(pm_init);
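
wakeup_count_show()/wakeup_count_store(), kept above in their pre-autosleep form, still provide the race-free suspend handshake: a suspend daemon reads /sys/power/wakeup_count, writes the same value back, and only then writes the target state to /sys/power/state; the write-back fails if wakeup events arrived in between, telling the daemon to retry instead of suspending. A sketch of that sequence, using the standard interface and shown for illustration only:

/* Illustrative sketch only: race-free suspend via the wakeup_count handshake. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int read_file(const char *path, char *buf, size_t len)
{
	int fd = open(path, O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = read(fd, buf, len - 1);	/* may block until wakeup sources settle */
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	return 0;
}

static int write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	char count[32];

	if (read_file("/sys/power/wakeup_count", count, sizeof(count)))
		return 1;
	if (write_file("/sys/power/wakeup_count", count)) {
		fprintf(stderr, "wakeup events raced with us, retry later\n");
		return 1;
	}
	return write_file("/sys/power/state", "mem") ? 1 : 0;	/* enter suspend-to-RAM */
}
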
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7d4b7ffb3c1..b6b9006480f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,8 +50,6 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
55extern int hibernation_snapshot(int platform_mode); 53extern int hibernation_snapshot(int platform_mode);
56extern int hibernation_restore(int platform_mode); 54extern int hibernation_restore(int platform_mode);
57extern int hibernation_platform_enter(void); 55extern int hibernation_platform_enter(void);
@@ -148,7 +146,6 @@ extern int swsusp_swap_in_use(void);
148 */ 146 */
149#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
150#define SF_NOCOMPRESS_MODE 2 148#define SF_NOCOMPRESS_MODE 2
151#define SF_CRC32_MODE 4
152 149
153/* kernel/power/hibernate.c */ 150/* kernel/power/hibernate.c */
154extern int swsusp_check(void); 151extern int swsusp_check(void);
@@ -156,9 +153,6 @@ extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 153extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 154extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(fmode_t); 155extern void swsusp_close(fmode_t);
159#ifdef CONFIG_SUSPEND
160extern int swsusp_unmark(void);
161#endif
162 156
163/* kernel/power/block_io.c */ 157/* kernel/power/block_io.c */
164extern struct block_device *hib_resume_bdev; 158extern struct block_device *hib_resume_bdev;
@@ -180,11 +174,13 @@ extern const char *const pm_states[];
180 174
181extern bool valid_state(suspend_state_t state); 175extern bool valid_state(suspend_state_t state);
182extern int suspend_devices_and_enter(suspend_state_t state); 176extern int suspend_devices_and_enter(suspend_state_t state);
177extern int enter_state(suspend_state_t state);
183#else /* !CONFIG_SUSPEND */ 178#else /* !CONFIG_SUSPEND */
184static inline int suspend_devices_and_enter(suspend_state_t state) 179static inline int suspend_devices_and_enter(suspend_state_t state)
185{ 180{
186 return -ENOSYS; 181 return -ENOSYS;
187} 182}
183static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
188static inline bool valid_state(suspend_state_t state) { return false; } 184static inline bool valid_state(suspend_state_t state) { return false; }
189#endif /* !CONFIG_SUSPEND */ 185#endif /* !CONFIG_SUSPEND */
190 186
@@ -232,25 +228,7 @@ extern int pm_test_level;
232#ifdef CONFIG_SUSPEND_FREEZER 228#ifdef CONFIG_SUSPEND_FREEZER
233static inline int suspend_freeze_processes(void) 229static inline int suspend_freeze_processes(void)
234{ 230{
235 int error; 231 return freeze_processes();
236
237 error = freeze_processes();
238 /*
239 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error.
241 */
242 if (error)
243 return error;
244
245 error = freeze_kernel_threads();
246 /*
247 * freeze_kernel_threads() thaws only kernel threads upon freezing
248 * failure. So we have to thaw the userspace tasks ourselves.
249 */
250 if (error)
251 thaw_processes();
252
253 return error;
254} 232}
255 233
256static inline void suspend_thaw_processes(void) 234static inline void suspend_thaw_processes(void)
@@ -268,29 +246,26 @@ static inline void suspend_thaw_processes(void)
268} 246}
269#endif 247#endif
270 248
271#ifdef CONFIG_PM_AUTOSLEEP 249#ifdef CONFIG_WAKELOCK
272
273/* kernel/power/autosleep.c */
274extern int pm_autosleep_init(void);
275extern int pm_autosleep_lock(void);
276extern void pm_autosleep_unlock(void);
277extern suspend_state_t pm_autosleep_state(void);
278extern int pm_autosleep_set_state(suspend_state_t state);
279
280#else /* !CONFIG_PM_AUTOSLEEP */
281
282static inline int pm_autosleep_init(void) { return 0; }
283static inline int pm_autosleep_lock(void) { return 0; }
284static inline void pm_autosleep_unlock(void) {}
285static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
286
287#endif /* !CONFIG_PM_AUTOSLEEP */
288
289#ifdef CONFIG_PM_WAKELOCKS
290
291/* kernel/power/wakelock.c */ 250/* kernel/power/wakelock.c */
292extern ssize_t pm_show_wakelocks(char *buf, bool show_active); 251extern struct workqueue_struct *suspend_work_queue;
293extern int pm_wake_lock(const char *buf); 252extern struct wake_lock main_wake_lock;
294extern int pm_wake_unlock(const char *buf); 253extern suspend_state_t requested_suspend_state;
254#endif
295 255
296#endif /* !CONFIG_PM_WAKELOCKS */ 256#ifdef CONFIG_USER_WAKELOCK
257ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr,
258 char *buf);
259ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr,
260 const char *buf, size_t n);
261ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr,
262 char *buf);
263ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr,
264 const char *buf, size_t n);
265#endif
266
267#ifdef CONFIG_EARLYSUSPEND
268/* kernel/power/earlysuspend.c */
269void request_suspend_state(suspend_state_t state);
270suspend_state_t get_suspend_state(void);
271#endif
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc..d52359374e8 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int __init pm_sysrq_init(void) 40static int pm_sysrq_init(void)
41{ 41{
42 register_sysrq_key('o', &sysrq_poweroff_op); 42 register_sysrq_key('o', &sysrq_poweroff_op);
43 return 0; 43 return 0;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6..31338cdeafc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,14 +16,23 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h> 19#include <linux/wakelock.h>
20 20
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24#define TIMEOUT (20 * HZ)
25 25
26static int try_to_freeze_tasks(bool user_only) 26static inline int freezable(struct task_struct * p)
27{
28 if ((p == current) ||
29 (p->flags & PF_NOFREEZE) ||
30 (p->exit_state != 0))
31 return 0;
32 return 1;
33}
34
35static int try_to_freeze_tasks(bool sig_only)
27{ 36{
28 struct task_struct *g, *p; 37 struct task_struct *g, *p;
29 unsigned long end_time; 38 unsigned long end_time;
@@ -38,26 +47,46 @@ static int try_to_freeze_tasks(bool user_only)
38 47
39 end_time = jiffies + TIMEOUT; 48 end_time = jiffies + TIMEOUT;
40 49
41 if (!user_only) 50 if (!sig_only)
42 freeze_workqueues_begin(); 51 freeze_workqueues_begin();
43 52
44 while (true) { 53 while (true) {
45 todo = 0; 54 todo = 0;
46 read_lock(&tasklist_lock); 55 read_lock(&tasklist_lock);
47 do_each_thread(g, p) { 56 do_each_thread(g, p) {
48 if (p == current || !freeze_task(p)) 57 if (frozen(p) || !freezable(p))
58 continue;
59
60 if (!freeze_task(p, sig_only))
49 continue; 61 continue;
50 62
51 if (!freezer_should_skip(p)) 63 /*
64 * Now that we've done set_freeze_flag, don't
65 * perturb a task in TASK_STOPPED or TASK_TRACED.
66 * It is "frozen enough". If the task does wake
67 * up, it will immediately call try_to_freeze.
68 *
69 * Because freeze_task() goes through p's
70 * scheduler lock after setting TIF_FREEZE, it's
71 * guaranteed that either we see TASK_RUNNING or
72 * try_to_stop() after schedule() in ptrace/signal
73 * stop sees TIF_FREEZE.
74 */
75 if (!task_is_stopped_or_traced(p) &&
76 !freezer_should_skip(p))
52 todo++; 77 todo++;
53 } while_each_thread(g, p); 78 } while_each_thread(g, p);
54 read_unlock(&tasklist_lock); 79 read_unlock(&tasklist_lock);
55 80
56 if (!user_only) { 81 if (!sig_only) {
57 wq_busy = freeze_workqueues_busy(); 82 wq_busy = freeze_workqueues_busy();
58 todo += wq_busy; 83 todo += wq_busy;
59 } 84 }
60 85
86 if (todo && has_wake_lock(WAKE_LOCK_SUSPEND)) {
87 wakeup = 1;
88 break;
89 }
61 if (!todo || time_after(jiffies, end_time)) 90 if (!todo || time_after(jiffies, end_time))
62 break; 91 break;
63 92
@@ -68,7 +97,7 @@ static int try_to_freeze_tasks(bool user_only)
68 97
69 /* 98 /*
70 * We need to retry, but first give the freezing tasks some 99 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator. 100 * time to enter the refrigerator.
72 */ 101 */
73 msleep(10); 102 msleep(10);
74 } 103 }
@@ -79,22 +108,35 @@ static int try_to_freeze_tasks(bool user_only)
79 elapsed_csecs = elapsed_csecs64; 108 elapsed_csecs = elapsed_csecs64;
80 109
81 if (todo) { 110 if (todo) {
82 printk("\n"); 111 /* This does not unfreeze processes that are already frozen
83 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 112 * (we have slightly ugly calling convention in that respect,
84 "(%d tasks refusing to freeze, wq_busy=%d):\n", 113 * and caller must call thaw_processes() if something fails),
85 wakeup ? "aborted" : "failed", 114 * but it cleans up leftover PF_FREEZE requests.
86 elapsed_csecs / 100, elapsed_csecs % 100, 115 */
87 todo - wq_busy, wq_busy); 116 if(wakeup) {
88 117 printk("\n");
89 if (!wakeup) { 118 printk(KERN_ERR "Freezing of %s aborted\n",
90 read_lock(&tasklist_lock); 119 sig_only ? "user space " : "tasks ");
91 do_each_thread(g, p) {
92 if (p != current && !freezer_should_skip(p)
93 && freezing(p) && !frozen(p))
94 sched_show_task(p);
95 } while_each_thread(g, p);
96 read_unlock(&tasklist_lock);
97 } 120 }
121 else {
122 printk("\n");
123 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
124 "(%d tasks refusing to freeze, wq_busy=%d):\n",
125 elapsed_csecs / 100, elapsed_csecs % 100,
126 todo - wq_busy, wq_busy);
127 }
128 thaw_workqueues();
129
130 read_lock(&tasklist_lock);
131 do_each_thread(g, p) {
132 task_lock(p);
133 if (freezing(p) && !freezer_should_skip(p) &&
134 elapsed_csecs > 100)
135 sched_show_task(p);
136 cancel_freezing(p);
137 task_unlock(p);
138 } while_each_thread(g, p);
139 read_unlock(&tasklist_lock);
98 } else { 140 } else {
99 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 141 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
100 elapsed_csecs % 100); 142 elapsed_csecs % 100);
@@ -104,106 +146,61 @@ static int try_to_freeze_tasks(bool user_only)
104} 146}
105 147
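For context, try_to_freeze_tasks() only counts a task as pending if it is freezable and has not yet entered the refrigerator; a freezable kernel thread cooperates by polling try_to_freeze() in its main loop. A minimal sketch, not part of this patch (the thread body and names are illustrative):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/delay.h>

/* Minimal freezable kthread: parks itself in the refrigerator whenever
 * the freezer is active, so try_to_freeze_tasks() stops counting it. */
static int example_thread(void *data)
{
	set_freezable();		/* clear PF_NOFREEZE so freezable() accepts us */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters the refrigerator if freezing */
		msleep(100);		/* placeholder for real work */
	}
	return 0;
}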
106/** 148/**
107 * freeze_processes - Signal user space processes to enter the refrigerator. 149 * freeze_processes - tell processes to enter the refrigerator
108 *
109 * On success, returns 0. On failure, -errno and system is fully thawed.
110 */ 150 */
111int freeze_processes(void) 151int freeze_processes(void)
112{ 152{
113 int error; 153 int error;
114 154
115 error = __usermodehelper_disable(UMH_FREEZING);
116 if (error)
117 return error;
118
119 if (!pm_freezing)
120 atomic_inc(&system_freezing_cnt);
121
122 printk("Freezing user space processes ... "); 155 printk("Freezing user space processes ... ");
123 pm_freezing = true;
124 error = try_to_freeze_tasks(true); 156 error = try_to_freeze_tasks(true);
125 if (!error) {
126 printk("done.");
127 __usermodehelper_set_disable_depth(UMH_DISABLED);
128 oom_killer_disable();
129 }
130 printk("\n");
131 BUG_ON(in_atomic());
132
133 if (error) 157 if (error)
134 thaw_processes(); 158 goto Exit;
135 return error; 159 printk("done.\n");
136}
137
138/**
139 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
140 *
141 * On success, returns 0. On failure, -errno and only the kernel threads are
142 * thawed, so as to give a chance to the caller to do additional cleanups
143 * (if any) before thawing the userspace tasks. So, it is the responsibility
144 * of the caller to thaw the userspace tasks, when the time is right.
145 */
146int freeze_kernel_threads(void)
147{
148 int error;
149 160
150 printk("Freezing remaining freezable tasks ... "); 161 printk("Freezing remaining freezable tasks ... ");
151 pm_nosig_freezing = true;
152 error = try_to_freeze_tasks(false); 162 error = try_to_freeze_tasks(false);
153 if (!error) 163 if (error)
154 printk("done."); 164 goto Exit;
165 printk("done.");
155 166
156 printk("\n"); 167 oom_killer_disable();
168 Exit:
157 BUG_ON(in_atomic()); 169 BUG_ON(in_atomic());
170 printk("\n");
158 171
159 if (error)
160 thaw_kernel_threads();
161 return error; 172 return error;
162} 173}
163 174
164void thaw_processes(void) 175static void thaw_tasks(bool nosig_only)
165{ 176{
166 struct task_struct *g, *p; 177 struct task_struct *g, *p;
167 178
168 if (pm_freezing) 179 read_lock(&tasklist_lock);
169 atomic_dec(&system_freezing_cnt); 180 do_each_thread(g, p) {
170 pm_freezing = false; 181 if (!freezable(p))
171 pm_nosig_freezing = false; 182 continue;
172
173 oom_killer_enable();
174 183
175 printk("Restarting tasks ... "); 184 if (nosig_only && should_send_signal(p))
185 continue;
176 186
177 thaw_workqueues(); 187 if (cgroup_freezing_or_frozen(p))
188 continue;
178 189
179 read_lock(&tasklist_lock); 190 thaw_process(p);
180 do_each_thread(g, p) {
181 __thaw_task(p);
182 } while_each_thread(g, p); 191 } while_each_thread(g, p);
183 read_unlock(&tasklist_lock); 192 read_unlock(&tasklist_lock);
184
185 usermodehelper_enable();
186
187 schedule();
188 printk("done.\n");
189} 193}
190 194
191void thaw_kernel_threads(void) 195void thaw_processes(void)
192{ 196{
193 struct task_struct *g, *p; 197 oom_killer_enable();
194
195 pm_nosig_freezing = false;
196 printk("Restarting kernel threads ... ");
197 198
199 printk("Restarting tasks ... ");
198 thaw_workqueues(); 200 thaw_workqueues();
199 201 thaw_tasks(true);
200 read_lock(&tasklist_lock); 202 thaw_tasks(false);
201 do_each_thread(g, p) {
202 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
203 __thaw_task(p);
204 } while_each_thread(g, p);
205 read_unlock(&tasklist_lock);
206
207 schedule(); 203 schedule();
208 printk("done.\n"); 204 printk("done.\n");
209} 205}
206
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
deleted file mode 100644
index 9322ff7eaad..00000000000
--- a/kernel/power/qos.c
+++ /dev/null
@@ -1,602 +0,0 @@
1/*
2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of:
4 *
5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes
7 *
8 * This QoS design is best effort based. Dependents register their QoS needs.
9 * Watchers register to keep track of the current QoS needs of the system.
10 *
11 * There are 3 basic classes of QoS parameter: latency, timeout, throughput
 12 * each with defined units:
13 * latency: usec
14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec)
16 *
 17 * There are lists of pm_qos_objects, each one wrapping requests and notifiers
18 *
19 * User mode requests on a QOS parameter register themselves to the
 20 * subsystem by opening the device node /dev/... and writing their request to
 21 * the node. As long as the process holds a file handle open to the node, the
 22 * client continues to be accounted for. Upon file release the usermode
 23 * request is removed and a new qos target is computed. This way, when the
 24 * application closes the file or exits, its request is cleaned up and the
 25 * pm_qos_object gets an opportunity to recompute the target.
26 *
27 * Mark Gross <mgross@linux.intel.com>
28 */
29
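For reference (not part of the patch), a user-space client of this interface opens one of the misc device nodes defined below, e.g. /dev/cpu_dma_latency, and writes a binary s32 request; the constraint stays in force until the file descriptor is closed. A hedged sketch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t us = 100;	/* request CPU/DMA latency of at most 100 usec */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, &us, sizeof(us)) != sizeof(us)) {
		perror("write");
		close(fd);
		return 1;
	}
	pause();	/* the request is dropped when the fd is closed or we exit */
	close(fd);
	return 0;
}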
30/*#define DEBUG*/
31
32#include <linux/pm_qos.h>
33#include <linux/sched.h>
34#include <linux/spinlock.h>
35#include <linux/slab.h>
36#include <linux/time.h>
37#include <linux/fs.h>
38#include <linux/device.h>
39#include <linux/miscdevice.h>
40#include <linux/string.h>
41#include <linux/platform_device.h>
42#include <linux/init.h>
43#include <linux/kernel.h>
44
45#include <linux/uaccess.h>
46#include <linux/export.h>
47
48/*
49 * locking rule: all changes to constraints or notifiers lists
50 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
51 * held, taken with _irqsave. One lock to rule them all
52 */
53struct pm_qos_object {
54 struct pm_qos_constraints *constraints;
55 struct miscdevice pm_qos_power_miscdev;
56 char *name;
57};
58
59static DEFINE_SPINLOCK(pm_qos_lock);
60
61static struct pm_qos_object null_pm_qos;
62
63static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
64static struct pm_qos_constraints cpu_dma_constraints = {
65 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
66 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
67 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
68 .type = PM_QOS_MIN,
69 .notifiers = &cpu_dma_lat_notifier,
70};
71static struct pm_qos_object cpu_dma_pm_qos = {
72 .constraints = &cpu_dma_constraints,
73 .name = "cpu_dma_latency",
74};
75
76static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
77static struct pm_qos_constraints network_lat_constraints = {
78 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
79 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
80 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
81 .type = PM_QOS_MIN,
82 .notifiers = &network_lat_notifier,
83};
84static struct pm_qos_object network_lat_pm_qos = {
85 .constraints = &network_lat_constraints,
86 .name = "network_latency",
87};
88
89
90static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
91static struct pm_qos_constraints network_tput_constraints = {
92 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
93 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
94 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
95 .type = PM_QOS_MAX,
96 .notifiers = &network_throughput_notifier,
97};
98static struct pm_qos_object network_throughput_pm_qos = {
99 .constraints = &network_tput_constraints,
100 .name = "network_throughput",
101};
102
103
104static struct pm_qos_object *pm_qos_array[] = {
105 &null_pm_qos,
106 &cpu_dma_pm_qos,
107 &network_lat_pm_qos,
108 &network_throughput_pm_qos
109};
110
111static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
112 size_t count, loff_t *f_pos);
113static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
114 size_t count, loff_t *f_pos);
115static int pm_qos_power_open(struct inode *inode, struct file *filp);
116static int pm_qos_power_release(struct inode *inode, struct file *filp);
117
118static const struct file_operations pm_qos_power_fops = {
119 .write = pm_qos_power_write,
120 .read = pm_qos_power_read,
121 .open = pm_qos_power_open,
122 .release = pm_qos_power_release,
123 .llseek = noop_llseek,
124};
125
126/* unlocked internal variant */
127static inline int pm_qos_get_value(struct pm_qos_constraints *c)
128{
129 if (plist_head_empty(&c->list))
130 return c->default_value;
131
132 switch (c->type) {
133 case PM_QOS_MIN:
134 return plist_first(&c->list)->prio;
135
136 case PM_QOS_MAX:
137 return plist_last(&c->list)->prio;
138
139 default:
140 /* runtime check for not using enum */
141 BUG();
142 return PM_QOS_DEFAULT_VALUE;
143 }
144}
145
146s32 pm_qos_read_value(struct pm_qos_constraints *c)
147{
148 return c->target_value;
149}
150
151static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
152{
153 c->target_value = value;
154}
155
156/**
157 * pm_qos_update_target - manages the constraints list and calls the notifiers
158 * if needed
159 * @c: constraints data struct
160 * @node: request to add to the list, to update or to remove
161 * @action: action to take on the constraints list
162 * @value: value of the request to add or update
163 *
164 * This function returns 1 if the aggregated constraint value has changed, 0
165 * otherwise.
166 */
167int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
168 enum pm_qos_req_action action, int value)
169{
170 unsigned long flags;
171 int prev_value, curr_value, new_value;
172
173 spin_lock_irqsave(&pm_qos_lock, flags);
174 prev_value = pm_qos_get_value(c);
175 if (value == PM_QOS_DEFAULT_VALUE)
176 new_value = c->default_value;
177 else
178 new_value = value;
179
180 switch (action) {
181 case PM_QOS_REMOVE_REQ:
182 plist_del(node, &c->list);
183 break;
184 case PM_QOS_UPDATE_REQ:
185 /*
186 * to change the list, we atomically remove, reinit
187 * with new value and add, then see if the extremal
188 * changed
189 */
190 plist_del(node, &c->list);
191 case PM_QOS_ADD_REQ:
192 plist_node_init(node, new_value);
193 plist_add(node, &c->list);
194 break;
195 default:
196 /* no action */
197 ;
198 }
199
200 curr_value = pm_qos_get_value(c);
201 pm_qos_set_value(c, curr_value);
202
203 spin_unlock_irqrestore(&pm_qos_lock, flags);
204
205 if (prev_value != curr_value) {
206 blocking_notifier_call_chain(c->notifiers,
207 (unsigned long)curr_value,
208 NULL);
209 return 1;
210 } else {
211 return 0;
212 }
213}
214
215/**
216 * pm_qos_flags_remove_req - Remove device PM QoS flags request.
217 * @pqf: Device PM QoS flags set to remove the request from.
218 * @req: Request to remove from the set.
219 */
220static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
221 struct pm_qos_flags_request *req)
222{
223 s32 val = 0;
224
225 list_del(&req->node);
226 list_for_each_entry(req, &pqf->list, node)
227 val |= req->flags;
228
229 pqf->effective_flags = val;
230}
231
232/**
233 * pm_qos_update_flags - Update a set of PM QoS flags.
234 * @pqf: Set of flags to update.
235 * @req: Request to add to the set, to modify, or to remove from the set.
236 * @action: Action to take on the set.
237 * @val: Value of the request to add or modify.
238 *
239 * Update the given set of PM QoS flags and call notifiers if the aggregate
240 * value has changed. Returns 1 if the aggregate constraint value has changed,
241 * 0 otherwise.
242 */
243bool pm_qos_update_flags(struct pm_qos_flags *pqf,
244 struct pm_qos_flags_request *req,
245 enum pm_qos_req_action action, s32 val)
246{
247 unsigned long irqflags;
248 s32 prev_value, curr_value;
249
250 spin_lock_irqsave(&pm_qos_lock, irqflags);
251
252 prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
253
254 switch (action) {
255 case PM_QOS_REMOVE_REQ:
256 pm_qos_flags_remove_req(pqf, req);
257 break;
258 case PM_QOS_UPDATE_REQ:
259 pm_qos_flags_remove_req(pqf, req);
260 case PM_QOS_ADD_REQ:
261 req->flags = val;
262 INIT_LIST_HEAD(&req->node);
263 list_add_tail(&req->node, &pqf->list);
264 pqf->effective_flags |= val;
265 break;
266 default:
267 /* no action */
268 ;
269 }
270
271 curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
272
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274
275 return prev_value != curr_value;
276}
277
278/**
279 * pm_qos_request - returns current system wide qos expectation
280 * @pm_qos_class: identification of which qos value is requested
281 *
282 * This function returns the current target value.
283 */
284int pm_qos_request(int pm_qos_class)
285{
286 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
287}
288EXPORT_SYMBOL_GPL(pm_qos_request);
289
290int pm_qos_request_active(struct pm_qos_request *req)
291{
292 return req->pm_qos_class != 0;
293}
294EXPORT_SYMBOL_GPL(pm_qos_request_active);
295
296/**
297 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
298 * @work: work struct for the delayed work (timeout)
299 *
300 * This cancels the timeout request by falling back to the default at timeout.
301 */
302static void pm_qos_work_fn(struct work_struct *work)
303{
304 struct pm_qos_request *req = container_of(to_delayed_work(work),
305 struct pm_qos_request,
306 work);
307
308 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
309}
310
311/**
312 * pm_qos_add_request - inserts new qos request into the list
313 * @req: pointer to a preallocated handle
314 * @pm_qos_class: identifies which list of qos request to use
315 * @value: defines the qos request
316 *
317 * This function inserts a new entry in the pm_qos_class list of requested qos
318 * performance characteristics. It recomputes the aggregate QoS expectations
319 * for the pm_qos_class of parameters and initializes the pm_qos_request
320 * handle. Caller needs to save this handle for later use in updates and
321 * removal.
322 */
323
324void pm_qos_add_request(struct pm_qos_request *req,
325 int pm_qos_class, s32 value)
326{
327 if (!req) /*guard against callers passing in null */
328 return;
329
330 if (pm_qos_request_active(req)) {
331 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
332 return;
333 }
334 req->pm_qos_class = pm_qos_class;
335 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
336 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
337 &req->node, PM_QOS_ADD_REQ, value);
338}
339EXPORT_SYMBOL_GPL(pm_qos_add_request);
340
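For comparison (not part of the patch), a kernel caller embeds a struct pm_qos_request and drives it through the add/update/remove calls declared in <linux/pm_qos.h>; a minimal sketch with illustrative function names:

#include <linux/pm_qos.h>

static struct pm_qos_request example_req;	/* illustrative driver-owned handle */

static void example_start_lowlatency(void)
{
	/* Cap CPU/DMA latency at 50 usec while the device is busy. */
	pm_qos_add_request(&example_req, PM_QOS_CPU_DMA_LATENCY, 50);
}

static void example_relax_lowlatency(void)
{
	/* Fall back to the class default without dropping the handle. */
	pm_qos_update_request(&example_req, PM_QOS_DEFAULT_VALUE);
}

static void example_stop_lowlatency(void)
{
	/* Remove the request; the aggregate target is recomputed. */
	pm_qos_remove_request(&example_req);
}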
341/**
342 * pm_qos_update_request - modifies an existing qos request
343 * @req : handle to list element holding a pm_qos request to use
344 * @value: defines the qos request
345 *
346 * Updates an existing qos request for the pm_qos_class of parameters along
347 * with updating the target pm_qos_class value.
348 *
349 * Attempts are made to make this code callable on hot code paths.
350 */
351void pm_qos_update_request(struct pm_qos_request *req,
352 s32 new_value)
353{
354 if (!req) /*guard against callers passing in null */
355 return;
356
357 if (!pm_qos_request_active(req)) {
358 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
359 return;
360 }
361
362 if (delayed_work_pending(&req->work))
363 cancel_delayed_work_sync(&req->work);
364
365 if (new_value != req->node.prio)
366 pm_qos_update_target(
367 pm_qos_array[req->pm_qos_class]->constraints,
368 &req->node, PM_QOS_UPDATE_REQ, new_value);
369}
370EXPORT_SYMBOL_GPL(pm_qos_update_request);
371
372/**
373 * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
374 * @req : handle to list element holding a pm_qos request to use
 375 * @new_value: defines the temporary qos request
376 * @timeout_us: the effective duration of this qos request in usecs.
377 *
378 * After timeout_us, this qos request is cancelled automatically.
379 */
380void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
381 unsigned long timeout_us)
382{
383 if (!req)
384 return;
385 if (WARN(!pm_qos_request_active(req),
386 "%s called for unknown object.", __func__))
387 return;
388
389 if (delayed_work_pending(&req->work))
390 cancel_delayed_work_sync(&req->work);
391
392 if (new_value != req->node.prio)
393 pm_qos_update_target(
394 pm_qos_array[req->pm_qos_class]->constraints,
395 &req->node, PM_QOS_UPDATE_REQ, new_value);
396
397 schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
398}
399
400/**
401 * pm_qos_remove_request - modifies an existing qos request
402 * @req: handle to request list element
403 *
404 * Will remove pm qos request from the list of constraints and
405 * recompute the current target value for the pm_qos_class. Call this
406 * on slow code paths.
407 */
408void pm_qos_remove_request(struct pm_qos_request *req)
409{
410 if (!req) /*guard against callers passing in null */
411 return;
412 /* silent return to keep pcm code cleaner */
413
414 if (!pm_qos_request_active(req)) {
415 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
416 return;
417 }
418
419 if (delayed_work_pending(&req->work))
420 cancel_delayed_work_sync(&req->work);
421
422 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
423 &req->node, PM_QOS_REMOVE_REQ,
424 PM_QOS_DEFAULT_VALUE);
425 memset(req, 0, sizeof(*req));
426}
427EXPORT_SYMBOL_GPL(pm_qos_remove_request);
428
429/**
430 * pm_qos_add_notifier - sets notification entry for changes to target value
431 * @pm_qos_class: identifies which qos target changes should be notified.
432 * @notifier: notifier block managed by caller.
433 *
434 * will register the notifier into a notification chain that gets called
435 * upon changes to the pm_qos_class target value.
436 */
437int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
438{
439 int retval;
440
441 retval = blocking_notifier_chain_register(
442 pm_qos_array[pm_qos_class]->constraints->notifiers,
443 notifier);
444
445 return retval;
446}
447EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
448
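A watcher registers a notifier_block and is called with the new aggregate value, as pm_qos_update_target() does above; a hedged sketch (everything except the pm_qos_* and notifier APIs is illustrative):

#include <linux/pm_qos.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* Illustrative callback: invoked whenever the aggregate target changes. */
static int example_qos_notify(struct notifier_block *nb,
			      unsigned long new_target, void *data)
{
	pr_info("cpu_dma_latency target is now %lu usec\n", new_target);
	return NOTIFY_OK;
}

static struct notifier_block example_qos_nb = {
	.notifier_call = example_qos_notify,
};

static void example_watch_qos(void)
{
	pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY, &example_qos_nb);
}

static void example_unwatch_qos(void)
{
	pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY, &example_qos_nb);
}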
449/**
450 * pm_qos_remove_notifier - deletes notification entry from chain.
451 * @pm_qos_class: identifies which qos target changes are notified.
452 * @notifier: notifier block to be removed.
453 *
454 * will remove the notifier from the notification chain that gets called
455 * upon changes to the pm_qos_class target value.
456 */
457int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
458{
459 int retval;
460
461 retval = blocking_notifier_chain_unregister(
462 pm_qos_array[pm_qos_class]->constraints->notifiers,
463 notifier);
464
465 return retval;
466}
467EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
468
469/* User space interface to PM QoS classes via misc devices */
470static int register_pm_qos_misc(struct pm_qos_object *qos)
471{
472 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
473 qos->pm_qos_power_miscdev.name = qos->name;
474 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
475
476 return misc_register(&qos->pm_qos_power_miscdev);
477}
478
479static int find_pm_qos_object_by_minor(int minor)
480{
481 int pm_qos_class;
482
483 for (pm_qos_class = 0;
484 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
485 if (minor ==
486 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
487 return pm_qos_class;
488 }
489 return -1;
490}
491
492static int pm_qos_power_open(struct inode *inode, struct file *filp)
493{
494 long pm_qos_class;
495
496 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
497 if (pm_qos_class >= 0) {
498 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
499 if (!req)
500 return -ENOMEM;
501
502 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
503 filp->private_data = req;
504
505 return 0;
506 }
507 return -EPERM;
508}
509
510static int pm_qos_power_release(struct inode *inode, struct file *filp)
511{
512 struct pm_qos_request *req;
513
514 req = filp->private_data;
515 pm_qos_remove_request(req);
516 kfree(req);
517
518 return 0;
519}
520
521
522static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
523 size_t count, loff_t *f_pos)
524{
525 s32 value;
526 unsigned long flags;
527 struct pm_qos_request *req = filp->private_data;
528
529 if (!req)
530 return -EINVAL;
531 if (!pm_qos_request_active(req))
532 return -EINVAL;
533
534 spin_lock_irqsave(&pm_qos_lock, flags);
535 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
536 spin_unlock_irqrestore(&pm_qos_lock, flags);
537
538 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
539}
540
541static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
542 size_t count, loff_t *f_pos)
543{
544 s32 value;
545 struct pm_qos_request *req;
546
547 if (count == sizeof(s32)) {
548 if (copy_from_user(&value, buf, sizeof(s32)))
549 return -EFAULT;
550 } else if (count <= 11) { /* ASCII perhaps? */
551 char ascii_value[11];
552 unsigned long int ulval;
553 int ret;
554
555 if (copy_from_user(ascii_value, buf, count))
556 return -EFAULT;
557
558 if (count > 10) {
559 if (ascii_value[10] == '\n')
560 ascii_value[10] = '\0';
561 else
562 return -EINVAL;
563 } else {
564 ascii_value[count] = '\0';
565 }
566 ret = kstrtoul(ascii_value, 16, &ulval);
567 if (ret) {
568 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
569 return -EINVAL;
570 }
571 value = (s32)lower_32_bits(ulval);
572 } else {
573 return -EINVAL;
574 }
575
576 req = filp->private_data;
577 pm_qos_update_request(req, value);
578
579 return count;
580}
581
582
583static int __init pm_qos_power_init(void)
584{
585 int ret = 0;
586 int i;
587
588 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
589
590 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
591 ret = register_pm_qos_misc(pm_qos_array[i]);
592 if (ret < 0) {
593 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
594 pm_qos_array[i]->name);
595 return ret;
596 }
597 }
598
599 return ret;
600}
601
602late_initcall(pm_qos_power_init);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807..06efa54f93d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,10 +711,9 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", 714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n",
715 (unsigned long long) region->start_pfn << PAGE_SHIFT, 715 region->start_pfn << PAGE_SHIFT,
716 ((unsigned long long) region->end_pfn << PAGE_SHIFT) 716 region->end_pfn << PAGE_SHIFT);
717 - 1);
718 717
719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
720 if (pfn_valid(pfn)) { 719 if (pfn_valid(pfn)) {
@@ -813,8 +812,7 @@ unsigned int snapshot_additional_pages(struct zone *zone)
813 unsigned int res; 812 unsigned int res;
814 813
815 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
816 res += DIV_ROUND_UP(res * sizeof(struct bm_block), 815 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
817 LINKED_PAGE_DATA_SIZE);
818 return 2 * res; 816 return 2 * res;
819} 817}
820 818
@@ -860,9 +858,6 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
860 PageReserved(page)) 858 PageReserved(page))
861 return NULL; 859 return NULL;
862 860
863 if (page_is_guard(page))
864 return NULL;
865
866 return page; 861 return page;
867} 862}
868 863
@@ -925,9 +920,6 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
925 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 920 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
926 return NULL; 921 return NULL;
927 922
928 if (page_is_guard(page))
929 return NULL;
930
931 return page; 923 return page;
932} 924}
933 925
@@ -1001,20 +993,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1001 s_page = pfn_to_page(src_pfn); 993 s_page = pfn_to_page(src_pfn);
1002 d_page = pfn_to_page(dst_pfn); 994 d_page = pfn_to_page(dst_pfn);
1003 if (PageHighMem(s_page)) { 995 if (PageHighMem(s_page)) {
1004 src = kmap_atomic(s_page); 996 src = kmap_atomic(s_page, KM_USER0);
1005 dst = kmap_atomic(d_page); 997 dst = kmap_atomic(d_page, KM_USER1);
1006 do_copy_page(dst, src); 998 do_copy_page(dst, src);
1007 kunmap_atomic(dst); 999 kunmap_atomic(dst, KM_USER1);
1008 kunmap_atomic(src); 1000 kunmap_atomic(src, KM_USER0);
1009 } else { 1001 } else {
1010 if (PageHighMem(d_page)) { 1002 if (PageHighMem(d_page)) {
1011 /* Page pointed to by src may contain some kernel 1003 /* Page pointed to by src may contain some kernel
1012 * data modified by kmap_atomic() 1004 * data modified by kmap_atomic()
1013 */ 1005 */
1014 safe_copy_page(buffer, s_page); 1006 safe_copy_page(buffer, s_page);
1015 dst = kmap_atomic(d_page); 1007 dst = kmap_atomic(d_page, KM_USER0);
1016 copy_page(dst, buffer); 1008 copy_page(dst, buffer);
1017 kunmap_atomic(dst); 1009 kunmap_atomic(dst, KM_USER0);
1018 } else { 1010 } else {
1019 safe_copy_page(page_address(d_page), s_page); 1011 safe_copy_page(page_address(d_page), s_page);
1020 } 1012 }
@@ -1347,9 +1339,6 @@ int hibernate_preallocate_memory(void)
1347 count += highmem; 1339 count += highmem;
1348 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1349 1341
1350 /* Add number of pages required for page keys (s390 only). */
1351 size += page_key_additional_pages(saveable);
1352
1353 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1354 max_size = (count - (size + PAGES_FOR_IO)) / 2 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1355 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1673,8 +1662,6 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1673 buf[j] = memory_bm_next_pfn(bm); 1662 buf[j] = memory_bm_next_pfn(bm);
1674 if (unlikely(buf[j] == BM_END_OF_MAP)) 1663 if (unlikely(buf[j] == BM_END_OF_MAP))
1675 break; 1664 break;
1676 /* Save page key for data page (s390 only). */
1677 page_key_read(buf + j);
1678 } 1665 }
1679} 1666}
1680 1667
@@ -1729,9 +1716,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1729 */ 1716 */
1730 void *kaddr; 1717 void *kaddr;
1731 1718
1732 kaddr = kmap_atomic(page); 1719 kaddr = kmap_atomic(page, KM_USER0);
1733 copy_page(buffer, kaddr); 1720 copy_page(buffer, kaddr);
1734 kunmap_atomic(kaddr); 1721 kunmap_atomic(kaddr, KM_USER0);
1735 handle->buffer = buffer; 1722 handle->buffer = buffer;
1736 } else { 1723 } else {
1737 handle->buffer = page_address(page); 1724 handle->buffer = page_address(page);
@@ -1834,9 +1821,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1834 if (unlikely(buf[j] == BM_END_OF_MAP)) 1821 if (unlikely(buf[j] == BM_END_OF_MAP))
1835 break; 1822 break;
1836 1823
1837 /* Extract and buffer page key for data page (s390 only). */
1838 page_key_memorize(buf + j);
1839
1840 if (memory_bm_pfn_present(bm, buf[j])) 1824 if (memory_bm_pfn_present(bm, buf[j]))
1841 memory_bm_set_bit(bm, buf[j]); 1825 memory_bm_set_bit(bm, buf[j]);
1842 else 1826 else
@@ -2015,9 +1999,9 @@ static void copy_last_highmem_page(void)
2015 if (last_highmem_page) { 1999 if (last_highmem_page) {
2016 void *dst; 2000 void *dst;
2017 2001
2018 dst = kmap_atomic(last_highmem_page); 2002 dst = kmap_atomic(last_highmem_page, KM_USER0);
2019 copy_page(dst, buffer); 2003 copy_page(dst, buffer);
2020 kunmap_atomic(dst); 2004 kunmap_atomic(dst, KM_USER0);
2021 last_highmem_page = NULL; 2005 last_highmem_page = NULL;
2022 } 2006 }
2023} 2007}
@@ -2239,11 +2223,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
2239 if (error) 2223 if (error)
2240 return error; 2224 return error;
2241 2225
2242 /* Allocate buffer for page keys. */
2243 error = page_key_alloc(nr_copy_pages);
2244 if (error)
2245 return error;
2246
2247 } else if (handle->cur <= nr_meta_pages + 1) { 2226 } else if (handle->cur <= nr_meta_pages + 1) {
2248 error = unpack_orig_pfns(buffer, &copy_bm); 2227 error = unpack_orig_pfns(buffer, &copy_bm);
2249 if (error) 2228 if (error)
@@ -2264,8 +2243,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264 } 2243 }
2265 } else { 2244 } else {
2266 copy_last_highmem_page(); 2245 copy_last_highmem_page();
2267 /* Restore page key for data page (s390 only). */
2268 page_key_write(handle->buffer);
2269 handle->buffer = get_buffer(&orig_bm, &ca); 2246 handle->buffer = get_buffer(&orig_bm, &ca);
2270 if (IS_ERR(handle->buffer)) 2247 if (IS_ERR(handle->buffer))
2271 return PTR_ERR(handle->buffer); 2248 return PTR_ERR(handle->buffer);
@@ -2287,9 +2264,6 @@ int snapshot_write_next(struct snapshot_handle *handle)
2287void snapshot_write_finalize(struct snapshot_handle *handle) 2264void snapshot_write_finalize(struct snapshot_handle *handle)
2288{ 2265{
2289 copy_last_highmem_page(); 2266 copy_last_highmem_page();
2290 /* Restore page key for data page (s390 only). */
2291 page_key_write(handle->buffer);
2292 page_key_free();
2293 /* Free only if we have loaded the image entirely */ 2267 /* Free only if we have loaded the image entirely */
2294 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2295 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
@@ -2310,13 +2284,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2310{ 2284{
2311 void *kaddr1, *kaddr2; 2285 void *kaddr1, *kaddr2;
2312 2286
2313 kaddr1 = kmap_atomic(p1); 2287 kaddr1 = kmap_atomic(p1, KM_USER0);
2314 kaddr2 = kmap_atomic(p2); 2288 kaddr2 = kmap_atomic(p2, KM_USER1);
2315 copy_page(buf, kaddr1); 2289 copy_page(buf, kaddr1);
2316 copy_page(kaddr1, kaddr2); 2290 copy_page(kaddr1, kaddr2);
2317 copy_page(kaddr2, buf); 2291 copy_page(kaddr2, buf);
2318 kunmap_atomic(kaddr2); 2292 kunmap_atomic(kaddr2, KM_USER1);
2319 kunmap_atomic(kaddr1); 2293 kunmap_atomic(kaddr1, KM_USER0);
2320} 2294}
2321 2295
2322/** 2296/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27d..a6f6e3114a2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -21,15 +21,16 @@
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/export.h>
25#include <linux/suspend.h> 24#include <linux/suspend.h>
26#include <linux/syscore_ops.h> 25#include <linux/syscore_ops.h>
27#include <linux/ftrace.h>
28#include <trace/events/power.h> 26#include <trace/events/power.h>
29 27
30#include "power.h" 28#include "power.h"
31 29
32const char *const pm_states[PM_SUSPEND_MAX] = { 30const char *const pm_states[PM_SUSPEND_MAX] = {
31#ifdef CONFIG_EARLYSUSPEND
32 [PM_SUSPEND_ON] = "on",
33#endif
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
@@ -37,14 +38,14 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
39/** 40/**
40 * suspend_set_ops - Set the global suspend method table. 41 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 42 * @ops: Pointer to ops structure.
42 */ 43 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 44void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 45{
45 lock_system_sleep(); 46 mutex_lock(&pm_mutex);
46 suspend_ops = ops; 47 suspend_ops = ops;
47 unlock_system_sleep(); 48 mutex_unlock(&pm_mutex);
48} 49}
49EXPORT_SYMBOL_GPL(suspend_set_ops); 50EXPORT_SYMBOL_GPL(suspend_set_ops);
50 51
@@ -58,11 +59,11 @@ bool valid_state(suspend_state_t state)
58} 59}
59 60
60/** 61/**
61 * suspend_valid_only_mem - Generic memory-only valid callback. 62 * suspend_valid_only_mem - generic memory-only valid callback
62 * 63 *
63 * Platform drivers that implement mem suspend only and only need to check for 64 * Platform drivers that implement mem suspend only and only need
64 * that in their .valid() callback can use this instead of rolling their own 65 * to check for that in their .valid callback can use this instead
65 * .valid() callback. 66 * of rolling their own .valid callback.
66 */ 67 */
67int suspend_valid_only_mem(suspend_state_t state) 68int suspend_valid_only_mem(suspend_state_t state)
68{ 69{
@@ -83,11 +84,10 @@ static int suspend_test(int level)
83} 84}
84 85
85/** 86/**
86 * suspend_prepare - Prepare for entering system sleep state. 87 * suspend_prepare - Do prep work before entering low-power state.
87 * 88 *
88 * Common code run for every system sleep state that can be entered (except for 89 * This is common code that is called for each state that we're entering.
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 90 * Run suspend notifiers, allocate a console and stop all processes.
90 * freeze processes.
91 */ 91 */
92static int suspend_prepare(void) 92static int suspend_prepare(void)
93{ 93{
@@ -102,12 +102,16 @@ static int suspend_prepare(void)
102 if (error) 102 if (error)
103 goto Finish; 103 goto Finish;
104 104
105 error = usermodehelper_disable();
106 if (error)
107 goto Finish;
108
105 error = suspend_freeze_processes(); 109 error = suspend_freeze_processes();
106 if (!error) 110 if (!error)
107 return 0; 111 return 0;
108 112
109 suspend_stats.failed_freeze++; 113 suspend_thaw_processes();
110 dpm_save_failed_step(SUSPEND_FREEZE); 114 usermodehelper_enable();
111 Finish: 115 Finish:
112 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
113 pm_restore_console(); 117 pm_restore_console();
@@ -127,9 +131,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
127} 131}
128 132
129/** 133/**
130 * suspend_enter - Make the system enter the given sleep state. 134 * suspend_enter - enter the desired system sleep state.
131 * @state: System sleep state to enter. 135 * @state: State to enter
132 * @wakeup: Returns information that the sleep state should not be re-entered. 136 * @wakeup: Returns information that suspend should not be entered again.
133 * 137 *
134 * This function should be called after devices have been suspended. 138 * This function should be called after devices have been suspended.
135 */ 139 */
@@ -143,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
143 goto Platform_finish; 147 goto Platform_finish;
144 } 148 }
145 149
146 error = dpm_suspend_end(PMSG_SUSPEND); 150 error = dpm_suspend_noirq(PMSG_SUSPEND);
147 if (error) { 151 if (error) {
148 printk(KERN_ERR "PM: Some devices failed to power down\n"); 152 printk(KERN_ERR "PM: Some devices failed to power down\n");
149 goto Platform_finish; 153 goto Platform_finish;
@@ -185,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
185 if (suspend_ops->wake) 189 if (suspend_ops->wake)
186 suspend_ops->wake(); 190 suspend_ops->wake();
187 191
188 dpm_resume_start(PMSG_RESUME); 192 dpm_resume_noirq(PMSG_RESUME);
189 193
190 Platform_finish: 194 Platform_finish:
191 if (suspend_ops->finish) 195 if (suspend_ops->finish)
@@ -195,8 +199,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
195} 199}
196 200
197/** 201/**
198 * suspend_devices_and_enter - Suspend devices and enter system sleep state. 202 * suspend_devices_and_enter - suspend devices and enter the desired system
199 * @state: System sleep state to enter. 203 * sleep state.
204 * @state: state to enter
200 */ 205 */
201int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
202{ 207{
@@ -213,7 +218,6 @@ int suspend_devices_and_enter(suspend_state_t state)
213 goto Close; 218 goto Close;
214 } 219 }
215 suspend_console(); 220 suspend_console();
216 ftrace_stop();
217 suspend_test_start(); 221 suspend_test_start();
218 error = dpm_suspend_start(PMSG_SUSPEND); 222 error = dpm_suspend_start(PMSG_SUSPEND);
219 if (error) { 223 if (error) {
@@ -233,7 +237,6 @@ int suspend_devices_and_enter(suspend_state_t state)
233 suspend_test_start(); 237 suspend_test_start();
234 dpm_resume_end(PMSG_RESUME); 238 dpm_resume_end(PMSG_RESUME);
235 suspend_test_finish("resume devices"); 239 suspend_test_finish("resume devices");
236 ftrace_start();
237 resume_console(); 240 resume_console();
238 Close: 241 Close:
239 if (suspend_ops->end) 242 if (suspend_ops->end)
@@ -248,27 +251,30 @@ int suspend_devices_and_enter(suspend_state_t state)
248} 251}
249 252
250/** 253/**
251 * suspend_finish - Clean up before finishing the suspend sequence. 254 * suspend_finish - Do final work before exiting suspend sequence.
252 * 255 *
253 * Call platform code to clean up, restart processes, and free the console that 256 * Call platform code to clean up, restart processes, and free the
254 * we've allocated. This routine is not called for hibernation. 257 * console that we've allocated. This is not called for suspend-to-disk.
255 */ 258 */
256static void suspend_finish(void) 259static void suspend_finish(void)
257{ 260{
258 suspend_thaw_processes(); 261 suspend_thaw_processes();
262 usermodehelper_enable();
259 pm_notifier_call_chain(PM_POST_SUSPEND); 263 pm_notifier_call_chain(PM_POST_SUSPEND);
260 pm_restore_console(); 264 pm_restore_console();
261} 265}
262 266
263/** 267/**
264 * enter_state - Do common work needed to enter system sleep state. 268 * enter_state - Do common work of entering low-power state.
265 * @state: System sleep state to enter. 269 * @state: pm_state structure for state we're entering.
266 * 270 *
267 * Make sure that no one else is trying to put the system into a sleep state. 271 * Make sure we're the only ones trying to enter a sleep state. Fail
 268 * Fail if that's not the case. Otherwise, prepare for system suspend, make the 272 * if someone has beaten us to it, since we don't want anything weird to
269 * system enter the given sleep state and clean up after wakeup. 273 * happen when we wake up.
 274 * Then, do the setup for suspend, enter the state, and clean up (after
275 * we've woken up).
270 */ 276 */
271static int enter_state(suspend_state_t state) 277int enter_state(suspend_state_t state)
272{ 278{
273 int error; 279 int error;
274 280
@@ -304,26 +310,16 @@ static int enter_state(suspend_state_t state)
304} 310}
305 311
306/** 312/**
307 * pm_suspend - Externally visible function for suspending the system. 313 * pm_suspend - Externally visible function for suspending system.
308 * @state: System sleep state to enter. 314 * @state: Enumerated value of state to enter.
309 * 315 *
310 * Check if the value of @state represents one of the supported states, 316 * Determine whether or not value is within range, get state
311 * execute enter_state() and update system suspend statistics. 317 * structure, and enter (above).
312 */ 318 */
313int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
314{ 320{
315 int error; 321 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
316 322 return enter_state(state);
317 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) 323 return -EINVAL;
318 return -EINVAL;
319
320 error = enter_state(state);
321 if (error) {
322 suspend_stats.fail++;
323 dpm_save_failed_errno(error);
324 } else {
325 suspend_stats.success++;
326 }
327 return error;
328} 324}
329EXPORT_SYMBOL(pm_suspend); 325EXPORT_SYMBOL(pm_suspend);
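As a usage note (not part of the patch), pm_suspend() is exported, so platform code can request suspend-to-RAM directly with one of the suspend_state_t values checked above; a minimal hedged sketch:

#include <linux/suspend.h>
#include <linux/printk.h>

/* Illustrative helper: ask the PM core for suspend-to-RAM and report errors. */
static int example_enter_mem_sleep(void)
{
	int error = pm_suspend(PM_SUSPEND_MEM);

	if (error)
		pr_err("example: suspend to mem failed: %d\n", error);
	return error;
}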
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c33ed20041..7c97c3a0eee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
10 * 10 *
11 * This file is released under the GPLv2. 11 * This file is released under the GPLv2.
12 * 12 *
@@ -18,6 +18,7 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
21#include <linux/bio.h> 22#include <linux/bio.h>
22#include <linux/blkdev.h> 23#include <linux/blkdev.h>
23#include <linux/swap.h> 24#include <linux/swap.h>
@@ -26,10 +27,6 @@
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/lzo.h> 28#include <linux/lzo.h>
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/cpumask.h>
30#include <linux/atomic.h>
31#include <linux/kthread.h>
32#include <linux/crc32.h>
33 30
34#include "power.h" 31#include "power.h"
35 32
@@ -46,38 +43,17 @@
46 * allocated and populated one at a time, so we only need one memory 43 * allocated and populated one at a time, so we only need one memory
47 * page to set up the entire structure. 44 * page to set up the entire structure.
48 * 45 *
49 * During resume we pick up all swap_map_page structures into a list. 46 * During resume we also only need to use one swap_map_page structure
47 * at a time.
50 */ 48 */
51 49
52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
53 51
54/*
55 * Number of free pages that are not high.
56 */
57static inline unsigned long low_free_pages(void)
58{
59 return nr_free_pages() - nr_free_highpages();
60}
61
62/*
63 * Number of pages required to be kept free while writing the image. Always
64 * half of all available low pages before the writing starts.
65 */
66static inline unsigned long reqd_free_pages(void)
67{
68 return low_free_pages() / 2;
69}
70
71struct swap_map_page { 52struct swap_map_page {
72 sector_t entries[MAP_PAGE_ENTRIES]; 53 sector_t entries[MAP_PAGE_ENTRIES];
73 sector_t next_swap; 54 sector_t next_swap;
74}; 55};
75 56
76struct swap_map_page_list {
77 struct swap_map_page *map;
78 struct swap_map_page_list *next;
79};
80
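Conceptually, the swap map is a singly linked chain of sector arrays: resume reads a map page, consumes its entries, then follows next_swap. Below is a pseudocode-level sketch, not the kernel's actual resume path; read_map_page() and handle_data_sector() are hypothetical callbacks standing in for the real block-I/O and page-restore steps.

/* Hypothetical walk over nr_sectors data sectors recorded in the chain. */
static int example_walk_swap_map(sector_t first_map, unsigned long nr_sectors,
				 int (*read_map_page)(sector_t, struct swap_map_page *),
				 int (*handle_data_sector)(sector_t))
{
	struct swap_map_page map;
	sector_t next = first_map;
	unsigned int k = MAP_PAGE_ENTRIES;	/* force a read on the first pass */
	int error;

	while (nr_sectors--) {
		if (k >= MAP_PAGE_ENTRIES) {	/* current map page exhausted */
			error = read_map_page(next, &map);
			if (error)
				return error;
			next = map.next_swap;
			k = 0;
		}
		error = handle_data_sector(map.entries[k++]);
		if (error)
			return error;
	}
	return 0;
}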
81/** 57/**
82 * The swap_map_handle structure is used for handling swap in 58 * The swap_map_handle structure is used for handling swap in
83 * a file-alike way 59 * a file-alike way
@@ -85,18 +61,13 @@ struct swap_map_page_list {
85 61
86struct swap_map_handle { 62struct swap_map_handle {
87 struct swap_map_page *cur; 63 struct swap_map_page *cur;
88 struct swap_map_page_list *maps;
89 sector_t cur_swap; 64 sector_t cur_swap;
90 sector_t first_sector; 65 sector_t first_sector;
91 unsigned int k; 66 unsigned int k;
92 unsigned long reqd_free_pages;
93 u32 crc32;
94}; 67};
95 68
96struct swsusp_header { 69struct swsusp_header {
97 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - 70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
98 sizeof(u32)];
99 u32 crc32;
100 sector_t image; 71 sector_t image;
101 unsigned int flags; /* Flags to pass to the "boot" kernel */ 72 unsigned int flags; /* Flags to pass to the "boot" kernel */
102 char orig_sig[10]; 73 char orig_sig[10];
@@ -126,7 +97,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
126 97
127 /* Figure out where to put the new node */ 98 /* Figure out where to put the new node */
128 while (*new) { 99 while (*new) {
129 ext = rb_entry(*new, struct swsusp_extent, node); 100 ext = container_of(*new, struct swsusp_extent, node);
130 parent = *new; 101 parent = *new;
131 if (swap_offset < ext->start) { 102 if (swap_offset < ext->start) {
132 /* Try to merge */ 103 /* Try to merge */
@@ -228,8 +199,6 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
228 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
229 swsusp_header->image = handle->first_sector; 200 swsusp_header->image = handle->first_sector;
230 swsusp_header->flags = flags; 201 swsusp_header->flags = flags;
231 if (flags & SF_CRC32_MODE)
232 swsusp_header->crc32 = handle->crc32;
233 error = hib_bio_write_page(swsusp_resume_block, 202 error = hib_bio_write_page(swsusp_resume_block,
234 swsusp_header, NULL); 203 swsusp_header, NULL);
235 } else { 204 } else {
@@ -276,30 +245,18 @@ static int swsusp_swap_check(void)
276static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 245static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
277{ 246{
278 void *src; 247 void *src;
279 int ret;
280 248
281 if (!offset) 249 if (!offset)
282 return -ENOSPC; 250 return -ENOSPC;
283 251
284 if (bio_chain) { 252 if (bio_chain) {
285 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | 253 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
286 __GFP_NORETRY);
287 if (src) { 254 if (src) {
288 copy_page(src, buf); 255 copy_page(src, buf);
289 } else { 256 } else {
290 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ 257 WARN_ON_ONCE(1);
291 if (ret) 258 bio_chain = NULL; /* Go synchronous */
292 return ret; 259 src = buf;
293 src = (void *)__get_free_page(__GFP_WAIT |
294 __GFP_NOWARN |
295 __GFP_NORETRY);
296 if (src) {
297 copy_page(src, buf);
298 } else {
299 WARN_ON_ONCE(1);
300 bio_chain = NULL; /* Go synchronous */
301 src = buf;
302 }
303 } 260 }
304 } else { 261 } else {
305 src = buf; 262 src = buf;
@@ -336,7 +293,6 @@ static int get_swap_writer(struct swap_map_handle *handle)
336 goto err_rel; 293 goto err_rel;
337 } 294 }
338 handle->k = 0; 295 handle->k = 0;
339 handle->reqd_free_pages = reqd_free_pages();
340 handle->first_sector = handle->cur_swap; 296 handle->first_sector = handle->cur_swap;
341 return 0; 297 return 0;
342err_rel: 298err_rel:
@@ -360,27 +316,19 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
360 return error; 316 return error;
361 handle->cur->entries[handle->k++] = offset; 317 handle->cur->entries[handle->k++] = offset;
362 if (handle->k >= MAP_PAGE_ENTRIES) { 318 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
363 offset = alloc_swapdev_block(root_swap); 322 offset = alloc_swapdev_block(root_swap);
364 if (!offset) 323 if (!offset)
365 return -ENOSPC; 324 return -ENOSPC;
366 handle->cur->next_swap = offset; 325 handle->cur->next_swap = offset;
367 error = write_page(handle->cur, handle->cur_swap, bio_chain); 326 error = write_page(handle->cur, handle->cur_swap, NULL);
368 if (error) 327 if (error)
369 goto out; 328 goto out;
370 clear_page(handle->cur); 329 clear_page(handle->cur);
371 handle->cur_swap = offset; 330 handle->cur_swap = offset;
372 handle->k = 0; 331 handle->k = 0;
373
374 if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
375 error = hib_wait_on_bio_chain(bio_chain);
376 if (error)
377 goto out;
378 /*
379 * Recalculate the number of required free pages, to
380 * make sure we never take more than half.
381 */
382 handle->reqd_free_pages = reqd_free_pages();
383 }
384 } 332 }
385 out: 333 out:
386 return error; 334 return error;
@@ -424,14 +372,6 @@ static int swap_writer_finish(struct swap_map_handle *handle,
424 LZO_HEADER, PAGE_SIZE) 372 LZO_HEADER, PAGE_SIZE)
425#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
426 374
427/* Maximum number of threads for compression/decompression. */
428#define LZO_THREADS 3
429
430/* Minimum/maximum number of pages for read buffering. */
431#define LZO_MIN_RD_PAGES 1024
432#define LZO_MAX_RD_PAGES 8192
433
434
435/** 375/**
436 * save_image - save the suspend image data 376 * save_image - save the suspend image data
437 */ 377 */
@@ -448,9 +388,9 @@ static int save_image(struct swap_map_handle *handle,
448 struct timeval start; 388 struct timeval start;
449 struct timeval stop; 389 struct timeval stop;
450 390
451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", 391 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
452 nr_to_write); 392 nr_to_write);
453 m = nr_to_write / 10; 393 m = nr_to_write / 100;
454 if (!m) 394 if (!m)
455 m = 1; 395 m = 1;
456 nr_pages = 0; 396 nr_pages = 0;
@@ -464,8 +404,7 @@ static int save_image(struct swap_map_handle *handle,
464 if (ret) 404 if (ret)
465 break; 405 break;
466 if (!(nr_pages % m)) 406 if (!(nr_pages % m))
467 printk(KERN_INFO "PM: Image saving progress: %3d%%\n", 407 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
468 nr_pages / m * 10);
469 nr_pages++; 408 nr_pages++;
470 } 409 }
471 err2 = hib_wait_on_bio_chain(&bio); 410 err2 = hib_wait_on_bio_chain(&bio);
@@ -473,97 +412,13 @@ static int save_image(struct swap_map_handle *handle,
473 if (!ret) 412 if (!ret)
474 ret = err2; 413 ret = err2;
475 if (!ret) 414 if (!ret)
476 printk(KERN_INFO "PM: Image saving done.\n"); 415 printk(KERN_CONT "\b\b\b\bdone\n");
416 else
417 printk(KERN_CONT "\n");
477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 418 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
478 return ret; 419 return ret;
479} 420}
480 421
481/**
482 * Structure used for CRC32.
483 */
484struct crc_data {
485 struct task_struct *thr; /* thread */
486 atomic_t ready; /* ready to start flag */
487 atomic_t stop; /* ready to stop flag */
488 unsigned run_threads; /* nr current threads */
489 wait_queue_head_t go; /* start crc update */
490 wait_queue_head_t done; /* crc update done */
491 u32 *crc32; /* points to handle's crc32 */
492 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
493 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
494};
495
496/**
497 * CRC32 update function that runs in its own thread.
498 */
499static int crc32_threadfn(void *data)
500{
501 struct crc_data *d = data;
502 unsigned i;
503
504 while (1) {
505 wait_event(d->go, atomic_read(&d->ready) ||
506 kthread_should_stop());
507 if (kthread_should_stop()) {
508 d->thr = NULL;
509 atomic_set(&d->stop, 1);
510 wake_up(&d->done);
511 break;
512 }
513 atomic_set(&d->ready, 0);
514
515 for (i = 0; i < d->run_threads; i++)
516 *d->crc32 = crc32_le(*d->crc32,
517 d->unc[i], *d->unc_len[i]);
518 atomic_set(&d->stop, 1);
519 wake_up(&d->done);
520 }
521 return 0;
522}
523/**
524 * Structure used for LZO data compression.
525 */
526struct cmp_data {
527 struct task_struct *thr; /* thread */
528 atomic_t ready; /* ready to start flag */
529 atomic_t stop; /* ready to stop flag */
530 int ret; /* return code */
531 wait_queue_head_t go; /* start compression */
532 wait_queue_head_t done; /* compression done */
533 size_t unc_len; /* uncompressed length */
534 size_t cmp_len; /* compressed length */
535 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
536 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
537 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
538};
539
540/**
541 * Compression function that runs in its own thread.
542 */
543static int lzo_compress_threadfn(void *data)
544{
545 struct cmp_data *d = data;
546
547 while (1) {
548 wait_event(d->go, atomic_read(&d->ready) ||
549 kthread_should_stop());
550 if (kthread_should_stop()) {
551 d->thr = NULL;
552 d->ret = -1;
553 atomic_set(&d->stop, 1);
554 wake_up(&d->done);
555 break;
556 }
557 atomic_set(&d->ready, 0);
558
559 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
560 d->cmp + LZO_HEADER, &d->cmp_len,
561 d->wrk);
562 atomic_set(&d->stop, 1);
563 wake_up(&d->done);
564 }
565 return 0;
566}
567 422
568/** 423/**
569 * save_image_lzo - Save the suspend image data compressed with LZO. 424 * save_image_lzo - Save the suspend image data compressed with LZO.
@@ -582,179 +437,98 @@ static int save_image_lzo(struct swap_map_handle *handle,
582 struct bio *bio; 437 struct bio *bio;
583 struct timeval start; 438 struct timeval start;
584 struct timeval stop; 439 struct timeval stop;
585 size_t off; 440 size_t off, unc_len, cmp_len;
586 unsigned thr, run_threads, nr_threads; 441 unsigned char *unc, *cmp, *wrk, *page;
587 unsigned char *page = NULL;
588 struct cmp_data *data = NULL;
589 struct crc_data *crc = NULL;
590
591 /*
592 * We'll limit the number of threads for compression to limit memory
593 * footprint.
594 */
595 nr_threads = num_online_cpus() - 1;
596 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
597 442
598 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
599 if (!page) { 444 if (!page) {
600 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 445 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
601 ret = -ENOMEM; 446 return -ENOMEM;
602 goto out_clean;
603 }
604
605 data = vmalloc(sizeof(*data) * nr_threads);
606 if (!data) {
607 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
608 ret = -ENOMEM;
609 goto out_clean;
610 } 447 }
611 for (thr = 0; thr < nr_threads; thr++)
612 memset(&data[thr], 0, offsetof(struct cmp_data, go));
613 448
614 crc = kmalloc(sizeof(*crc), GFP_KERNEL); 449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
615 if (!crc) { 450 if (!wrk) {
616 printk(KERN_ERR "PM: Failed to allocate crc\n"); 451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
617 ret = -ENOMEM; 452 free_page((unsigned long)page);
618 goto out_clean; 453 return -ENOMEM;
619 }
620 memset(crc, 0, offsetof(struct crc_data, go));
621
622 /*
623 * Start the compression threads.
624 */
625 for (thr = 0; thr < nr_threads; thr++) {
626 init_waitqueue_head(&data[thr].go);
627 init_waitqueue_head(&data[thr].done);
628
629 data[thr].thr = kthread_run(lzo_compress_threadfn,
630 &data[thr],
631 "image_compress/%u", thr);
632 if (IS_ERR(data[thr].thr)) {
633 data[thr].thr = NULL;
634 printk(KERN_ERR
635 "PM: Cannot start compression threads\n");
636 ret = -ENOMEM;
637 goto out_clean;
638 }
639 } 454 }
640 455
641 /* 456 unc = vmalloc(LZO_UNC_SIZE);
642 * Start the CRC32 thread. 457 if (!unc) {
643 */ 458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
644 init_waitqueue_head(&crc->go); 459 vfree(wrk);
645 init_waitqueue_head(&crc->done); 460 free_page((unsigned long)page);
646 461 return -ENOMEM;
647 handle->crc32 = 0;
648 crc->crc32 = &handle->crc32;
649 for (thr = 0; thr < nr_threads; thr++) {
650 crc->unc[thr] = data[thr].unc;
651 crc->unc_len[thr] = &data[thr].unc_len;
652 } 462 }
653 463
654 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); 464 cmp = vmalloc(LZO_CMP_SIZE);
655 if (IS_ERR(crc->thr)) { 465 if (!cmp) {
656 crc->thr = NULL; 466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
657 printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); 467 vfree(unc);
658 ret = -ENOMEM; 468 vfree(wrk);
659 goto out_clean; 469 free_page((unsigned long)page);
470 return -ENOMEM;
660 } 471 }
661 472
662 /*
663 * Adjust the number of required free pages after all allocations have
664 * been done. We don't want to run out of pages when writing.
665 */
666 handle->reqd_free_pages = reqd_free_pages();
667
668 printk(KERN_INFO 473 printk(KERN_INFO
669 "PM: Using %u thread(s) for compression.\n" 474 "PM: Compressing and saving image data (%u pages) ... ",
670 "PM: Compressing and saving image data (%u pages)...\n", 475 nr_to_write);
671 nr_threads, nr_to_write); 476 m = nr_to_write / 100;
672 m = nr_to_write / 10;
673 if (!m) 477 if (!m)
674 m = 1; 478 m = 1;
675 nr_pages = 0; 479 nr_pages = 0;
676 bio = NULL; 480 bio = NULL;
677 do_gettimeofday(&start); 481 do_gettimeofday(&start);
678 for (;;) { 482 for (;;) {
679 for (thr = 0; thr < nr_threads; thr++) { 483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
680 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 484 ret = snapshot_read_next(snapshot);
681 ret = snapshot_read_next(snapshot); 485 if (ret < 0)
682 if (ret < 0) 486 goto out_finish;
683 goto out_finish; 487
684 488 if (!ret)
685 if (!ret)
686 break;
687
688 memcpy(data[thr].unc + off,
689 data_of(*snapshot), PAGE_SIZE);
690
691 if (!(nr_pages % m))
692 printk(KERN_INFO
693 "PM: Image saving progress: "
694 "%3d%%\n",
695 nr_pages / m * 10);
696 nr_pages++;
697 }
698 if (!off)
699 break; 489 break;
700 490
701 data[thr].unc_len = off; 491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
702 492
703 atomic_set(&data[thr].ready, 1); 493 if (!(nr_pages % m))
704 wake_up(&data[thr].go); 494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
495 nr_pages++;
705 } 496 }
706 497
707 if (!thr) 498 if (!off)
708 break; 499 break;
709 500
710 crc->run_threads = thr; 501 unc_len = off;
711 atomic_set(&crc->ready, 1); 502 ret = lzo1x_1_compress(unc, unc_len,
712 wake_up(&crc->go); 503 cmp + LZO_HEADER, &cmp_len, wrk);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
713 508
714 for (run_threads = thr, thr = 0; thr < run_threads; thr++) { 509 if (unlikely(!cmp_len ||
715 wait_event(data[thr].done, 510 cmp_len > lzo1x_worst_compress(unc_len))) {
716 atomic_read(&data[thr].stop)); 511 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
717 atomic_set(&data[thr].stop, 0); 512 ret = -1;
513 break;
514 }
718 515
719 ret = data[thr].ret; 516 *(size_t *)cmp = cmp_len;
720 517
721 if (ret < 0) { 518 /*
722 printk(KERN_ERR "PM: LZO compression failed\n"); 519 * Given we are writing one page at a time to disk, we copy
723 goto out_finish; 520 * that much from the buffer, although the last bit will likely
724 } 521 * be smaller than full page. This is OK - we saved the length
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
725 527
726 if (unlikely(!data[thr].cmp_len || 528 ret = swap_write_page(handle, page, &bio);
727 data[thr].cmp_len > 529 if (ret)
728 lzo1x_worst_compress(data[thr].unc_len))) {
729 printk(KERN_ERR
730 "PM: Invalid LZO compressed length\n");
731 ret = -1;
732 goto out_finish; 530 goto out_finish;
733 }
734
735 *(size_t *)data[thr].cmp = data[thr].cmp_len;
736
737 /*
738 * Given we are writing one page at a time to disk, we
739 * copy that much from the buffer, although the last
740 * bit will likely be smaller than full page. This is
741 * OK - we saved the length of the compressed data, so
742 * any garbage at the end will be discarded when we
743 * read it.
744 */
745 for (off = 0;
746 off < LZO_HEADER + data[thr].cmp_len;
747 off += PAGE_SIZE) {
748 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
749
750 ret = swap_write_page(handle, page, &bio);
751 if (ret)
752 goto out_finish;
753 }
754 } 531 }
755
756 wait_event(crc->done, atomic_read(&crc->stop));
757 atomic_set(&crc->stop, 0);
758 } 532 }
759 533
760out_finish: 534out_finish:
@@ -763,21 +537,15 @@ out_finish:
763 if (!ret) 537 if (!ret)
764 ret = err2; 538 ret = err2;
765 if (!ret) 539 if (!ret)
766 printk(KERN_INFO "PM: Image saving done.\n"); 540 printk(KERN_CONT "\b\b\b\bdone\n");
541 else
542 printk(KERN_CONT "\n");
767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
768out_clean: 544
769 if (crc) { 545 vfree(cmp);
770 if (crc->thr) 546 vfree(unc);
771 kthread_stop(crc->thr); 547 vfree(wrk);
772 kfree(crc); 548 free_page((unsigned long)page);
773 }
774 if (data) {
775 for (thr = 0; thr < nr_threads; thr++)
776 if (data[thr].thr)
777 kthread_stop(data[thr].thr);
778 vfree(data);
779 }
780 if (page) free_page((unsigned long)page);
781 549
782 return ret; 550 return ret;
783} 551}
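The comment in the write loop above spells out the framing: each compressed block is stored as a size_t length header followed by the LZO payload, written out in whole pages, and the saved length is what lets the reader throw away the padding in the last page. A minimal userspace sketch of that framing, where PAGE_SZ, HDR and the FILE-based output are illustrative stand-ins for the kernel's page size, LZO_HEADER and swap_write_page(), not the patched code itself:

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096
#define HDR     sizeof(size_t)

/* Emit one length-prefixed compressed block, padded to whole pages.
 * 'buf' must have room for HDR + cmp_len bytes, with the payload at
 * buf + HDR, mirroring the cmp buffer layout in the code above. */
static int write_block(FILE *f, unsigned char *buf, size_t cmp_len)
{
        unsigned char page[PAGE_SZ];
        size_t total = HDR + cmp_len, off, n;

        memcpy(buf, &cmp_len, HDR);             /* length header */

        for (off = 0; off < total; off += PAGE_SZ) {
                n = total - off < PAGE_SZ ? total - off : PAGE_SZ;
                memset(page, 0, sizeof(page));  /* pad the final partial page */
                memcpy(page, buf + off, n);
                if (fwrite(page, 1, sizeof(page), f) != sizeof(page))
                        return -1;              /* write error */
        }
        return 0;
}

On the read side only the stored length is trusted; anything past it in the last page is ignored.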
@@ -796,7 +564,8 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
796 564
797 pr_debug("PM: Free swap pages: %u\n", free_swap); 565 pr_debug("PM: Free swap pages: %u\n", free_swap);
798 566
799 required = PAGES_FOR_IO + nr_pages; 567 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
568 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
800 return free_swap > required; 569 return free_swap > required;
801} 570}
802 571
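The new "required" term above budgets swap for the compressed worst case rather than one swap page per image page. As a rough check, assuming 4 KiB pages and the usual definitions in this file (32 uncompressed pages per LZO block and the standard x + x/16 + 64 + 3 worst-case bound, both assumptions here), each 32-page block needs at most 35 swap pages, i.e. about 1.1 swap pages per image page on top of PAGES_FOR_IO:

#include <stdio.h>

#define PAGE_SZ    4096UL
#define UNC_PAGES  32UL
#define UNC_SIZE   (UNC_PAGES * PAGE_SZ)
#define HDR        sizeof(size_t)
#define WORST(x)   ((x) + (x) / 16 + 64 + 3)
#define CMP_PAGES  ((WORST(UNC_SIZE) + HDR + PAGE_SZ - 1) / PAGE_SZ)

int main(void)
{
        /* For nr_pages image pages the compressed path reserves roughly
         * nr_pages * CMP_PAGES / UNC_PAGES + 1 swap pages, plus PAGES_FOR_IO. */
        unsigned long nr_pages = 100000;

        printf("CMP_PAGES = %lu (about %.2f swap pages per image page)\n",
               (unsigned long)CMP_PAGES, (double)CMP_PAGES / UNC_PAGES);
        printf("swap reserved for %lu image pages: %lu\n",
               nr_pages, nr_pages * CMP_PAGES / UNC_PAGES + 1);
        return 0;
}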
@@ -824,12 +593,10 @@ int swsusp_write(unsigned int flags)
824 printk(KERN_ERR "PM: Cannot get swap writer\n"); 593 printk(KERN_ERR "PM: Cannot get swap writer\n");
825 return error; 594 return error;
826 } 595 }
827 if (flags & SF_NOCOMPRESS_MODE) { 596 if (!enough_swap(pages, flags)) {
828 if (!enough_swap(pages, flags)) { 597 printk(KERN_ERR "PM: Not enough free swap\n");
829 printk(KERN_ERR "PM: Not enough free swap\n"); 598 error = -ENOSPC;
830 error = -ENOSPC; 599 goto out_finish;
831 goto out_finish;
832 }
833 } 600 }
834 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 601 memset(&snapshot, 0, sizeof(struct snapshot_handle));
835 error = snapshot_read_next(&snapshot); 602 error = snapshot_read_next(&snapshot);
@@ -858,15 +625,8 @@ out_finish:
858 625
859static void release_swap_reader(struct swap_map_handle *handle) 626static void release_swap_reader(struct swap_map_handle *handle)
860{ 627{
861 struct swap_map_page_list *tmp; 628 if (handle->cur)
862 629 free_page((unsigned long)handle->cur);
863 while (handle->maps) {
864 if (handle->maps->map)
865 free_page((unsigned long)handle->maps->map);
866 tmp = handle->maps;
867 handle->maps = handle->maps->next;
868 kfree(tmp);
869 }
870 handle->cur = NULL; 630 handle->cur = NULL;
871} 631}
872 632
@@ -874,46 +634,22 @@ static int get_swap_reader(struct swap_map_handle *handle,
874 unsigned int *flags_p) 634 unsigned int *flags_p)
875{ 635{
876 int error; 636 int error;
877 struct swap_map_page_list *tmp, *last;
878 sector_t offset;
879 637
880 *flags_p = swsusp_header->flags; 638 *flags_p = swsusp_header->flags;
881 639
882 if (!swsusp_header->image) /* how can this happen? */ 640 if (!swsusp_header->image) /* how can this happen? */
883 return -EINVAL; 641 return -EINVAL;
884 642
885 handle->cur = NULL; 643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
886 last = handle->maps = NULL; 644 if (!handle->cur)
887 offset = swsusp_header->image; 645 return -ENOMEM;
888 while (offset) {
889 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
890 if (!tmp) {
891 release_swap_reader(handle);
892 return -ENOMEM;
893 }
894 memset(tmp, 0, sizeof(*tmp));
895 if (!handle->maps)
896 handle->maps = tmp;
897 if (last)
898 last->next = tmp;
899 last = tmp;
900
901 tmp->map = (struct swap_map_page *)
902 __get_free_page(__GFP_WAIT | __GFP_HIGH);
903 if (!tmp->map) {
904 release_swap_reader(handle);
905 return -ENOMEM;
906 }
907 646
908 error = hib_bio_read_page(offset, tmp->map, NULL); 647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
909 if (error) { 648 if (error) {
910 release_swap_reader(handle); 649 release_swap_reader(handle);
911 return error; 650 return error;
912 }
913 offset = tmp->map->next_swap;
914 } 651 }
915 handle->k = 0; 652 handle->k = 0;
916 handle->cur = handle->maps->map;
917 return 0; 653 return 0;
918} 654}
919 655
@@ -922,7 +658,6 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
922{ 658{
923 sector_t offset; 659 sector_t offset;
924 int error; 660 int error;
925 struct swap_map_page_list *tmp;
926 661
927 if (!handle->cur) 662 if (!handle->cur)
928 return -EINVAL; 663 return -EINVAL;
@@ -933,15 +668,13 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
933 if (error) 668 if (error)
934 return error; 669 return error;
935 if (++handle->k >= MAP_PAGE_ENTRIES) { 670 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
936 handle->k = 0; 672 handle->k = 0;
937 free_page((unsigned long)handle->maps->map); 673 offset = handle->cur->next_swap;
938 tmp = handle->maps; 674 if (!offset)
939 handle->maps = handle->maps->next;
940 kfree(tmp);
941 if (!handle->maps)
942 release_swap_reader(handle); 675 release_swap_reader(handle);
943 else 676 else if (!error)
944 handle->cur = handle->maps->map; 677 error = hib_bio_read_page(offset, handle->cur, NULL);
945 } 678 }
946 return error; 679 return error;
947} 680}
@@ -964,93 +697,49 @@ static int load_image(struct swap_map_handle *handle,
964 unsigned int nr_to_read) 697 unsigned int nr_to_read)
965{ 698{
966 unsigned int m; 699 unsigned int m;
967 int ret = 0; 700 int error = 0;
968 struct timeval start; 701 struct timeval start;
969 struct timeval stop; 702 struct timeval stop;
970 struct bio *bio; 703 struct bio *bio;
971 int err2; 704 int err2;
972 unsigned nr_pages; 705 unsigned nr_pages;
973 706
974 printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", 707 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
975 nr_to_read); 708 nr_to_read);
976 m = nr_to_read / 10; 709 m = nr_to_read / 100;
977 if (!m) 710 if (!m)
978 m = 1; 711 m = 1;
979 nr_pages = 0; 712 nr_pages = 0;
980 bio = NULL; 713 bio = NULL;
981 do_gettimeofday(&start); 714 do_gettimeofday(&start);
982 for ( ; ; ) { 715 for ( ; ; ) {
983 ret = snapshot_write_next(snapshot); 716 error = snapshot_write_next(snapshot);
984 if (ret <= 0) 717 if (error <= 0)
985 break; 718 break;
986 ret = swap_read_page(handle, data_of(*snapshot), &bio); 719 error = swap_read_page(handle, data_of(*snapshot), &bio);
987 if (ret) 720 if (error)
988 break; 721 break;
989 if (snapshot->sync_read) 722 if (snapshot->sync_read)
990 ret = hib_wait_on_bio_chain(&bio); 723 error = hib_wait_on_bio_chain(&bio);
991 if (ret) 724 if (error)
992 break; 725 break;
993 if (!(nr_pages % m)) 726 if (!(nr_pages % m))
994 printk(KERN_INFO "PM: Image loading progress: %3d%%\n", 727 printk("\b\b\b\b%3d%%", nr_pages / m);
995 nr_pages / m * 10);
996 nr_pages++; 728 nr_pages++;
997 } 729 }
998 err2 = hib_wait_on_bio_chain(&bio); 730 err2 = hib_wait_on_bio_chain(&bio);
999 do_gettimeofday(&stop); 731 do_gettimeofday(&stop);
1000 if (!ret) 732 if (!error)
1001 ret = err2; 733 error = err2;
1002 if (!ret) { 734 if (!error) {
1003 printk(KERN_INFO "PM: Image loading done.\n"); 735 printk("\b\b\b\bdone\n");
1004 snapshot_write_finalize(snapshot); 736 snapshot_write_finalize(snapshot);
1005 if (!snapshot_image_loaded(snapshot)) 737 if (!snapshot_image_loaded(snapshot))
1006 ret = -ENODATA; 738 error = -ENODATA;
1007 } 739 } else
740 printk("\n");
1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 741 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1009 return ret; 742 return error;
1010}
1011
1012/**
1013 * Structure used for LZO data decompression.
1014 */
1015struct dec_data {
1016 struct task_struct *thr; /* thread */
1017 atomic_t ready; /* ready to start flag */
1018 atomic_t stop; /* ready to stop flag */
1019 int ret; /* return code */
1020 wait_queue_head_t go; /* start decompression */
1021 wait_queue_head_t done; /* decompression done */
1022 size_t unc_len; /* uncompressed length */
1023 size_t cmp_len; /* compressed length */
1024 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1025 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1026};
1027
1028/**
1029 * Decompression function that runs in its own thread.
1030 */
1031static int lzo_decompress_threadfn(void *data)
1032{
1033 struct dec_data *d = data;
1034
1035 while (1) {
1036 wait_event(d->go, atomic_read(&d->ready) ||
1037 kthread_should_stop());
1038 if (kthread_should_stop()) {
1039 d->thr = NULL;
1040 d->ret = -1;
1041 atomic_set(&d->stop, 1);
1042 wake_up(&d->done);
1043 break;
1044 }
1045 atomic_set(&d->ready, 0);
1046
1047 d->unc_len = LZO_UNC_SIZE;
1048 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1049 d->unc, &d->unc_len);
1050 atomic_set(&d->stop, 1);
1051 wake_up(&d->done);
1052 }
1053 return 0;
1054} 743}
1055 744
1056/** 745/**
@@ -1064,319 +753,136 @@ static int load_image_lzo(struct swap_map_handle *handle,
1064 unsigned int nr_to_read) 753 unsigned int nr_to_read)
1065{ 754{
1066 unsigned int m; 755 unsigned int m;
1067 int ret = 0; 756 int error = 0;
1068 int eof = 0;
1069 struct bio *bio; 757 struct bio *bio;
1070 struct timeval start; 758 struct timeval start;
1071 struct timeval stop; 759 struct timeval stop;
1072 unsigned nr_pages; 760 unsigned nr_pages;
1073 size_t off; 761 size_t i, off, unc_len, cmp_len;
1074 unsigned i, thr, run_threads, nr_threads; 762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
1075 unsigned ring = 0, pg = 0, ring_size = 0,
1076 have = 0, want, need, asked = 0;
1077 unsigned long read_pages = 0;
1078 unsigned char **page = NULL;
1079 struct dec_data *data = NULL;
1080 struct crc_data *crc = NULL;
1081
1082 /*
1083 * We'll limit the number of threads for decompression to limit memory
1084 * footprint.
1085 */
1086 nr_threads = num_online_cpus() - 1;
1087 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1088
1089 page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
1090 if (!page) {
1091 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1092 ret = -ENOMEM;
1093 goto out_clean;
1094 }
1095 763
1096 data = vmalloc(sizeof(*data) * nr_threads); 764 for (i = 0; i < LZO_CMP_PAGES; i++) {
1097 if (!data) { 765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
1098 printk(KERN_ERR "PM: Failed to allocate LZO data\n"); 766 if (!page[i]) {
1099 ret = -ENOMEM; 767 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1100 goto out_clean;
1101 }
1102 for (thr = 0; thr < nr_threads; thr++)
1103 memset(&data[thr], 0, offsetof(struct dec_data, go));
1104 768
1105 crc = kmalloc(sizeof(*crc), GFP_KERNEL); 769 while (i)
1106 if (!crc) { 770 free_page((unsigned long)page[--i]);
1107 printk(KERN_ERR "PM: Failed to allocate crc\n"); 771
1108 ret = -ENOMEM; 772 return -ENOMEM;
1109 goto out_clean;
1110 }
1111 memset(crc, 0, offsetof(struct crc_data, go));
1112
1113 /*
1114 * Start the decompression threads.
1115 */
1116 for (thr = 0; thr < nr_threads; thr++) {
1117 init_waitqueue_head(&data[thr].go);
1118 init_waitqueue_head(&data[thr].done);
1119
1120 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1121 &data[thr],
1122 "image_decompress/%u", thr);
1123 if (IS_ERR(data[thr].thr)) {
1124 data[thr].thr = NULL;
1125 printk(KERN_ERR
1126 "PM: Cannot start decompression threads\n");
1127 ret = -ENOMEM;
1128 goto out_clean;
1129 } 773 }
1130 } 774 }
1131 775
1132 /* 776 unc = vmalloc(LZO_UNC_SIZE);
1133 * Start the CRC32 thread. 777 if (!unc) {
1134 */ 778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
1135 init_waitqueue_head(&crc->go);
1136 init_waitqueue_head(&crc->done);
1137
1138 handle->crc32 = 0;
1139 crc->crc32 = &handle->crc32;
1140 for (thr = 0; thr < nr_threads; thr++) {
1141 crc->unc[thr] = data[thr].unc;
1142 crc->unc_len[thr] = &data[thr].unc_len;
1143 }
1144 779
1145 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); 780 for (i = 0; i < LZO_CMP_PAGES; i++)
1146 if (IS_ERR(crc->thr)) { 781 free_page((unsigned long)page[i]);
1147 crc->thr = NULL; 782
1148 printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); 783 return -ENOMEM;
1149 ret = -ENOMEM;
1150 goto out_clean;
1151 } 784 }
1152 785
1153 /* 786 cmp = vmalloc(LZO_CMP_SIZE);
1154 * Set the number of pages for read buffering. 787 if (!cmp) {
1155 * This is complete guesswork, because we'll only know the real 788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
1156 * picture once prepare_image() is called, which is much later on
1157 * during the image load phase. We'll assume the worst case and
1158 * say that none of the image pages are from high memory.
1159 */
1160 if (low_free_pages() > snapshot_get_image_size())
1161 read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
1162 read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
1163
1164 for (i = 0; i < read_pages; i++) {
1165 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1166 __GFP_WAIT | __GFP_HIGH :
1167 __GFP_WAIT | __GFP_NOWARN |
1168 __GFP_NORETRY);
1169 789
1170 if (!page[i]) { 790 vfree(unc);
1171 if (i < LZO_CMP_PAGES) { 791 for (i = 0; i < LZO_CMP_PAGES; i++)
1172 ring_size = i; 792 free_page((unsigned long)page[i]);
1173 printk(KERN_ERR 793
1174 "PM: Failed to allocate LZO pages\n"); 794 return -ENOMEM;
1175 ret = -ENOMEM;
1176 goto out_clean;
1177 } else {
1178 break;
1179 }
1180 }
1181 } 795 }
1182 want = ring_size = i;
1183 796
1184 printk(KERN_INFO 797 printk(KERN_INFO
1185 "PM: Using %u thread(s) for decompression.\n" 798 "PM: Loading and decompressing image data (%u pages) ... ",
1186 "PM: Loading and decompressing image data (%u pages)...\n", 799 nr_to_read);
1187 nr_threads, nr_to_read); 800 m = nr_to_read / 100;
1188 m = nr_to_read / 10;
1189 if (!m) 801 if (!m)
1190 m = 1; 802 m = 1;
1191 nr_pages = 0; 803 nr_pages = 0;
1192 bio = NULL; 804 bio = NULL;
1193 do_gettimeofday(&start); 805 do_gettimeofday(&start);
1194 806
1195 ret = snapshot_write_next(snapshot); 807 error = snapshot_write_next(snapshot);
1196 if (ret <= 0) 808 if (error <= 0)
1197 goto out_finish; 809 goto out_finish;
1198 810
1199 for(;;) { 811 for (;;) {
1200 for (i = 0; !eof && i < want; i++) { 812 error = swap_read_page(handle, page[0], NULL); /* sync */
1201 ret = swap_read_page(handle, page[ring], &bio); 813 if (error)
1202 if (ret) { 814 break;
1203 /*
1204 * On real read error, finish. On end of data,
1205 * set EOF flag and just exit the read loop.
1206 */
1207 if (handle->cur &&
1208 handle->cur->entries[handle->k]) {
1209 goto out_finish;
1210 } else {
1211 eof = 1;
1212 break;
1213 }
1214 }
1215 if (++ring >= ring_size)
1216 ring = 0;
1217 }
1218 asked += i;
1219 want -= i;
1220
1221 /*
1222 * We are out of data, wait for some more.
1223 */
1224 if (!have) {
1225 if (!asked)
1226 break;
1227
1228 ret = hib_wait_on_bio_chain(&bio);
1229 if (ret)
1230 goto out_finish;
1231 have += asked;
1232 asked = 0;
1233 if (eof)
1234 eof = 2;
1235 }
1236 815
1237 if (crc->run_threads) { 816 cmp_len = *(size_t *)page[0];
1238 wait_event(crc->done, atomic_read(&crc->stop)); 817 if (unlikely(!cmp_len ||
1239 atomic_set(&crc->stop, 0); 818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
1240 crc->run_threads = 0; 819 printk(KERN_ERR "PM: Invalid LZO compressed length\n");
820 error = -1;
821 break;
1241 } 822 }
1242 823
1243 for (thr = 0; have && thr < nr_threads; thr++) { 824 for (off = PAGE_SIZE, i = 1;
1244 data[thr].cmp_len = *(size_t *)page[pg]; 825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
1245 if (unlikely(!data[thr].cmp_len || 826 error = swap_read_page(handle, page[i], &bio);
1246 data[thr].cmp_len > 827 if (error)
1247 lzo1x_worst_compress(LZO_UNC_SIZE))) {
1248 printk(KERN_ERR
1249 "PM: Invalid LZO compressed length\n");
1250 ret = -1;
1251 goto out_finish; 828 goto out_finish;
1252 } 829 }
1253
1254 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1255 PAGE_SIZE);
1256 if (need > have) {
1257 if (eof > 1) {
1258 ret = -1;
1259 goto out_finish;
1260 }
1261 break;
1262 }
1263 830
1264 for (off = 0; 831 error = hib_wait_on_bio_chain(&bio); /* need all data now */
1265 off < LZO_HEADER + data[thr].cmp_len; 832 if (error)
1266 off += PAGE_SIZE) { 833 goto out_finish;
1267 memcpy(data[thr].cmp + off,
1268 page[pg], PAGE_SIZE);
1269 have--;
1270 want++;
1271 if (++pg >= ring_size)
1272 pg = 0;
1273 }
1274 834
1275 atomic_set(&data[thr].ready, 1); 835 for (off = 0, i = 0;
1276 wake_up(&data[thr].go); 836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
1277 } 838 }
1278 839
1279 /* 840 unc_len = LZO_UNC_SIZE;
1280 * Wait for more data while we are decompressing. 841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
1281 */ 842 unc, &unc_len);
1282 if (have < LZO_CMP_PAGES && asked) { 843 if (error < 0) {
1283 ret = hib_wait_on_bio_chain(&bio); 844 printk(KERN_ERR "PM: LZO decompression failed\n");
1284 if (ret) 845 break;
1285 goto out_finish;
1286 have += asked;
1287 asked = 0;
1288 if (eof)
1289 eof = 2;
1290 } 846 }
1291 847
1292 for (run_threads = thr, thr = 0; thr < run_threads; thr++) { 848 if (unlikely(!unc_len ||
1293 wait_event(data[thr].done, 849 unc_len > LZO_UNC_SIZE ||
1294 atomic_read(&data[thr].stop)); 850 unc_len & (PAGE_SIZE - 1))) {
1295 atomic_set(&data[thr].stop, 0); 851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
852 error = -1;
853 break;
854 }
1296 855
1297 ret = data[thr].ret; 856 for (off = 0; off < unc_len; off += PAGE_SIZE) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
1298 858
1299 if (ret < 0) { 859 if (!(nr_pages % m))
1300 printk(KERN_ERR 860 printk("\b\b\b\b%3d%%", nr_pages / m);
1301 "PM: LZO decompression failed\n"); 861 nr_pages++;
1302 goto out_finish;
1303 }
1304 862
1305 if (unlikely(!data[thr].unc_len || 863 error = snapshot_write_next(snapshot);
1306 data[thr].unc_len > LZO_UNC_SIZE || 864 if (error <= 0)
1307 data[thr].unc_len & (PAGE_SIZE - 1))) {
1308 printk(KERN_ERR
1309 "PM: Invalid LZO uncompressed length\n");
1310 ret = -1;
1311 goto out_finish; 865 goto out_finish;
1312 }
1313
1314 for (off = 0;
1315 off < data[thr].unc_len; off += PAGE_SIZE) {
1316 memcpy(data_of(*snapshot),
1317 data[thr].unc + off, PAGE_SIZE);
1318
1319 if (!(nr_pages % m))
1320 printk(KERN_INFO
1321 "PM: Image loading progress: "
1322 "%3d%%\n",
1323 nr_pages / m * 10);
1324 nr_pages++;
1325
1326 ret = snapshot_write_next(snapshot);
1327 if (ret <= 0) {
1328 crc->run_threads = thr + 1;
1329 atomic_set(&crc->ready, 1);
1330 wake_up(&crc->go);
1331 goto out_finish;
1332 }
1333 }
1334 } 866 }
1335
1336 crc->run_threads = thr;
1337 atomic_set(&crc->ready, 1);
1338 wake_up(&crc->go);
1339 } 867 }
1340 868
1341out_finish: 869out_finish:
1342 if (crc->run_threads) {
1343 wait_event(crc->done, atomic_read(&crc->stop));
1344 atomic_set(&crc->stop, 0);
1345 }
1346 do_gettimeofday(&stop); 870 do_gettimeofday(&stop);
1347 if (!ret) { 871 if (!error) {
1348 printk(KERN_INFO "PM: Image loading done.\n"); 872 printk("\b\b\b\bdone\n");
1349 snapshot_write_finalize(snapshot); 873 snapshot_write_finalize(snapshot);
1350 if (!snapshot_image_loaded(snapshot)) 874 if (!snapshot_image_loaded(snapshot))
1351 ret = -ENODATA; 875 error = -ENODATA;
1352 if (!ret) { 876 } else
1353 if (swsusp_header->flags & SF_CRC32_MODE) { 877 printk("\n");
1354 if(handle->crc32 != swsusp_header->crc32) {
1355 printk(KERN_ERR
1356 "PM: Invalid image CRC32!\n");
1357 ret = -ENODATA;
1358 }
1359 }
1360 }
1361 }
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 878 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1363out_clean: 879
1364 for (i = 0; i < ring_size; i++) 880 vfree(cmp);
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
1365 free_page((unsigned long)page[i]); 883 free_page((unsigned long)page[i]);
1366 if (crc) {
1367 if (crc->thr)
1368 kthread_stop(crc->thr);
1369 kfree(crc);
1370 }
1371 if (data) {
1372 for (thr = 0; thr < nr_threads; thr++)
1373 if (data[thr].thr)
1374 kthread_stop(data[thr].thr);
1375 vfree(data);
1376 }
1377 if (page) vfree(page);
1378 884
1379 return ret; 885 return error;
1380} 886}
1381 887
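The single-threaded load path above validates every block before using it: the length header must be non-zero and no larger than the LZO worst-case bound, and the decompressed length must be a whole number of pages that fits the block buffer. A compact sketch of just those checks, with the constants and the worst-case formula assumed to match the usual definitions in this file rather than taken from the patch:

#include <stddef.h>

#define PAGE_SZ    4096
#define UNC_PAGES  32
#define UNC_SIZE   (UNC_PAGES * PAGE_SZ)

/* Worst-case LZO1X expansion, as commonly defined: x + x/16 + 64 + 3. */
static size_t worst_compress(size_t x)
{
        return x + x / 16 + 64 + 3;
}

/* Sanity checks applied to each block before and after decompression. */
static int lengths_look_sane(size_t cmp_len, size_t unc_len)
{
        if (!cmp_len || cmp_len > worst_compress(UNC_SIZE))
                return 0;                       /* header is corrupted        */
        if (!unc_len || unc_len > UNC_SIZE || (unc_len & (PAGE_SZ - 1)))
                return 0;                       /* output must be whole pages */
        return 1;
}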
1382/** 888/**
@@ -1472,34 +978,6 @@ void swsusp_close(fmode_t mode)
1472 blkdev_put(hib_resume_bdev, mode); 978 blkdev_put(hib_resume_bdev, mode);
1473} 979}
1474 980
1475/**
1476 * swsusp_unmark - Unmark swsusp signature in the resume device
1477 */
1478
1479#ifdef CONFIG_SUSPEND
1480int swsusp_unmark(void)
1481{
1482 int error;
1483
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1485 if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
1486 memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
1487 error = hib_bio_write_page(swsusp_resume_block,
1488 swsusp_header, NULL);
1489 } else {
1490 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
1491 error = -ENODEV;
1492 }
1493
1494 /*
1495 * We just returned from suspend, we don't need the image any more.
1496 */
1497 free_all_swap_pages(root_swap);
1498
1499 return error;
1500}
1501#endif
1502
1503static int swsusp_header_init(void) 981static int swsusp_header_init(void)
1504{ 982{
1505 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); 983 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86..42ddbc6f0de 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -20,15 +20,37 @@
20#include <linux/swapops.h> 20#include <linux/swapops.h>
21#include <linux/pm.h> 21#include <linux/pm.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/compat.h>
24#include <linux/console.h> 23#include <linux/console.h>
25#include <linux/cpu.h> 24#include <linux/cpu.h>
26#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <scsi/scsi_scan.h>
27 27
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
29 29
30#include "power.h" 30#include "power.h"
31 31
32/*
33 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
34 * will be removed in the future. They are only preserved here for
35 * compatibility with existing userland utilities.
36 */
37#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
38#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
39
40#define PMOPS_PREPARE 1
41#define PMOPS_ENTER 2
42#define PMOPS_FINISH 3
43
44/*
45 * NOTE: The following ioctl definitions are wrong and have been replaced with
46 * correct ones. They are only preserved here for compatibility with existing
47 * userland utilities and will be removed in the future.
48 */
49#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
50#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
51#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
52#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
53
32 54
33#define SNAPSHOT_MINOR 231 55#define SNAPSHOT_MINOR 231
34 56
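The NOTE comments above keep the obsolete ioctl numbers around purely for old userland; further down, each obsolete case prints a rate-limited deprecation warning and then falls through to the case for its modern replacement. A stripped-down sketch of that dispatch pattern, with made-up request numbers and names rather than the real SNAPSHOT_* values:

#include <stdio.h>

#define OLD_REQUEST  0x03   /* hypothetical obsolete number */
#define NEW_REQUEST  0x11   /* hypothetical replacement     */

static void warn_deprecated(unsigned int cmd)
{
        /* the kernel rate-limits this with printk_ratelimit() */
        fprintf(stderr, "ioctl %#x is deprecated, update your tools\n", cmd);
}

static int dispatch(unsigned int cmd)
{
        switch (cmd) {
        case OLD_REQUEST:
                warn_deprecated(cmd);
                /* fall through to the current implementation */
        case NEW_REQUEST:
                return 0;
        default:
                return -1;
        }
}

int main(void)
{
        return dispatch(OLD_REQUEST);   /* warns, then behaves like NEW_REQUEST */
}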
@@ -48,7 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
48 struct snapshot_data *data; 70 struct snapshot_data *data;
49 int error; 71 int error;
50 72
51 lock_system_sleep(); 73 mutex_lock(&pm_mutex);
52 74
53 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 75 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
54 error = -EBUSY; 76 error = -EBUSY;
@@ -83,6 +105,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
83 * appear. 105 * appear.
84 */ 106 */
85 wait_for_device_probe(); 107 wait_for_device_probe();
108 scsi_complete_async_scans();
86 109
87 data->swap = -1; 110 data->swap = -1;
88 data->mode = O_WRONLY; 111 data->mode = O_WRONLY;
@@ -99,7 +122,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
99 data->platform_support = 0; 122 data->platform_support = 0;
100 123
101 Unlock: 124 Unlock:
102 unlock_system_sleep(); 125 mutex_unlock(&pm_mutex);
103 126
104 return error; 127 return error;
105} 128}
@@ -108,7 +131,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
108{ 131{
109 struct snapshot_data *data; 132 struct snapshot_data *data;
110 133
111 lock_system_sleep(); 134 mutex_lock(&pm_mutex);
112 135
113 swsusp_free(); 136 swsusp_free();
114 free_basic_memory_bitmaps(); 137 free_basic_memory_bitmaps();
@@ -122,7 +145,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
122 PM_POST_HIBERNATION : PM_POST_RESTORE); 145 PM_POST_HIBERNATION : PM_POST_RESTORE);
123 atomic_inc(&snapshot_device_available); 146 atomic_inc(&snapshot_device_available);
124 147
125 unlock_system_sleep(); 148 mutex_unlock(&pm_mutex);
126 149
127 return 0; 150 return 0;
128} 151}
@@ -134,7 +157,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
134 ssize_t res; 157 ssize_t res;
135 loff_t pg_offp = *offp & ~PAGE_MASK; 158 loff_t pg_offp = *offp & ~PAGE_MASK;
136 159
137 lock_system_sleep(); 160 mutex_lock(&pm_mutex);
138 161
139 data = filp->private_data; 162 data = filp->private_data;
140 if (!data->ready) { 163 if (!data->ready) {
@@ -155,7 +178,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
155 *offp += res; 178 *offp += res;
156 179
157 Unlock: 180 Unlock:
158 unlock_system_sleep(); 181 mutex_unlock(&pm_mutex);
159 182
160 return res; 183 return res;
161} 184}
@@ -167,7 +190,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
167 ssize_t res; 190 ssize_t res;
168 loff_t pg_offp = *offp & ~PAGE_MASK; 191 loff_t pg_offp = *offp & ~PAGE_MASK;
169 192
170 lock_system_sleep(); 193 mutex_lock(&pm_mutex);
171 194
172 data = filp->private_data; 195 data = filp->private_data;
173 196
@@ -184,11 +207,20 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
184 if (res > 0) 207 if (res > 0)
185 *offp += res; 208 *offp += res;
186unlock: 209unlock:
187 unlock_system_sleep(); 210 mutex_unlock(&pm_mutex);
188 211
189 return res; 212 return res;
190} 213}
191 214
215static void snapshot_deprecated_ioctl(unsigned int cmd)
216{
217 if (printk_ratelimit())
218 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
219 "be removed soon, update your suspend-to-disk "
220 "utilities\n",
221 __builtin_return_address(0), cmd);
222}
223
192static long snapshot_ioctl(struct file *filp, unsigned int cmd, 224static long snapshot_ioctl(struct file *filp, unsigned int cmd,
193 unsigned long arg) 225 unsigned long arg)
194{ 226{
@@ -219,7 +251,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
219 sys_sync(); 251 sys_sync();
220 printk("done.\n"); 252 printk("done.\n");
221 253
254 error = usermodehelper_disable();
255 if (error)
256 break;
257
222 error = freeze_processes(); 258 error = freeze_processes();
259 if (error) {
260 thaw_processes();
261 usermodehelper_enable();
262 }
223 if (!error) 263 if (!error)
224 data->frozen = 1; 264 data->frozen = 1;
225 break; 265 break;
@@ -229,9 +269,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
229 break; 269 break;
230 pm_restore_gfp_mask(); 270 pm_restore_gfp_mask();
231 thaw_processes(); 271 thaw_processes();
272 usermodehelper_enable();
232 data->frozen = 0; 273 data->frozen = 0;
233 break; 274 break;
234 275
276 case SNAPSHOT_ATOMIC_SNAPSHOT:
277 snapshot_deprecated_ioctl(cmd);
235 case SNAPSHOT_CREATE_IMAGE: 278 case SNAPSHOT_CREATE_IMAGE:
236 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 279 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
237 error = -EPERM; 280 error = -EPERM;
@@ -239,11 +282,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
239 } 282 }
240 pm_restore_gfp_mask(); 283 pm_restore_gfp_mask();
241 error = hibernation_snapshot(data->platform_support); 284 error = hibernation_snapshot(data->platform_support);
242 if (!error) { 285 if (!error)
243 error = put_user(in_suspend, (int __user *)arg); 286 error = put_user(in_suspend, (int __user *)arg);
244 data->ready = !freezer_test_done && !error; 287 if (!error)
245 freezer_test_done = false; 288 data->ready = 1;
246 }
247 break; 289 break;
248 290
249 case SNAPSHOT_ATOMIC_RESTORE: 291 case SNAPSHOT_ATOMIC_RESTORE:
@@ -260,17 +302,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
260 swsusp_free(); 302 swsusp_free();
261 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 303 memset(&data->handle, 0, sizeof(struct snapshot_handle));
262 data->ready = 0; 304 data->ready = 0;
263 /*
264 * It is necessary to thaw kernel threads here, because
265 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
266 * SNAPSHOT_FREE. In that case, if kernel threads were not
267 * thawed, the preallocation of memory carried out by
268 * hibernation_snapshot() might run into problems (i.e. it
269 * might fail or even deadlock).
270 */
271 thaw_kernel_threads();
272 break; 305 break;
273 306
307 case SNAPSHOT_SET_IMAGE_SIZE:
308 snapshot_deprecated_ioctl(cmd);
274 case SNAPSHOT_PREF_IMAGE_SIZE: 309 case SNAPSHOT_PREF_IMAGE_SIZE:
275 image_size = arg; 310 image_size = arg;
276 break; 311 break;
@@ -285,12 +320,16 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
285 error = put_user(size, (loff_t __user *)arg); 320 error = put_user(size, (loff_t __user *)arg);
286 break; 321 break;
287 322
323 case SNAPSHOT_AVAIL_SWAP:
324 snapshot_deprecated_ioctl(cmd);
288 case SNAPSHOT_AVAIL_SWAP_SIZE: 325 case SNAPSHOT_AVAIL_SWAP_SIZE:
289 size = count_swap_pages(data->swap, 1); 326 size = count_swap_pages(data->swap, 1);
290 size <<= PAGE_SHIFT; 327 size <<= PAGE_SHIFT;
291 error = put_user(size, (loff_t __user *)arg); 328 error = put_user(size, (loff_t __user *)arg);
292 break; 329 break;
293 330
331 case SNAPSHOT_GET_SWAP_PAGE:
332 snapshot_deprecated_ioctl(cmd);
294 case SNAPSHOT_ALLOC_SWAP_PAGE: 333 case SNAPSHOT_ALLOC_SWAP_PAGE:
295 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 334 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
296 error = -ENODEV; 335 error = -ENODEV;
@@ -313,6 +352,27 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
313 free_all_swap_pages(data->swap); 352 free_all_swap_pages(data->swap);
314 break; 353 break;
315 354
355 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
356 snapshot_deprecated_ioctl(cmd);
357 if (!swsusp_swap_in_use()) {
358 /*
359 * User space encodes device types as two-byte values,
360 * so we need to recode them
361 */
362 if (old_decode_dev(arg)) {
363 data->swap = swap_type_of(old_decode_dev(arg),
364 0, NULL);
365 if (data->swap < 0)
366 error = -ENODEV;
367 } else {
368 data->swap = -1;
369 error = -EINVAL;
370 }
371 } else {
372 error = -EPERM;
373 }
374 break;
375
316 case SNAPSHOT_S2RAM: 376 case SNAPSHOT_S2RAM:
317 if (!data->frozen) { 377 if (!data->frozen) {
318 error = -EPERM; 378 error = -EPERM;
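The SNAPSHOT_SET_SWAP_FILE case above takes the legacy 16-bit device number (major in the high byte, minor in the low byte) and re-decodes it with old_decode_dev() before looking up the swap type. A tiny illustration of that packing, with an arbitrarily chosen example value:

#include <stdio.h>

/* Split a legacy 16-bit dev_t: major in bits 8..15, minor in bits 0..7. */
static void split_old_dev(unsigned int val, unsigned int *major, unsigned int *minor)
{
        *major = (val >> 8) & 0xff;
        *minor = val & 0xff;
}

int main(void)
{
        unsigned int major, minor;

        split_old_dev(0x0803, &major, &minor);         /* example value only  */
        printf("major %u, minor %u\n", major, minor);  /* prints major 8, minor 3 */
        return 0;
}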
@@ -335,6 +395,33 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
335 error = hibernation_platform_enter(); 395 error = hibernation_platform_enter();
336 break; 396 break;
337 397
398 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
399 snapshot_deprecated_ioctl(cmd);
400 error = -EINVAL;
401
402 switch (arg) {
403
404 case PMOPS_PREPARE:
405 data->platform_support = 1;
406 error = 0;
407 break;
408
409 case PMOPS_ENTER:
410 if (data->platform_support)
411 error = hibernation_platform_enter();
412 break;
413
414 case PMOPS_FINISH:
415 if (data->platform_support)
416 error = 0;
417 break;
418
419 default:
420 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
421
422 }
423 break;
424
338 case SNAPSHOT_SET_SWAP_AREA: 425 case SNAPSHOT_SET_SWAP_AREA:
339 if (swsusp_swap_in_use()) { 426 if (swsusp_swap_in_use()) {
340 error = -EPERM; 427 error = -EPERM;
@@ -376,66 +463,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
376 return error; 463 return error;
377} 464}
378 465
379#ifdef CONFIG_COMPAT
380
381struct compat_resume_swap_area {
382 compat_loff_t offset;
383 u32 dev;
384} __packed;
385
386static long
387snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
388{
389 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
390
391 switch (cmd) {
392 case SNAPSHOT_GET_IMAGE_SIZE:
393 case SNAPSHOT_AVAIL_SWAP_SIZE:
394 case SNAPSHOT_ALLOC_SWAP_PAGE: {
395 compat_loff_t __user *uoffset = compat_ptr(arg);
396 loff_t offset;
397 mm_segment_t old_fs;
398 int err;
399
400 old_fs = get_fs();
401 set_fs(KERNEL_DS);
402 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
403 set_fs(old_fs);
404 if (!err && put_user(offset, uoffset))
405 err = -EFAULT;
406 return err;
407 }
408
409 case SNAPSHOT_CREATE_IMAGE:
410 return snapshot_ioctl(file, cmd,
411 (unsigned long) compat_ptr(arg));
412
413 case SNAPSHOT_SET_SWAP_AREA: {
414 struct compat_resume_swap_area __user *u_swap_area =
415 compat_ptr(arg);
416 struct resume_swap_area swap_area;
417 mm_segment_t old_fs;
418 int err;
419
420 err = get_user(swap_area.offset, &u_swap_area->offset);
421 err |= get_user(swap_area.dev, &u_swap_area->dev);
422 if (err)
423 return -EFAULT;
424 old_fs = get_fs();
425 set_fs(KERNEL_DS);
426 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
427 (unsigned long) &swap_area);
428 set_fs(old_fs);
429 return err;
430 }
431
432 default:
433 return snapshot_ioctl(file, cmd, arg);
434 }
435}
436
437#endif /* CONFIG_COMPAT */
438
439static const struct file_operations snapshot_fops = { 466static const struct file_operations snapshot_fops = {
440 .open = snapshot_open, 467 .open = snapshot_open,
441 .release = snapshot_release, 468 .release = snapshot_release,
@@ -443,9 +470,6 @@ static const struct file_operations snapshot_fops = {
443 .write = snapshot_write, 470 .write = snapshot_write,
444 .llseek = no_llseek, 471 .llseek = no_llseek,
445 .unlocked_ioctl = snapshot_ioctl, 472 .unlocked_ioctl = snapshot_ioctl,
446#ifdef CONFIG_COMPAT
447 .compat_ioctl = snapshot_compat_ioctl,
448#endif
449}; 473};
450 474
451static struct miscdevice snapshot_device = { 475static struct miscdevice snapshot_device = {
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 8f50de394d2..81e1b7c65ca 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -1,266 +1,634 @@
1/* 1/* kernel/power/wakelock.c
2 * kernel/power/wakelock.c
3 * 2 *
4 * User space wakeup sources support. 3 * Copyright (C) 2005-2008 Google, Inc.
5 * 4 *
6 * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> 5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
7 * 13 *
8 * This code is based on the analogous interface allowing user space to
9 * manipulate wakelocks on Android.
10 */ 14 */
11 15
12#include <linux/capability.h> 16#include <linux/module.h>
13#include <linux/ctype.h> 17#include <linux/platform_device.h>
14#include <linux/device.h> 18#include <linux/rtc.h>
15#include <linux/err.h> 19#include <linux/suspend.h>
16#include <linux/hrtimer.h> 20#include <linux/syscalls.h> /* sys_sync */
17#include <linux/list.h> 21#include <linux/wakelock.h>
18#include <linux/rbtree.h> 22#ifdef CONFIG_WAKELOCK_STAT
19#include <linux/slab.h> 23#include <linux/proc_fs.h>
20
21static DEFINE_MUTEX(wakelocks_lock);
22
23struct wakelock {
24 char *name;
25 struct rb_node node;
26 struct wakeup_source ws;
27#ifdef CONFIG_PM_WAKELOCKS_GC
28 struct list_head lru;
29#endif 24#endif
25#include "power.h"
26
27enum {
28 DEBUG_EXIT_SUSPEND = 1U << 0,
29 DEBUG_WAKEUP = 1U << 1,
30 DEBUG_SUSPEND = 1U << 2,
31 DEBUG_EXPIRE = 1U << 3,
32 DEBUG_WAKE_LOCK = 1U << 4,
30}; 33};
34static int debug_mask = DEBUG_EXIT_SUSPEND | DEBUG_WAKEUP;
35module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
36
37#define WAKE_LOCK_TYPE_MASK (0x0f)
38#define WAKE_LOCK_INITIALIZED (1U << 8)
39#define WAKE_LOCK_ACTIVE (1U << 9)
40#define WAKE_LOCK_AUTO_EXPIRE (1U << 10)
41#define WAKE_LOCK_PREVENTING_SUSPEND (1U << 11)
42
43static DEFINE_SPINLOCK(list_lock);
44static LIST_HEAD(inactive_locks);
45static struct list_head active_wake_locks[WAKE_LOCK_TYPE_COUNT];
46static int current_event_num;
47struct workqueue_struct *suspend_work_queue;
48struct wake_lock main_wake_lock;
49suspend_state_t requested_suspend_state = PM_SUSPEND_MEM;
50static struct wake_lock unknown_wakeup;
51static struct wake_lock suspend_backoff_lock;
52
53#define SUSPEND_BACKOFF_THRESHOLD 10
54#define SUSPEND_BACKOFF_INTERVAL 10000
55
56static unsigned suspend_short_count;
57
58#ifdef CONFIG_WAKELOCK_STAT
59static struct wake_lock deleted_wake_locks;
60static ktime_t last_sleep_time_update;
61static int wait_for_wakeup;
62
63int get_expired_time(struct wake_lock *lock, ktime_t *expire_time)
64{
65 struct timespec ts;
66 struct timespec kt;
67 struct timespec tomono;
68 struct timespec delta;
69 struct timespec sleep;
70 long timeout;
71
72 if (!(lock->flags & WAKE_LOCK_AUTO_EXPIRE))
73 return 0;
74 get_xtime_and_monotonic_and_sleep_offset(&kt, &tomono, &sleep);
75 timeout = lock->expires - jiffies;
76 if (timeout > 0)
77 return 0;
78 jiffies_to_timespec(-timeout, &delta);
79 set_normalized_timespec(&ts, kt.tv_sec + tomono.tv_sec - delta.tv_sec,
80 kt.tv_nsec + tomono.tv_nsec - delta.tv_nsec);
81 *expire_time = timespec_to_ktime(ts);
82 return 1;
83}
31 84
32static struct rb_root wakelocks_tree = RB_ROOT;
33 85
34ssize_t pm_show_wakelocks(char *buf, bool show_active) 86static int print_lock_stat(struct seq_file *m, struct wake_lock *lock)
35{ 87{
36 struct rb_node *node; 88 int lock_count = lock->stat.count;
37 struct wakelock *wl; 89 int expire_count = lock->stat.expire_count;
38 char *str = buf; 90 ktime_t active_time = ktime_set(0, 0);
39 char *end = buf + PAGE_SIZE; 91 ktime_t total_time = lock->stat.total_time;
92 ktime_t max_time = lock->stat.max_time;
93
94 ktime_t prevent_suspend_time = lock->stat.prevent_suspend_time;
95 if (lock->flags & WAKE_LOCK_ACTIVE) {
96 ktime_t now, add_time;
97 int expired = get_expired_time(lock, &now);
98 if (!expired)
99 now = ktime_get();
100 add_time = ktime_sub(now, lock->stat.last_time);
101 lock_count++;
102 if (!expired)
103 active_time = add_time;
104 else
105 expire_count++;
106 total_time = ktime_add(total_time, add_time);
107 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND)
108 prevent_suspend_time = ktime_add(prevent_suspend_time,
109 ktime_sub(now, last_sleep_time_update));
110 if (add_time.tv64 > max_time.tv64)
111 max_time = add_time;
112 }
40 113
41 mutex_lock(&wakelocks_lock); 114 return seq_printf(m,
115 "\"%s\"\t%d\t%d\t%d\t%lld\t%lld\t%lld\t%lld\t%lld\n",
116 lock->name, lock_count, expire_count,
117 lock->stat.wakeup_count, ktime_to_ns(active_time),
118 ktime_to_ns(total_time),
119 ktime_to_ns(prevent_suspend_time), ktime_to_ns(max_time),
120 ktime_to_ns(lock->stat.last_time));
121}
42 122
43 for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { 123static int wakelock_stats_show(struct seq_file *m, void *unused)
44 wl = rb_entry(node, struct wakelock, node); 124{
45 if (wl->ws.active == show_active) 125 unsigned long irqflags;
46 str += scnprintf(str, end - str, "%s ", wl->name); 126 struct wake_lock *lock;
127 int ret;
128 int type;
129
130 spin_lock_irqsave(&list_lock, irqflags);
131
132 ret = seq_puts(m, "name\tcount\texpire_count\twake_count\tactive_since"
133 "\ttotal_time\tsleep_time\tmax_time\tlast_change\n");
134 list_for_each_entry(lock, &inactive_locks, link)
135 ret = print_lock_stat(m, lock);
136 for (type = 0; type < WAKE_LOCK_TYPE_COUNT; type++) {
137 list_for_each_entry(lock, &active_wake_locks[type], link)
138 ret = print_lock_stat(m, lock);
47 } 139 }
48 if (str > buf) 140 spin_unlock_irqrestore(&list_lock, irqflags);
49 str--; 141 return 0;
142}
50 143
51 str += scnprintf(str, end - str, "\n"); 144static void wake_unlock_stat_locked(struct wake_lock *lock, int expired)
145{
146 ktime_t duration;
147 ktime_t now;
148 if (!(lock->flags & WAKE_LOCK_ACTIVE))
149 return;
150 if (get_expired_time(lock, &now))
151 expired = 1;
152 else
153 now = ktime_get();
154 lock->stat.count++;
155 if (expired)
156 lock->stat.expire_count++;
157 duration = ktime_sub(now, lock->stat.last_time);
158 lock->stat.total_time = ktime_add(lock->stat.total_time, duration);
159 if (ktime_to_ns(duration) > ktime_to_ns(lock->stat.max_time))
160 lock->stat.max_time = duration;
161 lock->stat.last_time = ktime_get();
162 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
163 duration = ktime_sub(now, last_sleep_time_update);
164 lock->stat.prevent_suspend_time = ktime_add(
165 lock->stat.prevent_suspend_time, duration);
166 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
167 }
168}
52 169
53 mutex_unlock(&wakelocks_lock); 170static void update_sleep_wait_stats_locked(int done)
54 return (str - buf); 171{
172 struct wake_lock *lock;
173 ktime_t now, etime, elapsed, add;
174 int expired;
175
176 now = ktime_get();
177 elapsed = ktime_sub(now, last_sleep_time_update);
178 list_for_each_entry(lock, &active_wake_locks[WAKE_LOCK_SUSPEND], link) {
179 expired = get_expired_time(lock, &etime);
180 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
181 if (expired)
182 add = ktime_sub(etime, last_sleep_time_update);
183 else
184 add = elapsed;
185 lock->stat.prevent_suspend_time = ktime_add(
186 lock->stat.prevent_suspend_time, add);
187 }
188 if (done || expired)
189 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
190 else
191 lock->flags |= WAKE_LOCK_PREVENTING_SUSPEND;
192 }
193 last_sleep_time_update = now;
55} 194}
195#endif
56 196
57#if CONFIG_PM_WAKELOCKS_LIMIT > 0
58static unsigned int number_of_wakelocks;
59 197
60static inline bool wakelocks_limit_exceeded(void) 198static void expire_wake_lock(struct wake_lock *lock)
61{ 199{
62 return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; 200#ifdef CONFIG_WAKELOCK_STAT
201 wake_unlock_stat_locked(lock, 1);
202#endif
203 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
204 list_del(&lock->link);
205 list_add(&lock->link, &inactive_locks);
206 if (debug_mask & (DEBUG_WAKE_LOCK | DEBUG_EXPIRE))
207 pr_info("expired wake lock %s\n", lock->name);
63} 208}
64 209
65static inline void increment_wakelocks_number(void) 210/* Caller must acquire the list_lock spinlock */
211static void print_active_locks(int type)
66{ 212{
67 number_of_wakelocks++; 213 struct wake_lock *lock;
214 bool print_expired = true;
215
216 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
217 list_for_each_entry(lock, &active_wake_locks[type], link) {
218 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
219 long timeout = lock->expires - jiffies;
220 if (timeout > 0)
221 pr_info("active wake lock %s, time left %ld\n",
222 lock->name, timeout);
223 else if (print_expired)
224 pr_info("wake lock %s, expired\n", lock->name);
225 } else {
226 pr_info("active wake lock %s\n", lock->name);
227 if (!(debug_mask & DEBUG_EXPIRE))
228 print_expired = false;
229 }
230 }
68} 231}
69 232
70static inline void decrement_wakelocks_number(void) 233static long has_wake_lock_locked(int type)
71{ 234{
72 number_of_wakelocks--; 235 struct wake_lock *lock, *n;
236 long max_timeout = 0;
237
238 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
239 list_for_each_entry_safe(lock, n, &active_wake_locks[type], link) {
240 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
241 long timeout = lock->expires - jiffies;
242 if (timeout <= 0)
243 expire_wake_lock(lock);
244 else if (timeout > max_timeout)
245 max_timeout = timeout;
246 } else
247 return -1;
248 }
249 return max_timeout;
73} 250}
74#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
75static inline bool wakelocks_limit_exceeded(void) { return false; }
76static inline void increment_wakelocks_number(void) {}
77static inline void decrement_wakelocks_number(void) {}
78#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
79 251
80#ifdef CONFIG_PM_WAKELOCKS_GC 252long has_wake_lock(int type)
81#define WL_GC_COUNT_MAX 100
82#define WL_GC_TIME_SEC 300
83
84static LIST_HEAD(wakelocks_lru_list);
85static unsigned int wakelocks_gc_count;
86
87static inline void wakelocks_lru_add(struct wakelock *wl)
88{ 253{
89 list_add(&wl->lru, &wakelocks_lru_list); 254 long ret;
255 unsigned long irqflags;
256 spin_lock_irqsave(&list_lock, irqflags);
257 ret = has_wake_lock_locked(type);
258 if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND)
259 print_active_locks(type);
260 spin_unlock_irqrestore(&list_lock, irqflags);
261 return ret;
90} 262}
91 263
92static inline void wakelocks_lru_most_recent(struct wakelock *wl) 264static void suspend_backoff(void)
93{ 265{
94 list_move(&wl->lru, &wakelocks_lru_list); 266 pr_info("suspend: too many immediate wakeups, back off\n");
267 wake_lock_timeout(&suspend_backoff_lock,
268 msecs_to_jiffies(SUSPEND_BACKOFF_INTERVAL));
95} 269}
96 270
97static void wakelocks_gc(void) 271static void suspend(struct work_struct *work)
98{ 272{
99 struct wakelock *wl, *aux; 273 int ret;
100 ktime_t now; 274 int entry_event_num;
275 struct timespec ts_entry, ts_exit;
101 276
102 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) 277 if (has_wake_lock(WAKE_LOCK_SUSPEND)) {
278 if (debug_mask & DEBUG_SUSPEND)
279 pr_info("suspend: abort suspend\n");
103 return; 280 return;
281 }
104 282
105 now = ktime_get(); 283 entry_event_num = current_event_num;
106 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { 284 sys_sync();
107 u64 idle_time_ns; 285 if (debug_mask & DEBUG_SUSPEND)
108 bool active; 286 pr_info("suspend: enter suspend\n");
109 287 getnstimeofday(&ts_entry);
110 spin_lock_irq(&wl->ws.lock); 288 ret = pm_suspend(requested_suspend_state);
111 idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); 289 getnstimeofday(&ts_exit);
112 active = wl->ws.active; 290
113 spin_unlock_irq(&wl->ws.lock); 291 if (debug_mask & DEBUG_EXIT_SUSPEND) {
114 292 struct rtc_time tm;
115 if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) 293 rtc_time_to_tm(ts_exit.tv_sec, &tm);
116 break; 294 pr_info("suspend: exit suspend, ret = %d "
117 295 "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", ret,
118 if (!active) { 296 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
119 wakeup_source_remove(&wl->ws); 297 tm.tm_hour, tm.tm_min, tm.tm_sec, ts_exit.tv_nsec);
120 rb_erase(&wl->node, &wakelocks_tree);
121 list_del(&wl->lru);
122 kfree(wl->name);
123 kfree(wl);
124 decrement_wakelocks_number();
125 }
126 } 298 }
127 wakelocks_gc_count = 0; 299
128} 300 if (ts_exit.tv_sec - ts_entry.tv_sec <= 1) {
129#else /* !CONFIG_PM_WAKELOCKS_GC */ 301 ++suspend_short_count;
130static inline void wakelocks_lru_add(struct wakelock *wl) {} 302
131static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} 303 if (suspend_short_count == SUSPEND_BACKOFF_THRESHOLD) {
132static inline void wakelocks_gc(void) {} 304 suspend_backoff();
133#endif /* !CONFIG_PM_WAKELOCKS_GC */ 305 suspend_short_count = 0;
134
135static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
136 bool add_if_not_found)
137{
138 struct rb_node **node = &wakelocks_tree.rb_node;
139 struct rb_node *parent = *node;
140 struct wakelock *wl;
141
142 while (*node) {
143 int diff;
144
145 parent = *node;
146 wl = rb_entry(*node, struct wakelock, node);
147 diff = strncmp(name, wl->name, len);
148 if (diff == 0) {
149 if (wl->name[len])
150 diff = -1;
151 else
152 return wl;
153 } 306 }
154 if (diff < 0) 307 } else {
155 node = &(*node)->rb_left; 308 suspend_short_count = 0;
156 else
157 node = &(*node)->rb_right;
158 } 309 }
159 if (!add_if_not_found)
160 return ERR_PTR(-EINVAL);
161 310
162 if (wakelocks_limit_exceeded()) 311 if (current_event_num == entry_event_num) {
163 return ERR_PTR(-ENOSPC); 312 if (debug_mask & DEBUG_SUSPEND)
313 pr_info("suspend: pm_suspend returned with no event\n");
314 wake_lock_timeout(&unknown_wakeup, HZ / 2);
315 }
316}
317static DECLARE_WORK(suspend_work, suspend);
164 318
165 /* Not found, we have to add a new one. */ 319static void expire_wake_locks(unsigned long data)
166 wl = kzalloc(sizeof(*wl), GFP_KERNEL); 320{
167 if (!wl) 321 long has_lock;
168 return ERR_PTR(-ENOMEM); 322 unsigned long irqflags;
323 if (debug_mask & DEBUG_EXPIRE)
324 pr_info("expire_wake_locks: start\n");
325 spin_lock_irqsave(&list_lock, irqflags);
326 if (debug_mask & DEBUG_SUSPEND)
327 print_active_locks(WAKE_LOCK_SUSPEND);
328 has_lock = has_wake_lock_locked(WAKE_LOCK_SUSPEND);
329 if (debug_mask & DEBUG_EXPIRE)
330 pr_info("expire_wake_locks: done, has_lock %ld\n", has_lock);
331 if (has_lock == 0)
332 queue_work(suspend_work_queue, &suspend_work);
333 spin_unlock_irqrestore(&list_lock, irqflags);
334}
335static DEFINE_TIMER(expire_timer, expire_wake_locks, 0, 0);
169 336
170 wl->name = kstrndup(name, len, GFP_KERNEL); 337static int power_suspend_late(struct device *dev)
171 if (!wl->name) { 338{
172 kfree(wl); 339 int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0;
173 return ERR_PTR(-ENOMEM); 340#ifdef CONFIG_WAKELOCK_STAT
174 } 341 wait_for_wakeup = !ret;
175 wl->ws.name = wl->name; 342#endif
176 wakeup_source_add(&wl->ws); 343 if (debug_mask & DEBUG_SUSPEND)
177 rb_link_node(&wl->node, parent, node); 344 pr_info("power_suspend_late return %d\n", ret);
178 rb_insert_color(&wl->node, &wakelocks_tree); 345 return ret;
179 wakelocks_lru_add(wl);
180 increment_wakelocks_number();
181 return wl;
182} 346}
183 347
184int pm_wake_lock(const char *buf) 348static struct dev_pm_ops power_driver_pm_ops = {
349 .suspend_noirq = power_suspend_late,
350};
351
352static struct platform_driver power_driver = {
353 .driver.name = "power",
354 .driver.pm = &power_driver_pm_ops,
355};
356static struct platform_device power_device = {
357 .name = "power",
358};
359
360void wake_lock_init(struct wake_lock *lock, int type, const char *name)
185{ 361{
186 const char *str = buf; 362 unsigned long irqflags = 0;
187 struct wakelock *wl; 363
188 u64 timeout_ns = 0; 364 if (name)
189 size_t len; 365 lock->name = name;
190 int ret = 0; 366 BUG_ON(!lock->name);
191 367
192 if (!capable(CAP_BLOCK_SUSPEND)) 368 if (debug_mask & DEBUG_WAKE_LOCK)
193 return -EPERM; 369 pr_info("wake_lock_init name=%s\n", lock->name);
194 370#ifdef CONFIG_WAKELOCK_STAT
195 while (*str && !isspace(*str)) 371 lock->stat.count = 0;
196 str++; 372 lock->stat.expire_count = 0;
197 373 lock->stat.wakeup_count = 0;
198 len = str - buf; 374 lock->stat.total_time = ktime_set(0, 0);
199 if (!len) 375 lock->stat.prevent_suspend_time = ktime_set(0, 0);
200 return -EINVAL; 376 lock->stat.max_time = ktime_set(0, 0);
201 377 lock->stat.last_time = ktime_set(0, 0);
202 if (*str && *str != '\n') { 378#endif
203 /* Find out if there's a valid timeout string appended. */ 379 lock->flags = (type & WAKE_LOCK_TYPE_MASK) | WAKE_LOCK_INITIALIZED;
204 ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
205 if (ret)
206 return -EINVAL;
207 }
208 380
209 mutex_lock(&wakelocks_lock); 381 INIT_LIST_HEAD(&lock->link);
382 spin_lock_irqsave(&list_lock, irqflags);
383 list_add(&lock->link, &inactive_locks);
384 spin_unlock_irqrestore(&list_lock, irqflags);
385}
386EXPORT_SYMBOL(wake_lock_init);
210 387
211 wl = wakelock_lookup_add(buf, len, true); 388void wake_lock_destroy(struct wake_lock *lock)
212 if (IS_ERR(wl)) { 389{
213 ret = PTR_ERR(wl); 390 unsigned long irqflags;
214 goto out; 391 if (debug_mask & DEBUG_WAKE_LOCK)
392 pr_info("wake_lock_destroy name=%s\n", lock->name);
393 spin_lock_irqsave(&list_lock, irqflags);
394 lock->flags &= ~WAKE_LOCK_INITIALIZED;
395#ifdef CONFIG_WAKELOCK_STAT
396 if (lock->stat.count) {
397 deleted_wake_locks.stat.count += lock->stat.count;
398 deleted_wake_locks.stat.expire_count += lock->stat.expire_count;
399 deleted_wake_locks.stat.total_time =
400 ktime_add(deleted_wake_locks.stat.total_time,
401 lock->stat.total_time);
402 deleted_wake_locks.stat.prevent_suspend_time =
403 ktime_add(deleted_wake_locks.stat.prevent_suspend_time,
404 lock->stat.prevent_suspend_time);
405 deleted_wake_locks.stat.max_time =
406 ktime_add(deleted_wake_locks.stat.max_time,
407 lock->stat.max_time);
215 } 408 }
216 if (timeout_ns) { 409#endif
217 u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; 410 list_del(&lock->link);
411 spin_unlock_irqrestore(&list_lock, irqflags);
412}
413EXPORT_SYMBOL(wake_lock_destroy);
218 414
219 do_div(timeout_ms, NSEC_PER_MSEC); 415static void wake_lock_internal(
220 __pm_wakeup_event(&wl->ws, timeout_ms); 416 struct wake_lock *lock, long timeout, int has_timeout)
417{
418 int type;
419 unsigned long irqflags;
420 long expire_in;
421
422 spin_lock_irqsave(&list_lock, irqflags);
423 type = lock->flags & WAKE_LOCK_TYPE_MASK;
424 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
425 BUG_ON(!(lock->flags & WAKE_LOCK_INITIALIZED));
426#ifdef CONFIG_WAKELOCK_STAT
427 if (type == WAKE_LOCK_SUSPEND && wait_for_wakeup) {
428 if (debug_mask & DEBUG_WAKEUP)
429 pr_info("wakeup wake lock: %s\n", lock->name);
430 wait_for_wakeup = 0;
431 lock->stat.wakeup_count++;
432 }
433 if ((lock->flags & WAKE_LOCK_AUTO_EXPIRE) &&
434 (long)(lock->expires - jiffies) <= 0) {
435 wake_unlock_stat_locked(lock, 0);
436 lock->stat.last_time = ktime_get();
437 }
438#endif
439 if (!(lock->flags & WAKE_LOCK_ACTIVE)) {
440 lock->flags |= WAKE_LOCK_ACTIVE;
441#ifdef CONFIG_WAKELOCK_STAT
442 lock->stat.last_time = ktime_get();
443#endif
444 }
445 list_del(&lock->link);
446 if (has_timeout) {
447 if (debug_mask & DEBUG_WAKE_LOCK)
448 pr_info("wake_lock: %s, type %d, timeout %ld.%03lu\n",
449 lock->name, type, timeout / HZ,
450 (timeout % HZ) * MSEC_PER_SEC / HZ);
451 lock->expires = jiffies + timeout;
452 lock->flags |= WAKE_LOCK_AUTO_EXPIRE;
453 list_add_tail(&lock->link, &active_wake_locks[type]);
221 } else { 454 } else {
222 __pm_stay_awake(&wl->ws); 455 if (debug_mask & DEBUG_WAKE_LOCK)
456 pr_info("wake_lock: %s, type %d\n", lock->name, type);
457 lock->expires = LONG_MAX;
458 lock->flags &= ~WAKE_LOCK_AUTO_EXPIRE;
459 list_add(&lock->link, &active_wake_locks[type]);
460 }
461 if (type == WAKE_LOCK_SUSPEND) {
462 current_event_num++;
463#ifdef CONFIG_WAKELOCK_STAT
464 if (lock == &main_wake_lock)
465 update_sleep_wait_stats_locked(1);
466 else if (!wake_lock_active(&main_wake_lock))
467 update_sleep_wait_stats_locked(0);
468#endif
469 if (has_timeout)
470 expire_in = has_wake_lock_locked(type);
471 else
472 expire_in = -1;
473 if (expire_in > 0) {
474 if (debug_mask & DEBUG_EXPIRE)
475 pr_info("wake_lock: %s, start expire timer, "
476 "%ld\n", lock->name, expire_in);
477 mod_timer(&expire_timer, jiffies + expire_in);
478 } else {
479 if (del_timer(&expire_timer))
480 if (debug_mask & DEBUG_EXPIRE)
481 pr_info("wake_lock: %s, stop expire timer\n",
482 lock->name);
483 if (expire_in == 0)
484 queue_work(suspend_work_queue, &suspend_work);
485 }
223 } 486 }
487 spin_unlock_irqrestore(&list_lock, irqflags);
488}
224 489
225 wakelocks_lru_most_recent(wl); 490void wake_lock(struct wake_lock *lock)
491{
492 wake_lock_internal(lock, 0, 0);
493}
494EXPORT_SYMBOL(wake_lock);
226 495
227 out: 496void wake_lock_timeout(struct wake_lock *lock, long timeout)
228 mutex_unlock(&wakelocks_lock); 497{
229 return ret; 498 wake_lock_internal(lock, timeout, 1);
230} 499}
500EXPORT_SYMBOL(wake_lock_timeout);
231 501
232int pm_wake_unlock(const char *buf) 502void wake_unlock(struct wake_lock *lock)
233{ 503{
234 struct wakelock *wl; 504 int type;
235 size_t len; 505 unsigned long irqflags;
236 int ret = 0; 506 spin_lock_irqsave(&list_lock, irqflags);
507 type = lock->flags & WAKE_LOCK_TYPE_MASK;
508#ifdef CONFIG_WAKELOCK_STAT
509 wake_unlock_stat_locked(lock, 0);
510#endif
511 if (debug_mask & DEBUG_WAKE_LOCK)
512 pr_info("wake_unlock: %s\n", lock->name);
513 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
514 list_del(&lock->link);
515 list_add(&lock->link, &inactive_locks);
516 if (type == WAKE_LOCK_SUSPEND) {
517 long has_lock = has_wake_lock_locked(type);
518 if (has_lock > 0) {
519 if (debug_mask & DEBUG_EXPIRE)
520 pr_info("wake_unlock: %s, start expire timer, "
521 "%ld\n", lock->name, has_lock);
522 mod_timer(&expire_timer, jiffies + has_lock);
523 } else {
524 if (del_timer(&expire_timer))
525 if (debug_mask & DEBUG_EXPIRE)
526 pr_info("wake_unlock: %s, stop expire "
527 "timer\n", lock->name);
528 if (has_lock == 0)
529 queue_work(suspend_work_queue, &suspend_work);
530 }
531 if (lock == &main_wake_lock) {
532 if (debug_mask & DEBUG_SUSPEND)
533 print_active_locks(WAKE_LOCK_SUSPEND);
534#ifdef CONFIG_WAKELOCK_STAT
535 update_sleep_wait_stats_locked(0);
536#endif
537 }
538 }
539 spin_unlock_irqrestore(&list_lock, irqflags);
540}
541EXPORT_SYMBOL(wake_unlock);
237 542
238 if (!capable(CAP_BLOCK_SUSPEND)) 543int wake_lock_active(struct wake_lock *lock)
239 return -EPERM; 544{
545 return !!(lock->flags & WAKE_LOCK_ACTIVE);
546}
547EXPORT_SYMBOL(wake_lock_active);
548
549static int wakelock_stats_open(struct inode *inode, struct file *file)
550{
551 return single_open(file, wakelock_stats_show, NULL);
552}
240 553
241 len = strlen(buf); 554static const struct file_operations wakelock_stats_fops = {
242 if (!len) 555 .owner = THIS_MODULE,
243 return -EINVAL; 556 .open = wakelock_stats_open,
557 .read = seq_read,
558 .llseek = seq_lseek,
559 .release = single_release,
560};
244 561
245 if (buf[len-1] == '\n') 562static int __init wakelocks_init(void)
246 len--; 563{
564 int ret;
565 int i;
247 566
248 if (!len) 567 for (i = 0; i < ARRAY_SIZE(active_wake_locks); i++)
249 return -EINVAL; 568 INIT_LIST_HEAD(&active_wake_locks[i]);
250 569
251 mutex_lock(&wakelocks_lock); 570#ifdef CONFIG_WAKELOCK_STAT
571 wake_lock_init(&deleted_wake_locks, WAKE_LOCK_SUSPEND,
572 "deleted_wake_locks");
573#endif
574 wake_lock_init(&main_wake_lock, WAKE_LOCK_SUSPEND, "main");
575 wake_lock(&main_wake_lock);
576 wake_lock_init(&unknown_wakeup, WAKE_LOCK_SUSPEND, "unknown_wakeups");
577 wake_lock_init(&suspend_backoff_lock, WAKE_LOCK_SUSPEND,
578 "suspend_backoff");
579
580 ret = platform_device_register(&power_device);
581 if (ret) {
582 pr_err("wakelocks_init: platform_device_register failed\n");
583 goto err_platform_device_register;
584 }
585 ret = platform_driver_register(&power_driver);
586 if (ret) {
587 pr_err("wakelocks_init: platform_driver_register failed\n");
588 goto err_platform_driver_register;
589 }
252 590
253 wl = wakelock_lookup_add(buf, len, false); 591 suspend_work_queue = create_singlethread_workqueue("suspend");
254 if (IS_ERR(wl)) { 592 if (suspend_work_queue == NULL) {
255 ret = PTR_ERR(wl); 593 ret = -ENOMEM;
256 goto out; 594 goto err_suspend_work_queue;
257 } 595 }
258 __pm_relax(&wl->ws);
259 596
260 wakelocks_lru_most_recent(wl); 597#ifdef CONFIG_WAKELOCK_STAT
261 wakelocks_gc(); 598 proc_create("wakelocks", S_IRUGO, NULL, &wakelock_stats_fops);
599#endif
262 600
263 out: 601 return 0;
264 mutex_unlock(&wakelocks_lock); 602
603err_suspend_work_queue:
604 platform_driver_unregister(&power_driver);
605err_platform_driver_register:
606 platform_device_unregister(&power_device);
607err_platform_device_register:
608 wake_lock_destroy(&suspend_backoff_lock);
609 wake_lock_destroy(&unknown_wakeup);
610 wake_lock_destroy(&main_wake_lock);
611#ifdef CONFIG_WAKELOCK_STAT
612 wake_lock_destroy(&deleted_wake_locks);
613#endif
265 return ret; 614 return ret;
266} 615}
616
617static void __exit wakelocks_exit(void)
618{
619#ifdef CONFIG_WAKELOCK_STAT
620 remove_proc_entry("wakelocks", NULL);
621#endif
622 destroy_workqueue(suspend_work_queue);
623 platform_driver_unregister(&power_driver);
624 platform_device_unregister(&power_device);
625 wake_lock_destroy(&suspend_backoff_lock);
626 wake_lock_destroy(&unknown_wakeup);
627 wake_lock_destroy(&main_wake_lock);
628#ifdef CONFIG_WAKELOCK_STAT
629 wake_lock_destroy(&deleted_wake_locks);
630#endif
631}
632
633core_initcall(wakelocks_init);
634module_exit(wakelocks_exit);
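For context, a minimal sketch of how a client might use the wakelock API restored above (wake_lock_init, wake_lock_timeout, wake_unlock, wake_lock_destroy). The driver, its trigger points, and the 2-second timeout are illustrative only and assume the matching <linux/wakelock.h> header from this series:

/* Hypothetical consumer: hold a suspend wakelock across an I/O burst. */
#include <linux/wakelock.h>

static struct wake_lock demo_wake_lock;

static int demo_probe(void)
{
	wake_lock_init(&demo_wake_lock, WAKE_LOCK_SUSPEND, "demo");
	return 0;
}

static void demo_irq_work(void)
{
	/* Keep the system awake for at most 2 s; expire_timer drops it. */
	wake_lock_timeout(&demo_wake_lock, 2 * HZ);
}

static void demo_work_done(void)
{
	/* Release early once the deferred work has finished. */
	wake_unlock(&demo_wake_lock);
}

static void demo_remove(void)
{
	wake_lock_destroy(&demo_wake_lock);
}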
diff --git a/kernel/printk.c b/kernel/printk.c
index 357f714ddd4..1baace7d867 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,13 +41,9 @@
41#include <linux/cpu.h> 41#include <linux/cpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h>
45 44
46#include <asm/uaccess.h> 45#include <asm/uaccess.h>
47 46
48#define CREATE_TRACE_POINTS
49#include <trace/events/printk.h>
50
51/* 47/*
52 * Architectures can override it: 48 * Architectures can override it:
53 */ 49 */
@@ -55,6 +51,12 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
55{ 51{
56} 52}
57 53
54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
55
56#ifdef CONFIG_DEBUG_LL
57extern void printascii(char *);
58#endif
59
58/* printk's without a loglevel use this.. */ 60/* printk's without a loglevel use this.. */
59#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 61#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
60 62
@@ -87,12 +89,6 @@ static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 89struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
89 91
90#ifdef CONFIG_LOCKDEP
91static struct lockdep_map console_lock_dep_map = {
92 .name = "console_lock"
93};
94#endif
95
96/* 92/*
97 * This is used for debugging the mess that is the VT code by 93 * This is used for debugging the mess that is the VT code by
98 * keeping track if we have the console semaphore held. It's 94 * keeping track if we have the console semaphore held. It's
@@ -104,6 +100,24 @@ static struct lockdep_map console_lock_dep_map = {
104static int console_locked, console_suspended; 100static int console_locked, console_suspended;
105 101
106/* 102/*
103 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
104 * It is also used in interesting ways to provide interlocking in
105 * console_unlock();.
106 */
107static DEFINE_SPINLOCK(logbuf_lock);
108
109#define LOG_BUF_MASK (log_buf_len-1)
110#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
111
112/*
113 * The indices into log_buf are not constrained to log_buf_len - they
114 * must be masked before subscripting
115 */
116static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
117static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
118static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
119
120/*
107 * If exclusive_console is non-NULL then only this console is to be printed to. 121 * If exclusive_console is non-NULL then only this console is to be printed to.
108 */ 122 */
109static struct console *exclusive_console; 123static struct console *exclusive_console;
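The log_start/con_start/log_end scheme introduced above keeps the indices growing without bound and masks them only when subscripting, exactly as the comment says. A small userspace sketch of the same convention; the buffer size, macro names and message are illustrative:

#include <stdio.h>

#define BUF_LEN  16                      /* must be a power of two */
#define BUF_MASK (BUF_LEN - 1)
#define BUF(idx) (buf[(idx) & BUF_MASK]) /* mask only on access */

static char buf[BUF_LEN];

int main(void)
{
	unsigned int start = 0, end = 0;     /* unmasked, monotonically increasing */
	const char *msg = "hello, ring buffer wrap-around";
	const char *p;

	for (p = msg; *p; p++) {
		BUF(end++) = *p;
		if (end - start > BUF_LEN)   /* reader fell behind: drop oldest */
			start = end - BUF_LEN;
	}
	while (start != end)                 /* drain: only the newest BUF_LEN chars remain */
		putchar(BUF(start++));
	putchar('\n');
	return 0;
}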
@@ -132,537 +146,13 @@ EXPORT_SYMBOL(console_set_on_cmdline);
132/* Flag: console code may call schedule() */ 146/* Flag: console code may call schedule() */
133static int console_may_schedule; 147static int console_may_schedule;
134 148
135/*
136 * The printk log buffer consists of a chain of concatenated variable
137 * length records. Every record starts with a record header, containing
138 * the overall length of the record.
139 *
140 * The heads to the first and last entry in the buffer, as well as the
 141 * sequence numbers of both entries are maintained when messages
 142 * are stored.
143 *
144 * If the heads indicate available messages, the length in the header
 145 * tells the start of the next message. A length == 0 for the next message
146 * indicates a wrap-around to the beginning of the buffer.
147 *
148 * Every record carries the monotonic timestamp in microseconds, as well as
149 * the standard userspace syslog level and syslog facility. The usual
150 * kernel messages use LOG_KERN; userspace-injected messages always carry
151 * a matching syslog facility, by default LOG_USER. The origin of every
152 * message can be reliably determined that way.
153 *
154 * The human readable log message directly follows the message header. The
155 * length of the message text is stored in the header, the stored message
156 * is not terminated.
157 *
158 * Optionally, a message can carry a dictionary of properties (key/value pairs),
159 * to provide userspace with a machine-readable message context.
160 *
161 * Examples for well-defined, commonly used property names are:
162 * DEVICE=b12:8 device identifier
163 * b12:8 block dev_t
164 * c127:3 char dev_t
165 * n8 netdev ifindex
166 * +sound:card0 subsystem:devname
167 * SUBSYSTEM=pci driver-core subsystem name
168 *
169 * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
170 * follows directly after a '=' character. Every property is terminated by
171 * a '\0' character. The last property is not terminated.
172 *
173 * Example of a message structure:
174 * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
175 * 0008 34 00 record is 52 bytes long
176 * 000a 0b 00 text is 11 bytes long
177 * 000c 1f 00 dictionary is 23 bytes long
178 * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
179 * 0010 69 74 27 73 20 61 20 6c "it's a l"
180 * 69 6e 65 "ine"
181 * 001b 44 45 56 49 43 "DEVIC"
182 * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
183 * 52 49 56 45 52 3d 62 75 "RIVER=bu"
184 * 67 "g"
185 * 0032 00 00 00 padding to next message header
186 *
187 * The 'struct log' buffer header must never be directly exported to
188 * userspace, it is a kernel-private implementation detail that might
189 * need to be changed in the future, when the requirements change.
190 *
191 * /dev/kmsg exports the structured data in the following line format:
192 * "level,sequnum,timestamp;<message text>\n"
193 *
194 * The optional key/value pairs are attached as continuation lines starting
195 * with a space character and terminated by a newline. All possible
 196 * non-printable characters are escaped in the "\xff" notation.
197 *
198 * Users of the export format should ignore possible additional values
199 * separated by ',', and find the message after the ';' character.
200 */
201
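As a rough illustration of the export format documented above, a userspace sketch that splits one /dev/kmsg line into its numeric prefix and message text. It reads only the first three comma-separated values and ignores any further ones before the ';', as the comment recommends; the sample record is made up:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Invented sample: prefix 6 (facility 0, level 6), seq 1234,
	 * timestamp in microseconds, '-' continuation flag, then the text. */
	char line[] = "6,1234,5150280123,-;it's a line\n";
	char *semi = strchr(line, ';');
	unsigned int prefix;
	unsigned long long seq, ts_usec;

	if (!semi)
		return 1;
	*semi = '\0';
	if (sscanf(line, "%u,%llu,%llu", &prefix, &seq, &ts_usec) != 3)
		return 1;
	printf("facility=%u level=%u seq=%llu ts=%llu.%06llus msg=%s",
	       prefix >> 3, prefix & 7, seq,
	       ts_usec / 1000000, ts_usec % 1000000, semi + 1);
	return 0;
}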
202enum log_flags {
203 LOG_NOCONS = 1, /* already flushed, do not print to console */
204 LOG_NEWLINE = 2, /* text ended with a newline */
205 LOG_PREFIX = 4, /* text started with a prefix */
206 LOG_CONT = 8, /* text is a fragment of a continuation line */
207};
208
209struct log {
210 u64 ts_nsec; /* timestamp in nanoseconds */
211 u16 len; /* length of entire record */
212 u16 text_len; /* length of text buffer */
213 u16 dict_len; /* length of dictionary buffer */
214 u8 facility; /* syslog facility */
215 u8 flags:5; /* internal record flags */
216 u8 level:3; /* syslog level */
217};
218
219/*
220 * The logbuf_lock protects kmsg buffer, indices, counters. It is also
221 * used in interesting ways to provide interlocking in console_unlock();
222 */
223static DEFINE_RAW_SPINLOCK(logbuf_lock);
224
225#ifdef CONFIG_PRINTK 149#ifdef CONFIG_PRINTK
226/* the next printk record to read by syslog(READ) or /proc/kmsg */
227static u64 syslog_seq;
228static u32 syslog_idx;
229static enum log_flags syslog_prev;
230static size_t syslog_partial;
231
232/* index and sequence number of the first record stored in the buffer */
233static u64 log_first_seq;
234static u32 log_first_idx;
235
236/* index and sequence number of the next record to store in the buffer */
237static u64 log_next_seq;
238static u32 log_next_idx;
239
240/* the next printk record to write to the console */
241static u64 console_seq;
242static u32 console_idx;
243static enum log_flags console_prev;
244
245/* the next printk record to read after the last 'clear' command */
246static u64 clear_seq;
247static u32 clear_idx;
248
249#define PREFIX_MAX 32
250#define LOG_LINE_MAX 1024 - PREFIX_MAX
251
252/* record buffer */
253#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
254#define LOG_ALIGN 4
255#else
256#define LOG_ALIGN __alignof__(struct log)
257#endif
258#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
259static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
260static char *log_buf = __log_buf;
261static u32 log_buf_len = __LOG_BUF_LEN;
262
263/* cpu currently holding logbuf_lock */
264static volatile unsigned int logbuf_cpu = UINT_MAX;
265
266/* human readable text of the record */
267static char *log_text(const struct log *msg)
268{
269 return (char *)msg + sizeof(struct log);
270}
271
272/* optional key/value pair dictionary attached to the record */
273static char *log_dict(const struct log *msg)
274{
275 return (char *)msg + sizeof(struct log) + msg->text_len;
276}
277
278/* get record by index; idx must point to valid msg */
279static struct log *log_from_idx(u32 idx)
280{
281 struct log *msg = (struct log *)(log_buf + idx);
282
283 /*
284 * A length == 0 record is the end of buffer marker. Wrap around and
285 * read the message at the start of the buffer.
286 */
287 if (!msg->len)
288 return (struct log *)log_buf;
289 return msg;
290}
291
292/* get next record; idx must point to valid msg */
293static u32 log_next(u32 idx)
294{
295 struct log *msg = (struct log *)(log_buf + idx);
296
297 /* length == 0 indicates the end of the buffer; wrap */
298 /*
299 * A length == 0 record is the end of buffer marker. Wrap around and
300 * read the message at the start of the buffer as *this* one, and
301 * return the one after that.
302 */
303 if (!msg->len) {
304 msg = (struct log *)log_buf;
305 return msg->len;
306 }
307 return idx + msg->len;
308}
309
310/* insert record into the buffer, discard old ones, update heads */
311static void log_store(int facility, int level,
312 enum log_flags flags, u64 ts_nsec,
313 const char *dict, u16 dict_len,
314 const char *text, u16 text_len)
315{
316 struct log *msg;
317 u32 size, pad_len;
318
319 /* number of '\0' padding bytes to next message */
320 size = sizeof(struct log) + text_len + dict_len;
321 pad_len = (-size) & (LOG_ALIGN - 1);
322 size += pad_len;
323
324 while (log_first_seq < log_next_seq) {
325 u32 free;
326
327 if (log_next_idx > log_first_idx)
328 free = max(log_buf_len - log_next_idx, log_first_idx);
329 else
330 free = log_first_idx - log_next_idx;
331
332 if (free > size + sizeof(struct log))
333 break;
334
 335 /* drop old messages until we have enough continuous space */
336 log_first_idx = log_next(log_first_idx);
337 log_first_seq++;
338 }
339
340 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
341 /*
342 * This message + an additional empty header does not fit
343 * at the end of the buffer. Add an empty header with len == 0
344 * to signify a wrap around.
345 */
346 memset(log_buf + log_next_idx, 0, sizeof(struct log));
347 log_next_idx = 0;
348 }
349
350 /* fill message */
351 msg = (struct log *)(log_buf + log_next_idx);
352 memcpy(log_text(msg), text, text_len);
353 msg->text_len = text_len;
354 memcpy(log_dict(msg), dict, dict_len);
355 msg->dict_len = dict_len;
356 msg->facility = facility;
357 msg->level = level & 7;
358 msg->flags = flags & 0x1f;
359 if (ts_nsec > 0)
360 msg->ts_nsec = ts_nsec;
361 else
362 msg->ts_nsec = local_clock();
363 memset(log_dict(msg) + dict_len, 0, pad_len);
364 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
365
366 /* insert message */
367 log_next_idx += msg->len;
368 log_next_seq++;
369}
370
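A worked sketch of the size and padding arithmetic used by log_store() above. The 16-byte header matches the struct log layout shown earlier under typical packing, but treat the concrete numbers as illustrative:

#include <stdio.h>

#define LOG_ALIGN   4   /* the cheap-unaligned-access case above */
#define HEADER_SIZE 16  /* assumed sizeof(struct log) for this layout */

static unsigned int record_size(unsigned int text_len, unsigned int dict_len)
{
	unsigned int size = HEADER_SIZE + text_len + dict_len;
	unsigned int pad = (-size) & (LOG_ALIGN - 1); /* bytes up to the next multiple of LOG_ALIGN */

	return size + pad;
}

int main(void)
{
	/* 11 bytes of text plus a 23-byte dictionary: 16 + 11 + 23 = 50,
	 * padded to 52, the record length quoted in the example above. */
	printf("%u\n", record_size(11, 23));
	return 0;
}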
371/* /dev/kmsg - userspace message inject/listen interface */
372struct devkmsg_user {
373 u64 seq;
374 u32 idx;
375 enum log_flags prev;
376 struct mutex lock;
377 char buf[8192];
378};
379
380static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
381 unsigned long count, loff_t pos)
382{
383 char *buf, *line;
384 int i;
385 int level = default_message_loglevel;
386 int facility = 1; /* LOG_USER */
387 size_t len = iov_length(iv, count);
388 ssize_t ret = len;
389
390 if (len > LOG_LINE_MAX)
391 return -EINVAL;
392 buf = kmalloc(len+1, GFP_KERNEL);
393 if (buf == NULL)
394 return -ENOMEM;
395
396 line = buf;
397 for (i = 0; i < count; i++) {
398 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
399 ret = -EFAULT;
400 goto out;
401 }
402 line += iv[i].iov_len;
403 }
404
405 /*
406 * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
 407 * the decimal value is a 32-bit number, the lower 3 bits are the log
 408 * level, the rest are the log facility.
409 *
410 * If no prefix or no userspace facility is specified, we
411 * enforce LOG_USER, to be able to reliably distinguish
412 * kernel-generated messages from userspace-injected ones.
413 */
414 line = buf;
415 if (line[0] == '<') {
416 char *endp = NULL;
417
418 i = simple_strtoul(line+1, &endp, 10);
419 if (endp && endp[0] == '>') {
420 level = i & 7;
421 if (i >> 3)
422 facility = i >> 3;
423 endp++;
424 len -= endp - line;
425 line = endp;
426 }
427 }
428 line[len] = '\0';
429
430 printk_emit(facility, level, NULL, 0, "%s", line);
431out:
432 kfree(buf);
433 return ret;
434}
435
436static ssize_t devkmsg_read(struct file *file, char __user *buf,
437 size_t count, loff_t *ppos)
438{
439 struct devkmsg_user *user = file->private_data;
440 struct log *msg;
441 u64 ts_usec;
442 size_t i;
443 char cont = '-';
444 size_t len;
445 ssize_t ret;
446
447 if (!user)
448 return -EBADF;
449
450 ret = mutex_lock_interruptible(&user->lock);
451 if (ret)
452 return ret;
453 raw_spin_lock_irq(&logbuf_lock);
454 while (user->seq == log_next_seq) {
455 if (file->f_flags & O_NONBLOCK) {
456 ret = -EAGAIN;
457 raw_spin_unlock_irq(&logbuf_lock);
458 goto out;
459 }
460
461 raw_spin_unlock_irq(&logbuf_lock);
462 ret = wait_event_interruptible(log_wait,
463 user->seq != log_next_seq);
464 if (ret)
465 goto out;
466 raw_spin_lock_irq(&logbuf_lock);
467 }
468
469 if (user->seq < log_first_seq) {
470 /* our last seen message is gone, return error and reset */
471 user->idx = log_first_idx;
472 user->seq = log_first_seq;
473 ret = -EPIPE;
474 raw_spin_unlock_irq(&logbuf_lock);
475 goto out;
476 }
477
478 msg = log_from_idx(user->idx);
479 ts_usec = msg->ts_nsec;
480 do_div(ts_usec, 1000);
481
482 /*
483 * If we couldn't merge continuation line fragments during the print,
484 * export the stored flags to allow an optional external merge of the
 485 * records. Merging the records isn't necessarily correct, like
 486 * when we hit a race during printing. In most cases though, it produces
 487 * more readable output. 'c' in the record flags marks the first
488 * fragment of a line, '+' the following.
489 */
490 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
491 cont = 'c';
492 else if ((msg->flags & LOG_CONT) ||
493 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
494 cont = '+';
495
496 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
497 (msg->facility << 3) | msg->level,
498 user->seq, ts_usec, cont);
499 user->prev = msg->flags;
500
501 /* escape non-printable characters */
502 for (i = 0; i < msg->text_len; i++) {
503 unsigned char c = log_text(msg)[i];
504
505 if (c < ' ' || c >= 127 || c == '\\')
506 len += sprintf(user->buf + len, "\\x%02x", c);
507 else
508 user->buf[len++] = c;
509 }
510 user->buf[len++] = '\n';
511
512 if (msg->dict_len) {
513 bool line = true;
514
515 for (i = 0; i < msg->dict_len; i++) {
516 unsigned char c = log_dict(msg)[i];
517
518 if (line) {
519 user->buf[len++] = ' ';
520 line = false;
521 }
522
523 if (c == '\0') {
524 user->buf[len++] = '\n';
525 line = true;
526 continue;
527 }
528
529 if (c < ' ' || c >= 127 || c == '\\') {
530 len += sprintf(user->buf + len, "\\x%02x", c);
531 continue;
532 }
533
534 user->buf[len++] = c;
535 }
536 user->buf[len++] = '\n';
537 }
538
539 user->idx = log_next(user->idx);
540 user->seq++;
541 raw_spin_unlock_irq(&logbuf_lock);
542
543 if (len > count) {
544 ret = -EINVAL;
545 goto out;
546 }
547
548 if (copy_to_user(buf, user->buf, len)) {
549 ret = -EFAULT;
550 goto out;
551 }
552 ret = len;
553out:
554 mutex_unlock(&user->lock);
555 return ret;
556}
557
558static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
559{
560 struct devkmsg_user *user = file->private_data;
561 loff_t ret = 0;
562
563 if (!user)
564 return -EBADF;
565 if (offset)
566 return -ESPIPE;
567
568 raw_spin_lock_irq(&logbuf_lock);
569 switch (whence) {
570 case SEEK_SET:
571 /* the first record */
572 user->idx = log_first_idx;
573 user->seq = log_first_seq;
574 break;
575 case SEEK_DATA:
576 /*
577 * The first record after the last SYSLOG_ACTION_CLEAR,
578 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
579 * changes no global state, and does not clear anything.
580 */
581 user->idx = clear_idx;
582 user->seq = clear_seq;
583 break;
584 case SEEK_END:
585 /* after the last record */
586 user->idx = log_next_idx;
587 user->seq = log_next_seq;
588 break;
589 default:
590 ret = -EINVAL;
591 }
592 raw_spin_unlock_irq(&logbuf_lock);
593 return ret;
594}
595
596static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
597{
598 struct devkmsg_user *user = file->private_data;
599 int ret = 0;
600
601 if (!user)
602 return POLLERR|POLLNVAL;
603
604 poll_wait(file, &log_wait, wait);
605
606 raw_spin_lock_irq(&logbuf_lock);
607 if (user->seq < log_next_seq) {
608 /* return error when data has vanished underneath us */
609 if (user->seq < log_first_seq)
610 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
611 ret = POLLIN|POLLRDNORM;
612 }
613 raw_spin_unlock_irq(&logbuf_lock);
614
615 return ret;
616}
617
618static int devkmsg_open(struct inode *inode, struct file *file)
619{
620 struct devkmsg_user *user;
621 int err;
622
623 /* write-only does not need any file context */
624 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
625 return 0;
626
627 err = security_syslog(SYSLOG_ACTION_READ_ALL);
628 if (err)
629 return err;
630
631 user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
632 if (!user)
633 return -ENOMEM;
634
635 mutex_init(&user->lock);
636
637 raw_spin_lock_irq(&logbuf_lock);
638 user->idx = log_first_idx;
639 user->seq = log_first_seq;
640 raw_spin_unlock_irq(&logbuf_lock);
641
642 file->private_data = user;
643 return 0;
644}
645
646static int devkmsg_release(struct inode *inode, struct file *file)
647{
648 struct devkmsg_user *user = file->private_data;
649
650 if (!user)
651 return 0;
652 150
653 mutex_destroy(&user->lock); 151static char __log_buf[__LOG_BUF_LEN];
654 kfree(user); 152static char *log_buf = __log_buf;
655 return 0; 153static int log_buf_len = __LOG_BUF_LEN;
656} 154static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
657 155static int saved_console_loglevel = -1;
658const struct file_operations kmsg_fops = {
659 .open = devkmsg_open,
660 .read = devkmsg_read,
661 .aio_write = devkmsg_writev,
662 .llseek = devkmsg_llseek,
663 .poll = devkmsg_poll,
664 .release = devkmsg_release,
665};
666 156
667#ifdef CONFIG_KEXEC 157#ifdef CONFIG_KEXEC
668/* 158/*
@@ -676,18 +166,9 @@ const struct file_operations kmsg_fops = {
676void log_buf_kexec_setup(void) 166void log_buf_kexec_setup(void)
677{ 167{
678 VMCOREINFO_SYMBOL(log_buf); 168 VMCOREINFO_SYMBOL(log_buf);
169 VMCOREINFO_SYMBOL(log_end);
679 VMCOREINFO_SYMBOL(log_buf_len); 170 VMCOREINFO_SYMBOL(log_buf_len);
680 VMCOREINFO_SYMBOL(log_first_idx); 171 VMCOREINFO_SYMBOL(logged_chars);
681 VMCOREINFO_SYMBOL(log_next_idx);
682 /*
683 * Export struct log size and field offsets. User space tools can
684 * parse it and detect any changes to structure down the line.
685 */
686 VMCOREINFO_STRUCT_SIZE(log);
687 VMCOREINFO_OFFSET(log, ts_nsec);
688 VMCOREINFO_OFFSET(log, len);
689 VMCOREINFO_OFFSET(log, text_len);
690 VMCOREINFO_OFFSET(log, dict_len);
691} 172}
692#endif 173#endif
693 174
@@ -711,6 +192,7 @@ early_param("log_buf_len", log_buf_len_setup);
711void __init setup_log_buf(int early) 192void __init setup_log_buf(int early)
712{ 193{
713 unsigned long flags; 194 unsigned long flags;
195 unsigned start, dest_idx, offset;
714 char *new_log_buf; 196 char *new_log_buf;
715 int free; 197 int free;
716 198
@@ -721,7 +203,7 @@ void __init setup_log_buf(int early)
721 unsigned long mem; 203 unsigned long mem;
722 204
723 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 205 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
724 if (!mem) 206 if (mem == MEMBLOCK_ERROR)
725 return; 207 return;
726 new_log_buf = __va(mem); 208 new_log_buf = __va(mem);
727 } else { 209 } else {
@@ -734,34 +216,31 @@ void __init setup_log_buf(int early)
734 return; 216 return;
735 } 217 }
736 218
737 raw_spin_lock_irqsave(&logbuf_lock, flags); 219 spin_lock_irqsave(&logbuf_lock, flags);
738 log_buf_len = new_log_buf_len; 220 log_buf_len = new_log_buf_len;
739 log_buf = new_log_buf; 221 log_buf = new_log_buf;
740 new_log_buf_len = 0; 222 new_log_buf_len = 0;
741 free = __LOG_BUF_LEN - log_next_idx; 223 free = __LOG_BUF_LEN - log_end;
742 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 224
743 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 225 offset = start = min(con_start, log_start);
226 dest_idx = 0;
227 while (start != log_end) {
228 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
229
230 log_buf[dest_idx] = __log_buf[log_idx_mask];
231 start++;
232 dest_idx++;
233 }
234 log_start -= offset;
235 con_start -= offset;
236 log_end -= offset;
237 spin_unlock_irqrestore(&logbuf_lock, flags);
744 238
745 pr_info("log_buf_len: %d\n", log_buf_len); 239 pr_info("log_buf_len: %d\n", log_buf_len);
746 pr_info("early log buf free: %d(%d%%)\n", 240 pr_info("early log buf free: %d(%d%%)\n",
747 free, (free * 100) / __LOG_BUF_LEN); 241 free, (free * 100) / __LOG_BUF_LEN);
748} 242}
749 243
750static bool __read_mostly ignore_loglevel;
751
752static int __init ignore_loglevel_setup(char *str)
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup);
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
 763 "print all kernel messages to the console.");
764
765#ifdef CONFIG_BOOT_PRINTK_DELAY 244#ifdef CONFIG_BOOT_PRINTK_DELAY
766 245
767static int boot_delay; /* msecs delay after each printk during bootup */ 246static int boot_delay; /* msecs delay after each printk during bootup */
@@ -785,15 +264,13 @@ static int __init boot_delay_setup(char *str)
785} 264}
786__setup("boot_delay=", boot_delay_setup); 265__setup("boot_delay=", boot_delay_setup);
787 266
788static void boot_delay_msec(int level) 267static void boot_delay_msec(void)
789{ 268{
790 unsigned long long k; 269 unsigned long long k;
791 unsigned long timeout; 270 unsigned long timeout;
792 271
793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) 272 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
795 return; 273 return;
796 }
797 274
798 k = (unsigned long long)loops_per_msec * boot_delay; 275 k = (unsigned long long)loops_per_msec * boot_delay;
799 276
@@ -812,11 +289,58 @@ static void boot_delay_msec(int level)
812 } 289 }
813} 290}
814#else 291#else
815static inline void boot_delay_msec(int level) 292static inline void boot_delay_msec(void)
816{ 293{
817} 294}
818#endif 295#endif
819 296
297/*
298 * Return the number of unread characters in the log buffer.
299 */
300static int log_buf_get_len(void)
301{
302 return logged_chars;
303}
304
305/*
306 * Clears the ring-buffer
307 */
308void log_buf_clear(void)
309{
310 logged_chars = 0;
311}
312
313/*
314 * Copy a range of characters from the log buffer.
315 */
316int log_buf_copy(char *dest, int idx, int len)
317{
318 int ret, max;
319 bool took_lock = false;
320
321 if (!oops_in_progress) {
322 spin_lock_irq(&logbuf_lock);
323 took_lock = true;
324 }
325
326 max = log_buf_get_len();
327 if (idx < 0 || idx >= max) {
328 ret = -1;
329 } else {
330 if (len > max - idx)
331 len = max - idx;
332 ret = len;
333 idx += (log_end - max);
334 while (len-- > 0)
335 dest[len] = LOG_BUF(idx + len);
336 }
337
338 if (took_lock)
339 spin_unlock_irq(&logbuf_lock);
340
341 return ret;
342}
343
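A hedged sketch of how an in-kernel consumer could drain the buffer through log_buf_copy() above; no such caller appears in this hunk, and the chunk size and consumer callback are hypothetical:

/* log_buf_copy() returns the number of bytes copied, or -1 once idx is
 * past the end of the logged data, so a simple loop drains everything. */
static void demo_dump_log(void (*consume)(const char *buf, int len))
{
	char chunk[256];
	int idx = 0;
	int n;

	while ((n = log_buf_copy(chunk, idx, sizeof(chunk))) > 0) {
		consume(chunk, n);
		idx += n;
	}
}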
820#ifdef CONFIG_SECURITY_DMESG_RESTRICT 344#ifdef CONFIG_SECURITY_DMESG_RESTRICT
821int dmesg_restrict = 1; 345int dmesg_restrict = 1;
822#else 346#else
@@ -856,275 +380,11 @@ static int check_syslog_permissions(int type, bool from_file)
856 return 0; 380 return 0;
857} 381}
858 382
859#if defined(CONFIG_PRINTK_TIME)
860static bool printk_time = 1;
861#else
862static bool printk_time;
863#endif
864module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
865
866static size_t print_time(u64 ts, char *buf)
867{
868 unsigned long rem_nsec;
869
870 if (!printk_time)
871 return 0;
872
873 rem_nsec = do_div(ts, 1000000000);
874
875 if (!buf)
876 return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
877
878 return sprintf(buf, "[%5lu.%06lu] ",
879 (unsigned long)ts, rem_nsec / 1000);
880}
881
882static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
883{
884 size_t len = 0;
885 unsigned int prefix = (msg->facility << 3) | msg->level;
886
887 if (syslog) {
888 if (buf) {
889 len += sprintf(buf, "<%u>", prefix);
890 } else {
891 len += 3;
892 if (prefix > 999)
893 len += 3;
894 else if (prefix > 99)
895 len += 2;
896 else if (prefix > 9)
897 len++;
898 }
899 }
900
901 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
902 return len;
903}
904
905static size_t msg_print_text(const struct log *msg, enum log_flags prev,
906 bool syslog, char *buf, size_t size)
907{
908 const char *text = log_text(msg);
909 size_t text_size = msg->text_len;
910 bool prefix = true;
911 bool newline = true;
912 size_t len = 0;
913
914 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
915 prefix = false;
916
917 if (msg->flags & LOG_CONT) {
918 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
919 prefix = false;
920
921 if (!(msg->flags & LOG_NEWLINE))
922 newline = false;
923 }
924
925 do {
926 const char *next = memchr(text, '\n', text_size);
927 size_t text_len;
928
929 if (next) {
930 text_len = next - text;
931 next++;
932 text_size -= next - text;
933 } else {
934 text_len = text_size;
935 }
936
937 if (buf) {
938 if (print_prefix(msg, syslog, NULL) +
939 text_len + 1 >= size - len)
940 break;
941
942 if (prefix)
943 len += print_prefix(msg, syslog, buf + len);
944 memcpy(buf + len, text, text_len);
945 len += text_len;
946 if (next || newline)
947 buf[len++] = '\n';
948 } else {
949 /* SYSLOG_ACTION_* buffer size only calculation */
950 if (prefix)
951 len += print_prefix(msg, syslog, NULL);
952 len += text_len;
953 if (next || newline)
954 len++;
955 }
956
957 prefix = true;
958 text = next;
959 } while (text);
960
961 return len;
962}
963
964static int syslog_print(char __user *buf, int size)
965{
966 char *text;
967 struct log *msg;
968 int len = 0;
969
970 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
971 if (!text)
972 return -ENOMEM;
973
974 while (size > 0) {
975 size_t n;
976 size_t skip;
977
978 raw_spin_lock_irq(&logbuf_lock);
979 if (syslog_seq < log_first_seq) {
980 /* messages are gone, move to first one */
981 syslog_seq = log_first_seq;
982 syslog_idx = log_first_idx;
983 syslog_prev = 0;
984 syslog_partial = 0;
985 }
986 if (syslog_seq == log_next_seq) {
987 raw_spin_unlock_irq(&logbuf_lock);
988 break;
989 }
990
991 skip = syslog_partial;
992 msg = log_from_idx(syslog_idx);
993 n = msg_print_text(msg, syslog_prev, true, text,
994 LOG_LINE_MAX + PREFIX_MAX);
995 if (n - syslog_partial <= size) {
996 /* message fits into buffer, move forward */
997 syslog_idx = log_next(syslog_idx);
998 syslog_seq++;
999 syslog_prev = msg->flags;
1000 n -= syslog_partial;
1001 syslog_partial = 0;
1002 } else if (!len){
1003 /* partial read(), remember position */
1004 n = size;
1005 syslog_partial += n;
1006 } else
1007 n = 0;
1008 raw_spin_unlock_irq(&logbuf_lock);
1009
1010 if (!n)
1011 break;
1012
1013 if (copy_to_user(buf, text + skip, n)) {
1014 if (!len)
1015 len = -EFAULT;
1016 break;
1017 }
1018
1019 len += n;
1020 size -= n;
1021 buf += n;
1022 }
1023
1024 kfree(text);
1025 return len;
1026}
1027
1028static int syslog_print_all(char __user *buf, int size, bool clear)
1029{
1030 char *text;
1031 int len = 0;
1032
1033 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
1034 if (!text)
1035 return -ENOMEM;
1036
1037 raw_spin_lock_irq(&logbuf_lock);
1038 if (buf) {
1039 u64 next_seq;
1040 u64 seq;
1041 u32 idx;
1042 enum log_flags prev;
1043
1044 if (clear_seq < log_first_seq) {
1045 /* messages are gone, move to first available one */
1046 clear_seq = log_first_seq;
1047 clear_idx = log_first_idx;
1048 }
1049
1050 /*
1051 * Find first record that fits, including all following records,
1052 * into the user-provided buffer for this dump.
1053 */
1054 seq = clear_seq;
1055 idx = clear_idx;
1056 prev = 0;
1057 while (seq < log_next_seq) {
1058 struct log *msg = log_from_idx(idx);
1059
1060 len += msg_print_text(msg, prev, true, NULL, 0);
1061 prev = msg->flags;
1062 idx = log_next(idx);
1063 seq++;
1064 }
1065
1066 /* move first record forward until length fits into the buffer */
1067 seq = clear_seq;
1068 idx = clear_idx;
1069 prev = 0;
1070 while (len > size && seq < log_next_seq) {
1071 struct log *msg = log_from_idx(idx);
1072
1073 len -= msg_print_text(msg, prev, true, NULL, 0);
1074 prev = msg->flags;
1075 idx = log_next(idx);
1076 seq++;
1077 }
1078
1079 /* last message fitting into this dump */
1080 next_seq = log_next_seq;
1081
1082 len = 0;
1083 prev = 0;
1084 while (len >= 0 && seq < next_seq) {
1085 struct log *msg = log_from_idx(idx);
1086 int textlen;
1087
1088 textlen = msg_print_text(msg, prev, true, text,
1089 LOG_LINE_MAX + PREFIX_MAX);
1090 if (textlen < 0) {
1091 len = textlen;
1092 break;
1093 }
1094 idx = log_next(idx);
1095 seq++;
1096 prev = msg->flags;
1097
1098 raw_spin_unlock_irq(&logbuf_lock);
1099 if (copy_to_user(buf + len, text, textlen))
1100 len = -EFAULT;
1101 else
1102 len += textlen;
1103 raw_spin_lock_irq(&logbuf_lock);
1104
1105 if (seq < log_first_seq) {
1106 /* messages are gone, move to next one */
1107 seq = log_first_seq;
1108 idx = log_first_idx;
1109 prev = 0;
1110 }
1111 }
1112 }
1113
1114 if (clear) {
1115 clear_seq = log_next_seq;
1116 clear_idx = log_next_idx;
1117 }
1118 raw_spin_unlock_irq(&logbuf_lock);
1119
1120 kfree(text);
1121 return len;
1122}
1123
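The sizing logic in syslog_print_all() above is a two-pass fit: total up every record, then drop records from the front until what remains fits the user buffer. A compact userspace sketch of the same idea; the record lengths and buffer size are illustrative:

#include <stdio.h>

int main(void)
{
	unsigned int rec_len[] = { 40, 80, 25, 300, 60 };  /* per-record lengths */
	unsigned int nrec = sizeof(rec_len) / sizeof(rec_len[0]);
	unsigned int size = 400;                           /* user buffer size */
	unsigned int len = 0, first = 0, i;

	for (i = 0; i < nrec; i++)          /* pass 1: total length of the dump */
		len += rec_len[i];
	while (len > size && first < nrec)  /* pass 2: drop the oldest records */
		len -= rec_len[first++];
	printf("dump records %u..%u, %u bytes\n", first, nrec - 1, len);
	return 0;
}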
1124int do_syslog(int type, char __user *buf, int len, bool from_file) 383int do_syslog(int type, char __user *buf, int len, bool from_file)
1125{ 384{
1126 bool clear = false; 385 unsigned i, j, limit, count;
1127 static int saved_console_loglevel = -1; 386 int do_clear = 0;
387 char c;
1128 int error; 388 int error;
1129 389
1130 error = check_syslog_permissions(type, from_file); 390 error = check_syslog_permissions(type, from_file);
@@ -1152,14 +412,28 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1152 goto out; 412 goto out;
1153 } 413 }
1154 error = wait_event_interruptible(log_wait, 414 error = wait_event_interruptible(log_wait,
1155 syslog_seq != log_next_seq); 415 (log_start - log_end));
1156 if (error) 416 if (error)
1157 goto out; 417 goto out;
1158 error = syslog_print(buf, len); 418 i = 0;
419 spin_lock_irq(&logbuf_lock);
420 while (!error && (log_start != log_end) && i < len) {
421 c = LOG_BUF(log_start);
422 log_start++;
423 spin_unlock_irq(&logbuf_lock);
424 error = __put_user(c,buf);
425 buf++;
426 i++;
427 cond_resched();
428 spin_lock_irq(&logbuf_lock);
429 }
430 spin_unlock_irq(&logbuf_lock);
431 if (!error)
432 error = i;
1159 break; 433 break;
1160 /* Read/clear last kernel messages */ 434 /* Read/clear last kernel messages */
1161 case SYSLOG_ACTION_READ_CLEAR: 435 case SYSLOG_ACTION_READ_CLEAR:
1162 clear = true; 436 do_clear = 1;
1163 /* FALL THRU */ 437 /* FALL THRU */
1164 /* Read last kernel messages */ 438 /* Read last kernel messages */
1165 case SYSLOG_ACTION_READ_ALL: 439 case SYSLOG_ACTION_READ_ALL:
@@ -1173,11 +447,51 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1173 error = -EFAULT; 447 error = -EFAULT;
1174 goto out; 448 goto out;
1175 } 449 }
1176 error = syslog_print_all(buf, len, clear); 450 count = len;
451 if (count > log_buf_len)
452 count = log_buf_len;
453 spin_lock_irq(&logbuf_lock);
454 if (count > logged_chars)
455 count = logged_chars;
456 if (do_clear)
457 logged_chars = 0;
458 limit = log_end;
459 /*
460 * __put_user() could sleep, and while we sleep
461 * printk() could overwrite the messages
462 * we try to copy to user space. Therefore
463 * the messages are copied in reverse. <manfreds>
464 */
465 for (i = 0; i < count && !error; i++) {
466 j = limit-1-i;
467 if (j + log_buf_len < log_end)
468 break;
469 c = LOG_BUF(j);
470 spin_unlock_irq(&logbuf_lock);
471 error = __put_user(c,&buf[count-1-i]);
472 cond_resched();
473 spin_lock_irq(&logbuf_lock);
474 }
475 spin_unlock_irq(&logbuf_lock);
476 if (error)
477 break;
478 error = i;
479 if (i != count) {
480 int offset = count-error;
481 /* buffer overflow during copy, correct user buffer. */
482 for (i = 0; i < error; i++) {
483 if (__get_user(c,&buf[i+offset]) ||
484 __put_user(c,&buf[i])) {
485 error = -EFAULT;
486 break;
487 }
488 cond_resched();
489 }
490 }
1177 break; 491 break;
1178 /* Clear ring buffer */ 492 /* Clear ring buffer */
1179 case SYSLOG_ACTION_CLEAR: 493 case SYSLOG_ACTION_CLEAR:
1180 syslog_print_all(NULL, 0, true); 494 logged_chars = 0;
1181 break; 495 break;
1182 /* Disable logging to console */ 496 /* Disable logging to console */
1183 case SYSLOG_ACTION_CONSOLE_OFF: 497 case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1206,38 +520,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1206 break; 520 break;
1207 /* Number of chars in the log buffer */ 521 /* Number of chars in the log buffer */
1208 case SYSLOG_ACTION_SIZE_UNREAD: 522 case SYSLOG_ACTION_SIZE_UNREAD:
1209 raw_spin_lock_irq(&logbuf_lock); 523 error = log_end - log_start;
1210 if (syslog_seq < log_first_seq) {
1211 /* messages are gone, move to first one */
1212 syslog_seq = log_first_seq;
1213 syslog_idx = log_first_idx;
1214 syslog_prev = 0;
1215 syslog_partial = 0;
1216 }
1217 if (from_file) {
1218 /*
1219 * Short-cut for poll(/"proc/kmsg") which simply checks
1220 * for pending data, not the size; return the count of
1221 * records, not the length.
1222 */
1223 error = log_next_idx - syslog_idx;
1224 } else {
1225 u64 seq = syslog_seq;
1226 u32 idx = syslog_idx;
1227 enum log_flags prev = syslog_prev;
1228
1229 error = 0;
1230 while (seq < log_next_seq) {
1231 struct log *msg = log_from_idx(idx);
1232
1233 error += msg_print_text(msg, prev, true, NULL, 0);
1234 idx = log_next(idx);
1235 seq++;
1236 prev = msg->flags;
1237 }
1238 error -= syslog_partial;
1239 }
1240 raw_spin_unlock_irq(&logbuf_lock);
1241 break; 524 break;
1242 /* Size of the log buffer */ 525 /* Size of the log buffer */
1243 case SYSLOG_ACTION_SIZE_BUFFER: 526 case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1256,34 +539,189 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1256 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 539 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1257} 540}
1258 541
542#ifdef CONFIG_KGDB_KDB
543/* kdb dmesg command needs access to the syslog buffer. do_syslog()
544 * uses locks so it cannot be used during debugging. Just tell kdb
545 * where the start and end of the physical and logical logs are. This
546 * is equivalent to do_syslog(3).
547 */
548void kdb_syslog_data(char *syslog_data[4])
549{
550 syslog_data[0] = log_buf;
551 syslog_data[1] = log_buf + log_buf_len;
552 syslog_data[2] = log_buf + log_end -
553 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
554 syslog_data[3] = log_buf + log_end;
555}
556#endif /* CONFIG_KGDB_KDB */
557
1259/* 558/*
1260 * Call the console drivers, asking them to write out 559 * Call the console drivers on a range of log_buf
1261 * log_buf[start] to log_buf[end - 1].
1262 * The console_lock must be held.
1263 */ 560 */
1264static void call_console_drivers(int level, const char *text, size_t len) 561static void __call_console_drivers(unsigned start, unsigned end)
1265{ 562{
1266 struct console *con; 563 struct console *con;
1267 564
1268 trace_console(text, 0, len, len);
1269
1270 if (level >= console_loglevel && !ignore_loglevel)
1271 return;
1272 if (!console_drivers)
1273 return;
1274
1275 for_each_console(con) { 565 for_each_console(con) {
1276 if (exclusive_console && con != exclusive_console) 566 if (exclusive_console && con != exclusive_console)
1277 continue; 567 continue;
1278 if (!(con->flags & CON_ENABLED)) 568 if ((con->flags & CON_ENABLED) && con->write &&
1279 continue; 569 (cpu_online(smp_processor_id()) ||
1280 if (!con->write) 570 (con->flags & CON_ANYTIME)))
1281 continue; 571 con->write(con, &LOG_BUF(start), end - start);
1282 if (!cpu_online(smp_processor_id()) && 572 }
1283 !(con->flags & CON_ANYTIME)) 573}
1284 continue; 574
1285 con->write(con, text, len); 575static int __read_mostly ignore_loglevel;
576
577static int __init ignore_loglevel_setup(char *str)
578{
579 ignore_loglevel = 1;
580 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
581
582 return 0;
583}
584
585early_param("ignore_loglevel", ignore_loglevel_setup);
586
587/*
588 * Write out chars from start to end - 1 inclusive
589 */
590static void _call_console_drivers(unsigned start,
591 unsigned end, int msg_log_level)
592{
593 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
594 console_drivers && start != end) {
595 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
596 /* wrapped write */
597 __call_console_drivers(start & LOG_BUF_MASK,
598 log_buf_len);
599 __call_console_drivers(0, end & LOG_BUF_MASK);
600 } else {
601 __call_console_drivers(start, end);
602 }
603 }
604}
605
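A worked example of the wrap test in _call_console_drivers() above: with a power-of-two buffer, a range whose masked start exceeds its masked end has wrapped, so it is written out as two pieces. The buffer size and indices below are illustrative:

#include <stdio.h>

#define BUF_LEN  16
#define BUF_MASK (BUF_LEN - 1)

int main(void)
{
	unsigned int start = 14, end = 19;	/* unmasked; the range crosses the end */

	if ((start & BUF_MASK) > (end & BUF_MASK))
		/* wrapped: emit [14..16) and then [0..3) */
		printf("write [%u..%u) and [0..%u)\n",
		       start & BUF_MASK, BUF_LEN, end & BUF_MASK);
	else
		printf("write [%u..%u)\n", start, end);
	return 0;
}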
606/*
 607 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit number; the
 608 * lower 3 bits are the log level, the rest are the log facility. In case
609 * userspace passes usual userspace syslog messages to /dev/kmsg or
610 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
611 * to extract the correct log level for in-kernel processing, and not mangle
612 * the original value.
613 *
614 * If a prefix is found, the length of the prefix is returned. If 'level' is
615 * passed, it will be filled in with the log level without a possible facility
616 * value. If 'special' is passed, the special printk prefix chars are accepted
617 * and returned. If no valid header is found, 0 is returned and the passed
618 * variables are not touched.
619 */
620static size_t log_prefix(const char *p, unsigned int *level, char *special)
621{
622 unsigned int lev = 0;
623 char sp = '\0';
624 size_t len;
625
626 if (p[0] != '<' || !p[1])
627 return 0;
628 if (p[2] == '>') {
629 /* usual single digit level number or special char */
630 switch (p[1]) {
631 case '0' ... '7':
632 lev = p[1] - '0';
633 break;
634 case 'c': /* KERN_CONT */
635 case 'd': /* KERN_DEFAULT */
636 sp = p[1];
637 break;
638 default:
639 return 0;
640 }
641 len = 3;
642 } else {
643 /* multi digit including the level and facility number */
644 char *endp = NULL;
645
646 if (p[1] < '0' && p[1] > '9')
647 return 0;
648
649 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
650 if (endp == NULL || endp[0] != '>')
651 return 0;
652 len = (endp + 1) - p;
653 }
654
655 /* do not accept special char if not asked for */
656 if (sp && !special)
657 return 0;
658
659 if (special) {
660 *special = sp;
661 /* return special char, do not touch level */
662 if (sp)
663 return len;
664 }
665
666 if (level)
667 *level = lev;
668 return len;
669}
670
671/*
672 * Call the console drivers, asking them to write out
673 * log_buf[start] to log_buf[end - 1].
674 * The console_lock must be held.
675 */
676static void call_console_drivers(unsigned start, unsigned end)
677{
678 unsigned cur_index, start_print;
679 static int msg_level = -1;
680
681 BUG_ON(((int)(start - end)) > 0);
682
683 cur_index = start;
684 start_print = start;
685 while (cur_index != end) {
686 if (msg_level < 0 && ((end - cur_index) > 2)) {
687 /* strip log prefix */
688 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
689 start_print = cur_index;
690 }
691 while (cur_index != end) {
692 char c = LOG_BUF(cur_index);
693
694 cur_index++;
695 if (c == '\n') {
696 if (msg_level < 0) {
697 /*
698 * printk() has already given us loglevel tags in
699 * the buffer. This code is here in case the
700 * log buffer has wrapped right round and scribbled
701 * on those tags
702 */
703 msg_level = default_message_loglevel;
704 }
705 _call_console_drivers(start_print, cur_index, msg_level);
706 msg_level = -1;
707 start_print = cur_index;
708 break;
709 }
710 }
1286 } 711 }
712 _call_console_drivers(start_print, end, msg_level);
713}
714
715static void emit_log_char(char c)
716{
717 LOG_BUF(log_end) = c;
718 log_end++;
719 if (log_end - log_start > log_buf_len)
720 log_start = log_end - log_buf_len;
721 if (log_end - con_start > log_buf_len)
722 con_start = log_end - log_buf_len;
723 if (logged_chars < log_buf_len)
724 logged_chars++;
1287} 725}
1288 726
1289/* 727/*
@@ -1301,13 +739,19 @@ static void zap_locks(void)
1301 739
1302 oops_timestamp = jiffies; 740 oops_timestamp = jiffies;
1303 741
1304 debug_locks_off();
1305 /* If a crash is occurring, make sure we can't deadlock */ 742 /* If a crash is occurring, make sure we can't deadlock */
1306 raw_spin_lock_init(&logbuf_lock); 743 spin_lock_init(&logbuf_lock);
1307 /* And make sure that we print immediately */ 744 /* And make sure that we print immediately */
1308 sema_init(&console_sem, 1); 745 sema_init(&console_sem, 1);
1309} 746}
1310 747
748#if defined(CONFIG_PRINTK_TIME)
749static int printk_time = 1;
750#else
751static int printk_time = 0;
752#endif
753module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
754
1311/* Check if we have any console registered that can be called early in boot. */ 755/* Check if we have any console registered that can be called early in boot. */
1312static int have_callable_console(void) 756static int have_callable_console(void)
1313{ 757{
@@ -1320,6 +764,51 @@ static int have_callable_console(void)
1320 return 0; 764 return 0;
1321} 765}
1322 766
767/**
768 * printk - print a kernel message
769 * @fmt: format string
770 *
771 * This is printk(). It can be called from any context. We want it to work.
772 *
773 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
774 * call the console drivers. If we fail to get the semaphore we place the output
775 * into the log buffer and return. The current holder of the console_sem will
776 * notice the new output in console_unlock(); and will send it to the
777 * consoles before releasing the lock.
778 *
779 * One effect of this deferred printing is that code which calls printk() and
780 * then changes console_loglevel may break. This is because console_loglevel
781 * is inspected when the actual printing occurs.
782 *
783 * See also:
784 * printf(3)
785 *
786 * See the vsnprintf() documentation for format string extensions over C99.
787 */
788
789asmlinkage int printk(const char *fmt, ...)
790{
791 va_list args;
792 int r;
793
794#ifdef CONFIG_KGDB_KDB
795 if (unlikely(kdb_trap_printk)) {
796 va_start(args, fmt);
797 r = vkdb_printf(fmt, args);
798 va_end(args);
799 return r;
800 }
801#endif
802 va_start(args, fmt);
803 r = vprintk(fmt, args);
804 va_end(args);
805
806 return r;
807}
808
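For orientation, the call-site convention that feeds this path: the KERN_* macros of this kernel generation expand to a "<N>" string which the prefix parsing above strips back out. A hedged fragment, not a complete module; the message text is arbitrary:

	/* KERN_INFO is "<6>" here, so the line is logged at level 6 and the
	 * console path later compares that against console_loglevel. */
	printk(KERN_INFO "demo: initialized %d channels\n", 4);

	/* Without a prefix, default_message_loglevel is used instead. */
	printk("demo: no explicit loglevel on this one\n");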
809/* cpu currently holding logbuf_lock */
810static volatile unsigned int printk_cpu = UINT_MAX;
811
1323/* 812/*
1324 * Can we actually use the console at this time on this cpu? 813 * Can we actually use the console at this time on this cpu?
1325 * 814 *
@@ -1363,12 +852,17 @@ static int console_trylock_for_printk(unsigned int cpu)
1363 retval = 0; 852 retval = 0;
1364 } 853 }
1365 } 854 }
1366 logbuf_cpu = UINT_MAX; 855 printk_cpu = UINT_MAX;
856 spin_unlock(&logbuf_lock);
1367 if (wake) 857 if (wake)
1368 up(&console_sem); 858 up(&console_sem);
1369 raw_spin_unlock(&logbuf_lock);
1370 return retval; 859 return retval;
1371} 860}
861static const char recursion_bug_msg [] =
862 KERN_CRIT "BUG: recent printk recursion!\n";
863static int recursion_bug;
864static int new_text_line = 1;
865static char printk_buf[1024];
1372 866
1373int printk_delay_msec __read_mostly; 867int printk_delay_msec __read_mostly;
1374 868
@@ -1384,134 +878,28 @@ static inline void printk_delay(void)
1384 } 878 }
1385} 879}
1386 880
1387/* 881asmlinkage int vprintk(const char *fmt, va_list args)
1388 * Continuation lines are buffered, and not committed to the record buffer
1389 * until the line is complete, or a race forces it. The line fragments
1390 * though, are printed immediately to the consoles to ensure everything has
1391 * reached the console in case of a kernel crash.
1392 */
1393static struct cont {
1394 char buf[LOG_LINE_MAX];
1395 size_t len; /* length == 0 means unused buffer */
1396 size_t cons; /* bytes written to console */
1397 struct task_struct *owner; /* task of first print*/
1398 u64 ts_nsec; /* time of first print */
1399 u8 level; /* log level of first message */
1400 u8 facility; /* log level of first message */
1401 enum log_flags flags; /* prefix, newline flags */
1402 bool flushed:1; /* buffer sealed and committed */
1403} cont;
1404
1405static void cont_flush(enum log_flags flags)
1406{
1407 if (cont.flushed)
1408 return;
1409 if (cont.len == 0)
1410 return;
1411
1412 if (cont.cons) {
1413 /*
1414 * If a fragment of this line was directly flushed to the
1415 * console; wait for the console to pick up the rest of the
1416 * line. LOG_NOCONS suppresses a duplicated output.
1417 */
1418 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1419 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1420 cont.flags = flags;
1421 cont.flushed = true;
1422 } else {
1423 /*
1424 * If no fragment of this line ever reached the console,
1425 * just submit it to the store and free the buffer.
1426 */
1427 log_store(cont.facility, cont.level, flags, 0,
1428 NULL, 0, cont.buf, cont.len);
1429 cont.len = 0;
1430 }
1431}
1432
1433static bool cont_add(int facility, int level, const char *text, size_t len)
1434{
1435 if (cont.len && cont.flushed)
1436 return false;
1437
1438 if (cont.len + len > sizeof(cont.buf)) {
1439 /* the line gets too long, split it up in separate records */
1440 cont_flush(LOG_CONT);
1441 return false;
1442 }
1443
1444 if (!cont.len) {
1445 cont.facility = facility;
1446 cont.level = level;
1447 cont.owner = current;
1448 cont.ts_nsec = local_clock();
1449 cont.flags = 0;
1450 cont.cons = 0;
1451 cont.flushed = false;
1452 }
1453
1454 memcpy(cont.buf + cont.len, text, len);
1455 cont.len += len;
1456
1457 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1458 cont_flush(LOG_CONT);
1459
1460 return true;
1461}
1462
1463static size_t cont_print_text(char *text, size_t size)
1464{
1465 size_t textlen = 0;
1466 size_t len;
1467
1468 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1469 textlen += print_time(cont.ts_nsec, text);
1470 size -= textlen;
1471 }
1472
1473 len = cont.len - cont.cons;
1474 if (len > 0) {
1475 if (len+1 > size)
1476 len = size-1;
1477 memcpy(text + textlen, cont.buf + cont.cons, len);
1478 textlen += len;
1479 cont.cons = cont.len;
1480 }
1481
1482 if (cont.flushed) {
1483 if (cont.flags & LOG_NEWLINE)
1484 text[textlen++] = '\n';
1485 /* got everything, release buffer */
1486 cont.len = 0;
1487 }
1488 return textlen;
1489}
1490
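The continuation buffer above is what lets call sites split one logical line across several printk() calls and still end up with a single record. A hedged call-site fragment; the messages are arbitrary:

	/* The first call opens the continuation buffer, KERN_CONT appends to
	 * it, and the trailing '\n' makes cont_flush() commit the merged text
	 * as one record. */
	printk(KERN_INFO "demo: probing lanes:");
	printk(KERN_CONT " lane0 ok");
	printk(KERN_CONT " lane1 ok\n");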
1491asmlinkage int vprintk_emit(int facility, int level,
1492 const char *dict, size_t dictlen,
1493 const char *fmt, va_list args)
1494{ 882{
1495 static int recursion_bug; 883 int printed_len = 0;
1496 static char textbuf[LOG_LINE_MAX]; 884 int current_log_level = default_message_loglevel;
1497 char *text = textbuf;
1498 size_t text_len;
1499 enum log_flags lflags = 0;
1500 unsigned long flags; 885 unsigned long flags;
1501 int this_cpu; 886 int this_cpu;
1502 int printed_len = 0; 887 char *p;
888 size_t plen;
889 char special;
1503 890
1504 boot_delay_msec(level); 891 boot_delay_msec();
1505 printk_delay(); 892 printk_delay();
1506 893
894 preempt_disable();
1507 /* This stops the holder of console_sem just where we want him */ 895 /* This stops the holder of console_sem just where we want him */
1508 local_irq_save(flags); 896 raw_local_irq_save(flags);
1509 this_cpu = smp_processor_id(); 897 this_cpu = smp_processor_id();
1510 898
1511 /* 899 /*
1512 * Ouch, printk recursed into itself! 900 * Ouch, printk recursed into itself!
1513 */ 901 */
1514 if (unlikely(logbuf_cpu == this_cpu)) { 902 if (unlikely(printk_cpu == this_cpu)) {
1515 /* 903 /*
1516 * If a crash is occurring during printk() on this CPU, 904 * If a crash is occurring during printk() on this CPU,
1517 * then try to get the crash message out but make sure 905 * then try to get the crash message out but make sure
@@ -1519,7 +907,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1519 * recursion and return - but flag the recursion so that 907 * recursion and return - but flag the recursion so that
1520 * it can be printed at the next appropriate moment: 908 * it can be printed at the next appropriate moment:
1521 */ 909 */
1522 if (!oops_in_progress && !lockdep_recursing(current)) { 910 if (!oops_in_progress) {
1523 recursion_bug = 1; 911 recursion_bug = 1;
1524 goto out_restore_irqs; 912 goto out_restore_irqs;
1525 } 913 }
@@ -1527,201 +915,123 @@ asmlinkage int vprintk_emit(int facility, int level,
1527 } 915 }
1528 916
1529 lockdep_off(); 917 lockdep_off();
1530 raw_spin_lock(&logbuf_lock); 918 spin_lock(&logbuf_lock);
1531 logbuf_cpu = this_cpu; 919 printk_cpu = this_cpu;
1532 920
1533 if (recursion_bug) { 921 if (recursion_bug) {
1534 static const char recursion_msg[] =
1535 "BUG: recent printk recursion!";
1536
1537 recursion_bug = 0; 922 recursion_bug = 0;
1538 printed_len += strlen(recursion_msg); 923 strcpy(printk_buf, recursion_bug_msg);
1539 /* emit KERN_CRIT message */ 924 printed_len = strlen(recursion_bug_msg);
1540 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1541 NULL, 0, recursion_msg, printed_len);
1542 } 925 }
926 /* Emit the output into the temporary buffer */
927 printed_len += vscnprintf(printk_buf + printed_len,
928 sizeof(printk_buf) - printed_len, fmt, args);
1543 929
1544 /* 930#ifdef CONFIG_DEBUG_LL
1545 * The printf needs to come first; we need the syslog 931 printascii(printk_buf);
1546 * prefix which might be passed-in as a parameter. 932#endif
1547 */
1548 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
1549 933
1550 /* mark and strip a trailing newline */ 934 p = printk_buf;
1551 if (text_len && text[text_len-1] == '\n') {
1552 text_len--;
1553 lflags |= LOG_NEWLINE;
1554 }
1555 935
1556 /* strip kernel syslog prefix and extract log level or control flags */ 936 /* Read log level and handle special printk prefix */
1557 if (facility == 0) { 937 plen = log_prefix(p, &current_log_level, &special);
1558 int kern_level = printk_get_level(text); 938 if (plen) {
1559 939 p += plen;
1560 if (kern_level) { 940
1561 const char *end_of_header = printk_skip_level(text); 941 switch (special) {
1562 switch (kern_level) { 942 case 'c': /* Strip <c> KERN_CONT, continue line */
1563 case '0' ... '7': 943 plen = 0;
1564 if (level == -1) 944 break;
1565 level = kern_level - '0'; 945 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
1566 case 'd': /* KERN_DEFAULT */ 946 plen = 0;
1567 lflags |= LOG_PREFIX; 947 default:
1568 case 'c': /* KERN_CONT */ 948 if (!new_text_line) {
1569 break; 949 emit_log_char('\n');
950 new_text_line = 1;
1570 } 951 }
1571 text_len -= end_of_header - text;
1572 text = (char *)end_of_header;
1573 } 952 }
1574 } 953 }
1575 954
1576 if (level == -1) 955 /*
1577 level = default_message_loglevel; 956 * Copy the output into log_buf. If the caller didn't provide
1578 957 * the appropriate log prefix, we insert them here
1579 if (dict) 958 */
1580 lflags |= LOG_PREFIX|LOG_NEWLINE; 959 for (; *p; p++) {
1581 960 if (new_text_line) {
1582 if (!(lflags & LOG_NEWLINE)) { 961 new_text_line = 0;
1583 /* 962
1584 * Flush the conflicting buffer. An earlier newline was missing, 963 if (plen) {
1585 * or another task also prints continuation lines. 964 /* Copy original log prefix */
1586 */ 965 int i;
1587 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 966
1588 cont_flush(LOG_NEWLINE); 967 for (i = 0; i < plen; i++)
968 emit_log_char(printk_buf[i]);
969 printed_len += plen;
970 } else {
971 /* Add log prefix */
972 emit_log_char('<');
973 emit_log_char(current_log_level + '0');
974 emit_log_char('>');
975 printed_len += 3;
976 }
1589 977
1590 /* buffer line if possible, otherwise store it right away */ 978 if (printk_time) {
1591 if (!cont_add(facility, level, text, text_len)) 979 /* Add the current time stamp */
1592 log_store(facility, level, lflags | LOG_CONT, 0, 980 char tbuf[50], *tp;
1593 dict, dictlen, text, text_len); 981 unsigned tlen;
1594 } else { 982 unsigned long long t;
1595 bool stored = false; 983 unsigned long nanosec_rem;
984
985 t = cpu_clock(printk_cpu);
986 nanosec_rem = do_div(t, 1000000000);
987 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
988 (unsigned long) t,
989 nanosec_rem / 1000);
990
991 for (tp = tbuf; tp < tbuf + tlen; tp++)
992 emit_log_char(*tp);
993 printed_len += tlen;
994 }
1596 995
1597 /* 996 if (!*p)
1598 * If an earlier newline was missing and it was the same task, 997 break;
1599 * either merge it with the current buffer and flush, or if
1600 * there was a race with interrupts (prefix == true) then just
1601 * flush it out and store this line separately.
1602 */
1603 if (cont.len && cont.owner == current) {
1604 if (!(lflags & LOG_PREFIX))
1605 stored = cont_add(facility, level, text, text_len);
1606 cont_flush(LOG_NEWLINE);
1607 } 998 }
1608 999
1609 if (!stored) 1000 emit_log_char(*p);
1610 log_store(facility, level, lflags, 0, 1001 if (*p == '\n')
1611 dict, dictlen, text, text_len); 1002 new_text_line = 1;
1612 } 1003 }
1613 printed_len += text_len;
1614 1004
1615 /* 1005 /*
1616 * Try to acquire and then immediately release the console semaphore. 1006 * Try to acquire and then immediately release the
1617 * The release will print out buffers and wake up /dev/kmsg and syslog() 1007 * console semaphore. The release will do all the
1618 * users. 1008 * actual magic (print out buffers, wake up klogd,
1009 * etc).
1619 * 1010 *
1620 * The console_trylock_for_printk() function will release 'logbuf_lock' 1011 * The console_trylock_for_printk() function
1621 * regardless of whether it actually gets the console semaphore or not. 1012 * will release 'logbuf_lock' regardless of whether it
1013 * actually gets the semaphore or not.
1622 */ 1014 */
1623 if (console_trylock_for_printk(this_cpu)) 1015 if (console_trylock_for_printk(this_cpu))
1624 console_unlock(); 1016 console_unlock();
1625 1017
1626 lockdep_on(); 1018 lockdep_on();
1627out_restore_irqs: 1019out_restore_irqs:
1628 local_irq_restore(flags); 1020 raw_local_irq_restore(flags);
1629 1021
1022 preempt_enable();
1630 return printed_len; 1023 return printed_len;
1631} 1024}
1632EXPORT_SYMBOL(vprintk_emit); 1025EXPORT_SYMBOL(printk);
1633
1634asmlinkage int vprintk(const char *fmt, va_list args)
1635{
1636 return vprintk_emit(0, -1, NULL, 0, fmt, args);
1637}
1638EXPORT_SYMBOL(vprintk); 1026EXPORT_SYMBOL(vprintk);
1639 1027
1640asmlinkage int printk_emit(int facility, int level, 1028#else
1641 const char *dict, size_t dictlen,
1642 const char *fmt, ...)
1643{
1644 va_list args;
1645 int r;
1646
1647 va_start(args, fmt);
1648 r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
1649 va_end(args);
1650
1651 return r;
1652}
1653EXPORT_SYMBOL(printk_emit);
1654 1029
1655/** 1030static void call_console_drivers(unsigned start, unsigned end)
1656 * printk - print a kernel message
1657 * @fmt: format string
1658 *
1659 * This is printk(). It can be called from any context. We want it to work.
1660 *
1661 * We try to grab the console_lock. If we succeed, it's easy - we log the
1662 * output and call the console drivers. If we fail to get the semaphore, we
1663 * place the output into the log buffer and return. The current holder of
1664 * the console_sem will notice the new output in console_unlock(); and will
1665 * send it to the consoles before releasing the lock.
1666 *
1667 * One effect of this deferred printing is that code which calls printk() and
1668 * then changes console_loglevel may break. This is because console_loglevel
1669 * is inspected when the actual printing occurs.
1670 *
1671 * See also:
1672 * printf(3)
1673 *
1674 * See the vsnprintf() documentation for format string extensions over C99.
1675 */
1676asmlinkage int printk(const char *fmt, ...)
1677{ 1031{
1678 va_list args;
1679 int r;
1680
1681#ifdef CONFIG_KGDB_KDB
1682 if (unlikely(kdb_trap_printk)) {
1683 va_start(args, fmt);
1684 r = vkdb_printf(fmt, args);
1685 va_end(args);
1686 return r;
1687 }
1688#endif
1689 va_start(args, fmt);
1690 r = vprintk_emit(0, -1, NULL, 0, fmt, args);
1691 va_end(args);
1692
1693 return r;
1694} 1032}
1695EXPORT_SYMBOL(printk);
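A minimal usage sketch of the printk() interface documented above (illustrative only, not part of this patch), assuming a trivial module context:

#include <linux/kernel.h>
#include <linux/module.h>

static int __init printk_demo_init(void)
{
        /* The KERN_* marker travels inside the format string. */
        printk(KERN_INFO "printk_demo: loaded (HZ=%d)\n", HZ);
        /* No marker: the default message loglevel is used instead. */
        printk("printk_demo: second line\n");
        return 0;
}
module_init(printk_demo_init);
MODULE_LICENSE("GPL");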
1696 1033
1697#else /* CONFIG_PRINTK */ 1034#endif
1698
1699#define LOG_LINE_MAX 0
1700#define PREFIX_MAX 0
1701#define LOG_LINE_MAX 0
1702static u64 syslog_seq;
1703static u32 syslog_idx;
1704static u64 console_seq;
1705static u32 console_idx;
1706static enum log_flags syslog_prev;
1707static u64 log_first_seq;
1708static u32 log_first_idx;
1709static u64 log_next_seq;
1710static enum log_flags console_prev;
1711static struct cont {
1712 size_t len;
1713 size_t cons;
1714 u8 level;
1715 bool flushed:1;
1716} cont;
1717static struct log *log_from_idx(u32 idx) { return NULL; }
1718static u32 log_next(u32 idx) { return 0; }
1719static void call_console_drivers(int level, const char *text, size_t len) {}
1720static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1721 bool syslog, char *buf, size_t size) { return 0; }
1722static size_t cont_print_text(char *text, size_t size) { return 0; }
1723
1724#endif /* CONFIG_PRINTK */
1725 1035
1726static int __add_preferred_console(char *name, int idx, char *options, 1036static int __add_preferred_console(char *name, int idx, char *options,
1727 char *brl_options) 1037 char *brl_options)
@@ -1844,7 +1154,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1844 return -1; 1154 return -1;
1845} 1155}
1846 1156
1847bool console_suspend_enabled = 1; 1157int console_suspend_enabled = 1;
1848EXPORT_SYMBOL(console_suspend_enabled); 1158EXPORT_SYMBOL(console_suspend_enabled);
1849 1159
1850static int __init console_suspend_disable(char *str) 1160static int __init console_suspend_disable(char *str)
@@ -1853,10 +1163,6 @@ static int __init console_suspend_disable(char *str)
1853 return 1; 1163 return 1;
1854} 1164}
1855__setup("no_console_suspend", console_suspend_disable); 1165__setup("no_console_suspend", console_suspend_disable);
1856module_param_named(console_suspend, console_suspend_enabled,
1857 bool, S_IRUGO | S_IWUSR);
1858MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
1859 " and hibernate operations");
1860 1166
1861/** 1167/**
1862 * suspend_console - suspend the console subsystem 1168 * suspend_console - suspend the console subsystem
@@ -1917,14 +1223,12 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1917 */ 1223 */
1918void console_lock(void) 1224void console_lock(void)
1919{ 1225{
1920 might_sleep(); 1226 BUG_ON(in_interrupt());
1921
1922 down(&console_sem); 1227 down(&console_sem);
1923 if (console_suspended) 1228 if (console_suspended)
1924 return; 1229 return;
1925 console_locked = 1; 1230 console_locked = 1;
1926 console_may_schedule = 1; 1231 console_may_schedule = 1;
1927 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1928} 1232}
1929EXPORT_SYMBOL(console_lock); 1233EXPORT_SYMBOL(console_lock);
1930 1234
@@ -1946,7 +1250,6 @@ int console_trylock(void)
1946 } 1250 }
1947 console_locked = 1; 1251 console_locked = 1;
1948 console_may_schedule = 0; 1252 console_may_schedule = 0;
1949 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1950 return 1; 1253 return 1;
1951} 1254}
1952EXPORT_SYMBOL(console_trylock); 1255EXPORT_SYMBOL(console_trylock);
@@ -1956,27 +1259,13 @@ int is_console_locked(void)
1956 return console_locked; 1259 return console_locked;
1957} 1260}
1958 1261
1959/*
1960 * Delayed printk version, for scheduler-internal messages:
1961 */
1962#define PRINTK_BUF_SIZE 512
1963
1964#define PRINTK_PENDING_WAKEUP 0x01
1965#define PRINTK_PENDING_SCHED 0x02
1966
1967static DEFINE_PER_CPU(int, printk_pending); 1262static DEFINE_PER_CPU(int, printk_pending);
1968static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1969 1263
1970void printk_tick(void) 1264void printk_tick(void)
1971{ 1265{
1972 if (__this_cpu_read(printk_pending)) { 1266 if (__this_cpu_read(printk_pending)) {
1973 int pending = __this_cpu_xchg(printk_pending, 0); 1267 __this_cpu_write(printk_pending, 0);
1974 if (pending & PRINTK_PENDING_SCHED) { 1268 wake_up_interruptible(&log_wait);
1975 char *buf = __get_cpu_var(printk_sched_buf);
1976 printk(KERN_WARNING "[sched_delayed] %s", buf);
1977 }
1978 if (pending & PRINTK_PENDING_WAKEUP)
1979 wake_up_interruptible(&log_wait);
1980 } 1269 }
1981} 1270}
1982 1271
@@ -1990,36 +1279,7 @@ int printk_needs_cpu(int cpu)
1990void wake_up_klogd(void) 1279void wake_up_klogd(void)
1991{ 1280{
1992 if (waitqueue_active(&log_wait)) 1281 if (waitqueue_active(&log_wait))
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1282 this_cpu_write(printk_pending, 1);
1994}
1995
1996static void console_cont_flush(char *text, size_t size)
1997{
1998 unsigned long flags;
1999 size_t len;
2000
2001 raw_spin_lock_irqsave(&logbuf_lock, flags);
2002
2003 if (!cont.len)
2004 goto out;
2005
2006 /*
2007 * We still queue earlier records, likely because the console was
2008 * busy. The earlier ones need to be printed before this one, we
2009 * did not flush any fragment so far, so just let it queue up.
2010 */
2011 if (console_seq < log_next_seq && !cont.cons)
2012 goto out;
2013
2014 len = cont_print_text(text, size);
2015 raw_spin_unlock(&logbuf_lock);
2016 stop_critical_timings();
2017 call_console_drivers(cont.level, text, len);
2018 start_critical_timings();
2019 local_irq_restore(flags);
2020 return;
2021out:
2022 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2023} 1283}
2024 1284
2025/** 1285/**
@@ -2032,17 +1292,15 @@ out:
2032 * by printk(). If this is the case, console_unlock(); emits 1292 * by printk(). If this is the case, console_unlock(); emits
2033 * the output prior to releasing the lock. 1293 * the output prior to releasing the lock.
2034 * 1294 *
2035 * If there is output waiting, we wake /dev/kmsg and syslog() users. 1295 * If there is output waiting for klogd, we wake it up.
2036 * 1296 *
2037 * console_unlock(); may be called from any context. 1297 * console_unlock(); may be called from any context.
2038 */ 1298 */
2039void console_unlock(void) 1299void console_unlock(void)
2040{ 1300{
2041 static char text[LOG_LINE_MAX + PREFIX_MAX];
2042 static u64 seen_seq;
2043 unsigned long flags; 1301 unsigned long flags;
2044 bool wake_klogd = false; 1302 unsigned _con_start, _log_end;
2045 bool retry; 1303 unsigned wake_klogd = 0, retry = 0;
2046 1304
2047 if (console_suspended) { 1305 if (console_suspended) {
2048 up(&console_sem); 1306 up(&console_sem);
@@ -2051,69 +1309,28 @@ void console_unlock(void)
2051 1309
2052 console_may_schedule = 0; 1310 console_may_schedule = 0;
2053 1311
2054 /* flush buffered message fragment immediately to console */
2055 console_cont_flush(text, sizeof(text));
2056again: 1312again:
2057 for (;;) { 1313 for ( ; ; ) {
2058 struct log *msg; 1314 spin_lock_irqsave(&logbuf_lock, flags);
2059 size_t len; 1315 wake_klogd |= log_start - log_end;
2060 int level; 1316 if (con_start == log_end)
2061 1317 break; /* Nothing to print */
2062 raw_spin_lock_irqsave(&logbuf_lock, flags); 1318 _con_start = con_start;
2063 if (seen_seq != log_next_seq) { 1319 _log_end = log_end;
2064 wake_klogd = true; 1320 con_start = log_end; /* Flush */
2065 seen_seq = log_next_seq; 1321 spin_unlock(&logbuf_lock);
2066 }
2067
2068 if (console_seq < log_first_seq) {
2069 /* messages are gone, move to first one */
2070 console_seq = log_first_seq;
2071 console_idx = log_first_idx;
2072 console_prev = 0;
2073 }
2074skip:
2075 if (console_seq == log_next_seq)
2076 break;
2077
2078 msg = log_from_idx(console_idx);
2079 if (msg->flags & LOG_NOCONS) {
2080 /*
2081 * Skip record we have buffered and already printed
2082 * directly to the console when we received it.
2083 */
2084 console_idx = log_next(console_idx);
2085 console_seq++;
2086 /*
2087 * We will get here again when we register a new
2088 * CON_PRINTBUFFER console. Clear the flag so we
2089 * will properly dump everything later.
2090 */
2091 msg->flags &= ~LOG_NOCONS;
2092 console_prev = msg->flags;
2093 goto skip;
2094 }
2095
2096 level = msg->level;
2097 len = msg_print_text(msg, console_prev, false,
2098 text, sizeof(text));
2099 console_idx = log_next(console_idx);
2100 console_seq++;
2101 console_prev = msg->flags;
2102 raw_spin_unlock(&logbuf_lock);
2103
2104 stop_critical_timings(); /* don't trace print latency */ 1322 stop_critical_timings(); /* don't trace print latency */
2105 call_console_drivers(level, text, len); 1323 call_console_drivers(_con_start, _log_end);
2106 start_critical_timings(); 1324 start_critical_timings();
2107 local_irq_restore(flags); 1325 local_irq_restore(flags);
2108 } 1326 }
2109 console_locked = 0; 1327 console_locked = 0;
2110 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2111 1328
2112 /* Release the exclusive_console once it is used */ 1329 /* Release the exclusive_console once it is used */
2113 if (unlikely(exclusive_console)) 1330 if (unlikely(exclusive_console))
2114 exclusive_console = NULL; 1331 exclusive_console = NULL;
2115 1332
2116 raw_spin_unlock(&logbuf_lock); 1333 spin_unlock(&logbuf_lock);
2117 1334
2118 up(&console_sem); 1335 up(&console_sem);
2119 1336
@@ -2123,10 +1340,10 @@ skip:
2123 * there's a new owner and the console_unlock() from them will do the 1340 * there's a new owner and the console_unlock() from them will do the
2124 * flush, no worries. 1341 * flush, no worries.
2125 */ 1342 */
2126 raw_spin_lock(&logbuf_lock); 1343 spin_lock(&logbuf_lock);
2127 retry = console_seq != log_next_seq; 1344 if (con_start != log_end)
2128 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1345 retry = 1;
2129 1346 spin_unlock_irqrestore(&logbuf_lock, flags);
2130 if (retry && console_trylock()) 1347 if (retry && console_trylock())
2131 goto again; 1348 goto again;
2132 1349
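A sketch of the caller-side pattern console_lock()/console_unlock() are built for: code that inspects or changes console state takes the semaphore, and the closing console_unlock() flushes whatever printk() queued in the meantime. Illustrative only; for_each_console() and CON_ENABLED come from <linux/console.h>.

#include <linux/console.h>

/* Disable one console under console_lock; messages logged while the
 * lock was held are printed by the final console_unlock(). */
static void demo_disable_console(struct console *target)
{
        struct console *con;

        console_lock();
        for_each_console(con) {
                if (con == target)
                        con->flags &= ~CON_ENABLED;
        }
        console_unlock();       /* flushes pending log output */
}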
@@ -2359,11 +1576,9 @@ void register_console(struct console *newcon)
2359 * console_unlock(); will print out the buffered messages 1576 * console_unlock(); will print out the buffered messages
2360 * for us. 1577 * for us.
2361 */ 1578 */
2362 raw_spin_lock_irqsave(&logbuf_lock, flags); 1579 spin_lock_irqsave(&logbuf_lock, flags);
2363 console_seq = syslog_seq; 1580 con_start = log_start;
2364 console_idx = syslog_idx; 1581 spin_unlock_irqrestore(&logbuf_lock, flags);
2365 console_prev = syslog_prev;
2366 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2367 /* 1582 /*
2368 * We're about to replay the log buffer. Only do this to the 1583 * We're about to replay the log buffer. Only do this to the
2369 * just-registered console to avoid excessive message spam to 1584 * just-registered console to avoid excessive message spam to
@@ -2456,26 +1671,6 @@ late_initcall(printk_late_init);
2456 1671
2457#if defined CONFIG_PRINTK 1672#if defined CONFIG_PRINTK
2458 1673
2459int printk_sched(const char *fmt, ...)
2460{
2461 unsigned long flags;
2462 va_list args;
2463 char *buf;
2464 int r;
2465
2466 local_irq_save(flags);
2467 buf = __get_cpu_var(printk_sched_buf);
2468
2469 va_start(args, fmt);
2470 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
2471 va_end(args);
2472
2473 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2474 local_irq_restore(flags);
2475
2476 return r;
2477}
2478
2479/* 1674/*
2480 * printk rate limiting, lifted from the networking subsystem. 1675 * printk rate limiting, lifted from the networking subsystem.
2481 * 1676 *
@@ -2571,263 +1766,47 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
2571} 1766}
2572EXPORT_SYMBOL_GPL(kmsg_dump_unregister); 1767EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
2573 1768
2574static bool always_kmsg_dump;
2575module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2576
2577/** 1769/**
2578 * kmsg_dump - dump kernel log to kernel message dumpers. 1770 * kmsg_dump - dump kernel log to kernel message dumpers.
2579 * @reason: the reason (oops, panic etc) for dumping 1771 * @reason: the reason (oops, panic etc) for dumping
2580 * 1772 *
2581 * Call each of the registered dumper's dump() callback, which can 1773 * Iterate through each of the dump devices and call the oops/panic
2582 * retrieve the kmsg records with kmsg_dump_get_line() or 1774 * callbacks with the log buffer.
2583 * kmsg_dump_get_buffer().
2584 */ 1775 */
2585void kmsg_dump(enum kmsg_dump_reason reason) 1776void kmsg_dump(enum kmsg_dump_reason reason)
2586{ 1777{
1778 unsigned long end;
1779 unsigned chars;
2587 struct kmsg_dumper *dumper; 1780 struct kmsg_dumper *dumper;
1781 const char *s1, *s2;
1782 unsigned long l1, l2;
2588 unsigned long flags; 1783 unsigned long flags;
2589 1784
2590 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 1785 /* Theoretically, the log could move on after we do this, but
2591 return; 1786 there's not a lot we can do about that. The new messages
2592 1787 will overwrite the start of what we dump. */
2593 rcu_read_lock(); 1788 spin_lock_irqsave(&logbuf_lock, flags);
2594 list_for_each_entry_rcu(dumper, &dump_list, list) { 1789 end = log_end & LOG_BUF_MASK;
2595 if (dumper->max_reason && reason > dumper->max_reason) 1790 chars = logged_chars;
2596 continue; 1791 spin_unlock_irqrestore(&logbuf_lock, flags);
2597
2598 /* initialize iterator with data about the stored records */
2599 dumper->active = true;
2600 1792
2601 raw_spin_lock_irqsave(&logbuf_lock, flags); 1793 if (chars > end) {
2602 dumper->cur_seq = clear_seq; 1794 s1 = log_buf + log_buf_len - chars + end;
2603 dumper->cur_idx = clear_idx; 1795 l1 = chars - end;
2604 dumper->next_seq = log_next_seq;
2605 dumper->next_idx = log_next_idx;
2606 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2607
2608 /* invoke dumper which will iterate over records */
2609 dumper->dump(dumper, reason);
2610
2611 /* reset iterator */
2612 dumper->active = false;
2613 }
2614 rcu_read_unlock();
2615}
2616
2617/**
2618 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2619 * @dumper: registered kmsg dumper
2620 * @syslog: include the "<4>" prefixes
2621 * @line: buffer to copy the line to
2622 * @size: maximum size of the buffer
2623 * @len: length of line placed into buffer
2624 *
2625 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2626 * record, and copy one record into the provided buffer.
2627 *
2628 * Consecutive calls will return the next available record moving
2629 * towards the end of the buffer with the youngest messages.
2630 *
2631 * A return value of FALSE indicates that there are no more records to
2632 * read.
2633 *
2634 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2635 */
2636bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2637 char *line, size_t size, size_t *len)
2638{
2639 struct log *msg;
2640 size_t l = 0;
2641 bool ret = false;
2642
2643 if (!dumper->active)
2644 goto out;
2645 1796
2646 if (dumper->cur_seq < log_first_seq) { 1797 s2 = log_buf;
2647 /* messages are gone, move to first available one */ 1798 l2 = end;
2648 dumper->cur_seq = log_first_seq; 1799 } else {
2649 dumper->cur_idx = log_first_idx; 1800 s1 = "";
2650 } 1801 l1 = 0;
2651
2652 /* last entry */
2653 if (dumper->cur_seq >= log_next_seq)
2654 goto out;
2655
2656 msg = log_from_idx(dumper->cur_idx);
2657 l = msg_print_text(msg, 0, syslog, line, size);
2658
2659 dumper->cur_idx = log_next(dumper->cur_idx);
2660 dumper->cur_seq++;
2661 ret = true;
2662out:
2663 if (len)
2664 *len = l;
2665 return ret;
2666}
2667
2668/**
2669 * kmsg_dump_get_line - retrieve one kmsg log line
2670 * @dumper: registered kmsg dumper
2671 * @syslog: include the "<4>" prefixes
2672 * @line: buffer to copy the line to
2673 * @size: maximum size of the buffer
2674 * @len: length of line placed into buffer
2675 *
2676 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2677 * record, and copy one record into the provided buffer.
2678 *
2679 * Consecutive calls will return the next available record moving
2680 * towards the end of the buffer with the youngest messages.
2681 *
2682 * A return value of FALSE indicates that there are no more records to
2683 * read.
2684 */
2685bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2686 char *line, size_t size, size_t *len)
2687{
2688 unsigned long flags;
2689 bool ret;
2690
2691 raw_spin_lock_irqsave(&logbuf_lock, flags);
2692 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2693 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2694
2695 return ret;
2696}
2697EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
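A sketch of a dumper written against the record-iterator interface on the removed (left-hand) side of this hunk; kmsg_dump_register() and struct kmsg_dumper are declared in <linux/kmsg_dump.h>. Illustrative only, not part of this patch:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void demo_dump(struct kmsg_dumper *dumper,
                      enum kmsg_dump_reason reason)
{
        static char line[256];
        size_t len;

        /* Walk the log one record at a time, oldest first. */
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                ;       /* push line[0..len) to a persistent store */
}

static struct kmsg_dumper demo_dumper = {
        .dump = demo_dump,
};

static int __init demo_dump_init(void)
{
        return kmsg_dump_register(&demo_dumper);
}
module_init(demo_dump_init);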
2698
2699/**
2700 * kmsg_dump_get_buffer - copy kmsg log lines
2701 * @dumper: registered kmsg dumper
2702 * @syslog: include the "<4>" prefixes
2703 * @buf: buffer to copy the line to
2704 * @size: maximum size of the buffer
2705 * @len: length of line placed into buffer
2706 *
2707 * Start at the end of the kmsg buffer and fill the provided buffer
 2708 * with as many of the *youngest* kmsg records as fit into it.
2709 * If the buffer is large enough, all available kmsg records will be
2710 * copied with a single call.
2711 *
2712 * Consecutive calls will fill the buffer with the next block of
2713 * available older records, not including the earlier retrieved ones.
2714 *
2715 * A return value of FALSE indicates that there are no more records to
2716 * read.
2717 */
2718bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2719 char *buf, size_t size, size_t *len)
2720{
2721 unsigned long flags;
2722 u64 seq;
2723 u32 idx;
2724 u64 next_seq;
2725 u32 next_idx;
2726 enum log_flags prev;
2727 size_t l = 0;
2728 bool ret = false;
2729
2730 if (!dumper->active)
2731 goto out;
2732
2733 raw_spin_lock_irqsave(&logbuf_lock, flags);
2734 if (dumper->cur_seq < log_first_seq) {
2735 /* messages are gone, move to first available one */
2736 dumper->cur_seq = log_first_seq;
2737 dumper->cur_idx = log_first_idx;
2738 }
2739
2740 /* last entry */
2741 if (dumper->cur_seq >= dumper->next_seq) {
2742 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2743 goto out;
2744 }
2745
2746 /* calculate length of entire buffer */
2747 seq = dumper->cur_seq;
2748 idx = dumper->cur_idx;
2749 prev = 0;
2750 while (seq < dumper->next_seq) {
2751 struct log *msg = log_from_idx(idx);
2752
2753 l += msg_print_text(msg, prev, true, NULL, 0);
2754 idx = log_next(idx);
2755 seq++;
2756 prev = msg->flags;
2757 }
2758
2759 /* move first record forward until length fits into the buffer */
2760 seq = dumper->cur_seq;
2761 idx = dumper->cur_idx;
2762 prev = 0;
2763 while (l > size && seq < dumper->next_seq) {
2764 struct log *msg = log_from_idx(idx);
2765
2766 l -= msg_print_text(msg, prev, true, NULL, 0);
2767 idx = log_next(idx);
2768 seq++;
2769 prev = msg->flags;
2770 }
2771
 2772 /* last message in next iteration */
2773 next_seq = seq;
2774 next_idx = idx;
2775
2776 l = 0;
2777 prev = 0;
2778 while (seq < dumper->next_seq) {
2779 struct log *msg = log_from_idx(idx);
2780 1802
2781 l += msg_print_text(msg, prev, syslog, buf + l, size - l); 1803 s2 = log_buf + end - chars;
2782 idx = log_next(idx); 1804 l2 = chars;
2783 seq++;
2784 prev = msg->flags;
2785 } 1805 }
2786 1806
2787 dumper->next_seq = next_seq; 1807 rcu_read_lock();
2788 dumper->next_idx = next_idx; 1808 list_for_each_entry_rcu(dumper, &dump_list, list)
2789 ret = true; 1809 dumper->dump(dumper, reason, s1, l1, s2, l2);
2790 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 1810 rcu_read_unlock();
2791out:
2792 if (len)
2793 *len = l;
2794 return ret;
2795}
2796EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
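The bulk variant, again against the left-hand interface documented above: one call copies as many of the youngest records as fit, and repeated calls step backwards through older blocks. Illustrative sketch only; the 4 KiB buffer size is an arbitrary assumption:

#include <linux/kmsg_dump.h>

static char demo_buf[4096];

static void demo_dump_buffer(struct kmsg_dumper *dumper,
                             enum kmsg_dump_reason reason)
{
        size_t len;

        /* Youngest records first; older blocks on subsequent calls. */
        while (kmsg_dump_get_buffer(dumper, false, demo_buf,
                                    sizeof(demo_buf), &len)) {
                /* hand demo_buf[0..len) to the platform's storage */
        }
}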
2797
2798/**
 2799 * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
2800 * @dumper: registered kmsg dumper
2801 *
2802 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2803 * kmsg_dump_get_buffer() can be called again and used multiple
2804 * times within the same dumper.dump() callback.
2805 *
2806 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2807 */
2808void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2809{
2810 dumper->cur_seq = clear_seq;
2811 dumper->cur_idx = clear_idx;
2812 dumper->next_seq = log_next_seq;
2813 dumper->next_idx = log_next_idx;
2814}
2815
2816/**
 2817 * kmsg_dump_rewind - reset the iterator
2818 * @dumper: registered kmsg dumper
2819 *
2820 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2821 * kmsg_dump_get_buffer() can be called again and used multiple
2822 * times within the same dumper.dump() callback.
2823 */
2824void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2825{
2826 unsigned long flags;
2827
2828 raw_spin_lock_irqsave(&logbuf_lock, flags);
2829 kmsg_dump_rewind_nolock(dumper);
2830 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2831} 1811}
2832EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
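kmsg_dump_rewind() above lets a dump() callback make more than one pass over the same records; a minimal illustrative sketch:

#include <linux/kernel.h>
#include <linux/kmsg_dump.h>

static void demo_dump_twice(struct kmsg_dumper *dumper,
                            enum kmsg_dump_reason reason)
{
        static char line[256];
        size_t len, count = 0;

        /* First pass: size up the job. */
        while (kmsg_dump_get_line(dumper, false, line, sizeof(line), &len))
                count++;
        pr_info("demo: dumping %zu records\n", count);

        kmsg_dump_rewind(dumper);       /* back to the oldest record */

        /* Second pass: actually emit the records. */
        while (kmsg_dump_get_line(dumper, false, line, sizeof(line), &len))
                ;       /* write out line[0..len) */
}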
2833#endif 1812#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42..961b389fe52 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,13 +8,12 @@
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, 8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004 9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling, 10 * Consolidation of architecture support code for profiling,
11 * Nadia Yvette Chambers, Oracle, July 2004 11 * William Irwin, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables 12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, Nadia Yvette Chambers, 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 * Oracle, 2004
15 */ 14 */
16 15
17#include <linux/export.h> 16#include <linux/module.h>
18#include <linux/profile.h> 17#include <linux/profile.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/notifier.h> 19#include <linux/notifier.h>
@@ -257,7 +256,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
257 * pagetable hash functions, but uses a full hashtable full of finite 256 * pagetable hash functions, but uses a full hashtable full of finite
258 * collision chains, not just pairs of them. 257 * collision chains, not just pairs of them.
259 * 258 *
260 * -- nyc 259 * -- wli
261 */ 260 */
262static void __profile_flip_buffers(void *unused) 261static void __profile_flip_buffers(void *unused)
263{ 262{
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1599157336a..67d1fdd3c55 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/export.h> 11#include <linux/module.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
@@ -172,16 +172,7 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
172 return ret; 172 return ret;
173} 173}
174 174
175static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) 175int __ptrace_may_access(struct task_struct *task, unsigned int mode)
176{
177 if (mode & PTRACE_MODE_NOAUDIT)
178 return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
179 else
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181}
182
183/* Returns 0 on success, -errno on denial. */
184static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
185{ 176{
186 const struct cred *cred = current_cred(), *tcred; 177 const struct cred *cred = current_cred(), *tcred;
187 178
@@ -199,14 +190,15 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
199 return 0; 190 return 0;
200 rcu_read_lock(); 191 rcu_read_lock();
201 tcred = __task_cred(task); 192 tcred = __task_cred(task);
202 if (uid_eq(cred->uid, tcred->euid) && 193 if (cred->user->user_ns == tcred->user->user_ns &&
203 uid_eq(cred->uid, tcred->suid) && 194 (cred->uid == tcred->euid &&
204 uid_eq(cred->uid, tcred->uid) && 195 cred->uid == tcred->suid &&
205 gid_eq(cred->gid, tcred->egid) && 196 cred->uid == tcred->uid &&
206 gid_eq(cred->gid, tcred->sgid) && 197 cred->gid == tcred->egid &&
207 gid_eq(cred->gid, tcred->gid)) 198 cred->gid == tcred->sgid &&
199 cred->gid == tcred->gid))
208 goto ok; 200 goto ok;
209 if (ptrace_has_cap(tcred->user_ns, mode)) 201 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
210 goto ok; 202 goto ok;
211 rcu_read_unlock(); 203 rcu_read_unlock();
212 return -EPERM; 204 return -EPERM;
@@ -215,12 +207,8 @@ ok:
215 smp_rmb(); 207 smp_rmb();
216 if (task->mm) 208 if (task->mm)
217 dumpable = get_dumpable(task->mm); 209 dumpable = get_dumpable(task->mm);
218 rcu_read_lock(); 210 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
219 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
220 rcu_read_unlock();
221 return -EPERM; 211 return -EPERM;
222 }
223 rcu_read_unlock();
224 212
225 return security_ptrace_access_check(task, mode); 213 return security_ptrace_access_check(task, mode);
226} 214}
@@ -235,22 +223,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
235} 223}
236 224
237static int ptrace_attach(struct task_struct *task, long request, 225static int ptrace_attach(struct task_struct *task, long request,
238 unsigned long addr,
239 unsigned long flags) 226 unsigned long flags)
240{ 227{
241 bool seize = (request == PTRACE_SEIZE); 228 bool seize = (request == PTRACE_SEIZE);
242 int retval; 229 int retval;
243 230
231 /*
232 * SEIZE will enable new ptrace behaviors which will be implemented
233 * gradually. SEIZE_DEVEL is used to prevent applications
234 * expecting full SEIZE behaviors trapping on kernel commits which
235 * are still in the process of implementing them.
236 *
237 * Only test programs for new ptrace behaviors being implemented
238 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
239 *
240 * Once SEIZE behaviors are completely implemented, this flag and
241 * the following test will be removed.
242 */
244 retval = -EIO; 243 retval = -EIO;
245 if (seize) { 244 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
246 if (addr != 0) 245 goto out;
247 goto out;
248 if (flags & ~(unsigned long)PTRACE_O_MASK)
249 goto out;
250 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
251 } else {
252 flags = PT_PTRACED;
253 }
254 246
255 audit_ptrace(task); 247 audit_ptrace(task);
256 248
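On the right-hand side of this hunk PTRACE_SEIZE is refused with -EIO unless userspace passes PTRACE_SEIZE_DEVEL in the data argument. A hedged userspace sketch of that call; the fallback constants are assumptions taken from the UAPI headers of this kernel generation and may be unnecessary with a newer libc:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <stdio.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE            0x4206
#endif
#ifndef PTRACE_SEIZE_DEVEL
#define PTRACE_SEIZE_DEVEL      0x80000000
#endif

static int demo_seize(pid_t pid)
{
        /* Without PTRACE_SEIZE_DEVEL in data the kernel above says -EIO. */
        if (ptrace(PTRACE_SEIZE, pid, NULL,
                   (void *)(unsigned long)PTRACE_SEIZE_DEVEL) == -1) {
                perror("PTRACE_SEIZE");
                return -1;
        }
        return 0;
}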
@@ -262,7 +254,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 254
263 /* 255 /*
264 * Protect exec's credential calculations against our interference; 256 * Protect exec's credential calculations against our interference;
265 * SUID, SGID and LSM creds get determined differently 257 * interference; SUID, SGID and LSM creds get determined differently
266 * under ptrace. 258 * under ptrace.
267 */ 259 */
268 retval = -ERESTARTNOINTR; 260 retval = -ERESTARTNOINTR;
@@ -282,13 +274,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 274 if (task->ptrace)
283 goto unlock_tasklist; 275 goto unlock_tasklist;
284 276
277 task->ptrace = PT_PTRACED;
285 if (seize) 278 if (seize)
286 flags |= PT_SEIZED; 279 task->ptrace |= PT_SEIZED;
287 rcu_read_lock(); 280 if (task_ns_capable(task, CAP_SYS_PTRACE))
288 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) 281 task->ptrace |= PT_PTRACE_CAP;
289 flags |= PT_PTRACE_CAP;
290 rcu_read_unlock();
291 task->ptrace = flags;
292 282
293 __ptrace_link(task, current); 283 __ptrace_link(task, current);
294 284
@@ -463,9 +453,6 @@ void exit_ptrace(struct task_struct *tracer)
463 return; 453 return;
464 454
465 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 455 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
466 if (unlikely(p->ptrace & PT_EXITKILL))
467 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
468
469 if (__ptrace_detach(tracer, p)) 456 if (__ptrace_detach(tracer, p))
470 list_add(&p->ptrace_entry, &ptrace_dead); 457 list_add(&p->ptrace_entry, &ptrace_dead);
471 } 458 }
@@ -533,18 +520,30 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
533 520
534static int ptrace_setoptions(struct task_struct *child, unsigned long data) 521static int ptrace_setoptions(struct task_struct *child, unsigned long data)
535{ 522{
536 unsigned flags; 523 child->ptrace &= ~PT_TRACE_MASK;
537 524
538 if (data & ~(unsigned long)PTRACE_O_MASK) 525 if (data & PTRACE_O_TRACESYSGOOD)
539 return -EINVAL; 526 child->ptrace |= PT_TRACESYSGOOD;
540 527
541 /* Avoid intermediate state when all opts are cleared */ 528 if (data & PTRACE_O_TRACEFORK)
542 flags = child->ptrace; 529 child->ptrace |= PT_TRACE_FORK;
543 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
544 flags |= (data << PT_OPT_FLAG_SHIFT);
545 child->ptrace = flags;
546 530
547 return 0; 531 if (data & PTRACE_O_TRACEVFORK)
532 child->ptrace |= PT_TRACE_VFORK;
533
534 if (data & PTRACE_O_TRACECLONE)
535 child->ptrace |= PT_TRACE_CLONE;
536
537 if (data & PTRACE_O_TRACEEXEC)
538 child->ptrace |= PT_TRACE_EXEC;
539
540 if (data & PTRACE_O_TRACEVFORKDONE)
541 child->ptrace |= PT_TRACE_VFORK_DONE;
542
543 if (data & PTRACE_O_TRACEEXIT)
544 child->ptrace |= PT_TRACE_EXIT;
545
546 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
548} 547}
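The PTRACE_O_* mask consumed by ptrace_setoptions() above is normally set from userspace after an attach; a minimal illustrative sketch, not part of this patch:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

static long demo_trace_forks(pid_t pid)
{
        if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1)
                return -1;
        waitpid(pid, NULL, 0);          /* wait for the attach stop */

        /* Any bit outside PTRACE_O_MASK makes the kernel return -EINVAL. */
        return ptrace(PTRACE_SETOPTIONS, pid, NULL,
                      (void *)(long)(PTRACE_O_TRACESYSGOOD |
                                     PTRACE_O_TRACEFORK |
                                     PTRACE_O_TRACEEXIT));
}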
549 548
550static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 549static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -884,7 +883,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
884 } 883 }
885 884
886 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 885 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
887 ret = ptrace_attach(child, request, addr, data); 886 ret = ptrace_attach(child, request, data);
888 /* 887 /*
889 * Some architectures need to do book-keeping after 888 * Some architectures need to do book-keeping after
890 * a ptrace attach. 889 * a ptrace attach.
@@ -1027,7 +1026,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1027 } 1026 }
1028 1027
1029 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1028 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1030 ret = ptrace_attach(child, request, addr, data); 1029 ret = ptrace_attach(child, request, data);
1031 /* 1030 /*
1032 * Some architectures need to do book-keeping after 1031 * Some architectures need to do book-keeping after
1033 * a ptrace attach. 1032 * a ptrace attach.
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed6..37fa9b99ad5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Range add and subtract 2 * Range add and subtract
3 */ 3 */
4#include <linux/kernel.h> 4#include <linux/module.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7
diff --git a/kernel/rcu.h b/kernel/rcu.h
deleted file mode 100644
index 20dfba576c2..00000000000
--- a/kernel/rcu.h
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 *
37 * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
38 * that counts the number of process-based reasons why RCU cannot
39 * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
40 * is the value used to increment or decrement this field.
41 *
42 * The rest of the bits could in principle be used to count interrupts,
43 * but this would mean that a negative-one value in the interrupt
44 * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
45 * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
46 * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
47 * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
48 * initial exit from idle.
49 */
50#define DYNTICK_TASK_NEST_WIDTH 7
51#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
52#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
53#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
54#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
55#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
56 DYNTICK_TASK_FLAG)
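A worked example of the values above, assuming a 64-bit long long (LLONG_MAX == 2^63 - 1); illustrative only:

/*
 * DYNTICK_TASK_NEST_VALUE = ((2^63 - 1) >> 7) + 1 = 2^56
 * DYNTICK_TASK_NEST_MASK  = (2^63 - 1) - 2^56 + 1 = bits 56..62
 * DYNTICK_TASK_FLAG       = (2^56 / 8) * 2        = 2^54
 * DYNTICK_TASK_MASK       = (2^56 / 8) * 3        = 2^54 + 2^53
 * DYNTICK_TASK_EXIT_IDLE  = 2^56 + 2^54
 *
 * Process-level nesting is therefore counted in the 7-bit field at
 * bits 56..62, the guard bits sit at 53..54, and the remaining
 * low-order bits are free to count interrupt nesting.
 */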
57
58/*
59 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
60 * by call_rcu() and rcu callback execution, and are therefore not part of the
61 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
62 */
63
64#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
65# define STATE_RCU_HEAD_READY 0
66# define STATE_RCU_HEAD_QUEUED 1
67
68extern struct debug_obj_descr rcuhead_debug_descr;
69
70static inline void debug_rcu_head_queue(struct rcu_head *head)
71{
72 debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED);
76}
77
78static inline void debug_rcu_head_unqueue(struct rcu_head *head)
79{
80 debug_object_active_state(head, &rcuhead_debug_descr,
81 STATE_RCU_HEAD_QUEUED,
82 STATE_RCU_HEAD_READY);
83 debug_object_deactivate(head, &rcuhead_debug_descr);
84}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head)
87{
88}
89
90static inline void debug_rcu_head_unqueue(struct rcu_head *head)
91{
92}
93#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
94
95extern void kfree(const void *);
96
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
98{
99 unsigned long offset = (unsigned long)head->func;
100
101 if (__is_kfree_rcu_offset(offset)) {
102 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
103 kfree((void *)head - offset);
104 return 1;
105 } else {
106 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
107 head->func(head);
108 return 0;
109 }
110}
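The offset test in __rcu_reclaim() is what makes kfree_rcu() work: the "callback" stored in head->func is really the offset of the rcu_head inside the enclosing object, and the reclaim path turns it back into a kfree(). A caller-side sketch (illustrative only; struct demo_node is hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
        int value;
        struct rcu_head rcu;    /* its offset is what __rcu_reclaim() sees */
};

static void demo_release(struct demo_node *p)
{
        /* Defers kfree(p) until a grace period has elapsed. */
        kfree_rcu(p, rcu);
}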
111
112extern int rcu_expedited;
113
114#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b4..ddddb320be6 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,92 +43,8 @@
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/export.h>
47#include <linux/hardirq.h>
48#include <linux/delay.h>
49#include <linux/module.h> 46#include <linux/module.h>
50 47#include <linux/hardirq.h>
51#define CREATE_TRACE_POINTS
52#include <trace/events/rcu.h>
53
54#include "rcu.h"
55
56module_param(rcu_expedited, int, 0);
57
58#ifdef CONFIG_PREEMPT_RCU
59
60/*
61 * Preemptible RCU implementation for rcu_read_lock().
62 * Just increment ->rcu_read_lock_nesting, shared state will be updated
63 * if we block.
64 */
65void __rcu_read_lock(void)
66{
67 current->rcu_read_lock_nesting++;
68 barrier(); /* critical section after entry code. */
69}
70EXPORT_SYMBOL_GPL(__rcu_read_lock);
71
72/*
73 * Preemptible RCU implementation for rcu_read_unlock().
74 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
75 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
76 * invoke rcu_read_unlock_special() to clean up after a context switch
77 * in an RCU read-side critical section and other special cases.
78 */
79void __rcu_read_unlock(void)
80{
81 struct task_struct *t = current;
82
83 if (t->rcu_read_lock_nesting != 1) {
84 --t->rcu_read_lock_nesting;
85 } else {
86 barrier(); /* critical section before exit code. */
87 t->rcu_read_lock_nesting = INT_MIN;
88#ifdef CONFIG_PROVE_RCU_DELAY
89 udelay(10); /* Make preemption more probable. */
90#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
91 barrier(); /* assign before ->rcu_read_unlock_special load */
92 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
93 rcu_read_unlock_special(t);
94 barrier(); /* ->rcu_read_unlock_special load before assign */
95 t->rcu_read_lock_nesting = 0;
96 }
97#ifdef CONFIG_PROVE_LOCKING
98 {
99 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
100
101 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
102 }
103#endif /* #ifdef CONFIG_PROVE_LOCKING */
104}
105EXPORT_SYMBOL_GPL(__rcu_read_unlock);
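From a reader's point of view the two primitives above sit behind the usual rcu_read_lock()/rcu_dereference()/rcu_read_unlock() pattern; a minimal sketch (illustrative only, demo_ptr is a hypothetical RCU-protected pointer):

#include <linux/rcupdate.h>

struct demo_state {
        int value;
};

static struct demo_state __rcu *demo_ptr;

static int demo_read_value(void)
{
        struct demo_state *s;
        int val = -1;

        rcu_read_lock();        /* lands in __rcu_read_lock() above on
                                 * preemptible-RCU kernels */
        s = rcu_dereference(demo_ptr);
        if (s)
                val = s->value;
        rcu_read_unlock();
        return val;
}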
106
107/*
108 * Check for a task exiting while in a preemptible-RCU read-side
109 * critical section, clean up if so. No need to issue warnings,
110 * as debug_check_no_locks_held() already does this if lockdep
111 * is enabled.
112 */
113void exit_rcu(void)
114{
115 struct task_struct *t = current;
116
117 if (likely(list_empty(&current->rcu_node_entry)))
118 return;
119 t->rcu_read_lock_nesting = 1;
120 barrier();
121 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
122 __rcu_read_unlock();
123}
124
125#else /* #ifdef CONFIG_PREEMPT_RCU */
126
127void exit_rcu(void)
128{
129}
130
131#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
132 48
133#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
134static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
@@ -167,34 +83,22 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
167 * section. 83 * section.
168 * 84 *
169 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
170 *
171 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
172 * offline from an RCU perspective, so check for those as well.
173 */ 86 */
174int rcu_read_lock_bh_held(void) 87int rcu_read_lock_bh_held(void)
175{ 88{
176 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
177 return 1; 90 return 1;
178 if (rcu_is_cpu_idle())
179 return 0;
180 if (!rcu_lockdep_current_cpu_online())
181 return 0;
182 return in_softirq() || irqs_disabled(); 91 return in_softirq() || irqs_disabled();
183} 92}
184EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
185 94
186#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
187 96
188struct rcu_synchronize {
189 struct rcu_head head;
190 struct completion completion;
191};
192
193/* 97/*
194 * Awaken the corresponding synchronize_rcu() instance now that a 98 * Awaken the corresponding synchronize_rcu() instance now that a
195 * grace period has elapsed. 99 * grace period has elapsed.
196 */ 100 */
197static void wakeme_after_rcu(struct rcu_head *head) 101void wakeme_after_rcu(struct rcu_head *head)
198{ 102{
199 struct rcu_synchronize *rcu; 103 struct rcu_synchronize *rcu;
200 104
@@ -202,20 +106,6 @@ static void wakeme_after_rcu(struct rcu_head *head)
202 complete(&rcu->completion); 106 complete(&rcu->completion);
203} 107}
204 108
205void wait_rcu_gp(call_rcu_func_t crf)
206{
207 struct rcu_synchronize rcu;
208
209 init_rcu_head_on_stack(&rcu.head);
210 init_completion(&rcu.completion);
211 /* Will wake me after RCU finished. */
212 crf(&rcu.head, wakeme_after_rcu);
213 /* Wait for it. */
214 wait_for_completion(&rcu.completion);
215 destroy_rcu_head_on_stack(&rcu.head);
216}
217EXPORT_SYMBOL_GPL(wait_rcu_gp);
218
219#ifdef CONFIG_PROVE_RCU 109#ifdef CONFIG_PROVE_RCU
220/* 110/*
221 * wrapper function to avoid #include problems. 111 * wrapper function to avoid #include problems.
@@ -402,13 +292,3 @@ struct debug_obj_descr rcuhead_debug_descr = {
402}; 292};
403EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 293EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 294#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
405
406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
408{
409 trace_rcu_torture_read(rcutorturename, rhp);
410}
411EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
412#else
413#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
414#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2..7bbac7d0f5a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,12 +22,13 @@
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 */ 24 */
25#include <linux/moduleparam.h>
25#include <linux/completion.h> 26#include <linux/completion.h>
26#include <linux/interrupt.h> 27#include <linux/interrupt.h>
27#include <linux/notifier.h> 28#include <linux/notifier.h>
28#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
29#include <linux/kernel.h> 30#include <linux/kernel.h>
30#include <linux/export.h> 31#include <linux/module.h>
31#include <linux/mutex.h> 32#include <linux/mutex.h>
32#include <linux/sched.h> 33#include <linux/sched.h>
33#include <linux/types.h> 34#include <linux/types.h>
@@ -36,167 +37,47 @@
36#include <linux/cpu.h> 37#include <linux/cpu.h>
37#include <linux/prefetch.h> 38#include <linux/prefetch.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40#include <trace/events/rcu.h> 41static struct task_struct *rcu_kthread_task;
41#endif /* #else #ifdef CONFIG_RCU_TRACE */ 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42 43static unsigned long have_rcu_kthread_work;
43#include "rcu.h"
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_callbacks(void); 47static void invoke_rcu_kthread(void);
48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static void rcu_process_callbacks(struct softirq_action *unused); 49static int rcu_kthread(void *arg);
50static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 56#ifdef CONFIG_NO_HZ
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval)
60{
61 if (newval) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 rcu_dynticks_nesting, newval));
64 rcu_dynticks_nesting = newval;
65 return;
66 }
67 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
68 if (!is_idle_task(current)) {
69 struct task_struct *idle = idle_task(smp_processor_id());
70
71 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
72 rcu_dynticks_nesting, newval));
73 ftrace_dump(DUMP_ALL);
74 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
75 current->pid, current->comm,
76 idle->pid, idle->comm); /* must be idle task! */
77 }
78 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
79 barrier();
80 rcu_dynticks_nesting = newval;
81}
82
83/*
84 * Enter idle, which is an extended quiescent state if we have fully
85 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
86 */
87void rcu_idle_enter(void)
88{
89 unsigned long flags;
90 long long newval;
91
92 local_irq_save(flags);
93 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
94 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
95 DYNTICK_TASK_NEST_VALUE)
96 newval = 0;
97 else
98 newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
99 rcu_idle_enter_common(newval);
100 local_irq_restore(flags);
101}
102EXPORT_SYMBOL_GPL(rcu_idle_enter);
103
104/*
105 * Exit an interrupt handler towards idle.
106 */
107void rcu_irq_exit(void)
108{
109 unsigned long flags;
110 long long newval;
111
112 local_irq_save(flags);
113 newval = rcu_dynticks_nesting - 1;
114 WARN_ON_ONCE(newval < 0);
115 rcu_idle_enter_common(newval);
116 local_irq_restore(flags);
117}
118EXPORT_SYMBOL_GPL(rcu_irq_exit);
119 57
120/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 58static long rcu_dynticks_nesting = 1;
121static void rcu_idle_exit_common(long long oldval)
122{
123 if (oldval) {
124 RCU_TRACE(trace_rcu_dyntick("++=",
125 oldval, rcu_dynticks_nesting));
126 return;
127 }
128 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
129 if (!is_idle_task(current)) {
130 struct task_struct *idle = idle_task(smp_processor_id());
131
132 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
133 oldval, rcu_dynticks_nesting));
134 ftrace_dump(DUMP_ALL);
135 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
136 current->pid, current->comm,
137 idle->pid, idle->comm); /* must be idle task! */
138 }
139}
140 59
141/* 60/*
142 * Exit idle, so that we are no longer in an extended quiescent state. 61 * Enter dynticks-idle mode, which is an extended quiescent state
62 * if we have fully entered that mode (i.e., if the new value of
63 * dynticks_nesting is zero).
143 */ 64 */
144void rcu_idle_exit(void) 65void rcu_enter_nohz(void)
145{ 66{
146 unsigned long flags; 67 if (--rcu_dynticks_nesting == 0)
147 long long oldval; 68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
148
149 local_irq_save(flags);
150 oldval = rcu_dynticks_nesting;
151 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
152 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
153 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
154 else
155 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
156 rcu_idle_exit_common(oldval);
157 local_irq_restore(flags);
158} 69}
159EXPORT_SYMBOL_GPL(rcu_idle_exit);
160 70
161/* 71/*
162 * Enter an interrupt handler, moving away from idle. 72 * Exit dynticks-idle mode, so that we are no longer in an extended
73 * quiescent state.
163 */ 74 */
164void rcu_irq_enter(void) 75void rcu_exit_nohz(void)
165{ 76{
166 unsigned long flags;
167 long long oldval;
168
169 local_irq_save(flags);
170 oldval = rcu_dynticks_nesting;
171 rcu_dynticks_nesting++; 77 rcu_dynticks_nesting++;
172 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
173 rcu_idle_exit_common(oldval);
174 local_irq_restore(flags);
175}
176EXPORT_SYMBOL_GPL(rcu_irq_enter);
177
178#ifdef CONFIG_DEBUG_LOCK_ALLOC
179
180/*
181 * Test whether RCU thinks that the current CPU is idle.
182 */
183int rcu_is_cpu_idle(void)
184{
185 return !rcu_dynticks_nesting;
186} 78}
187EXPORT_SYMBOL(rcu_is_cpu_idle);
188 79
189#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 80#endif /* #ifdef CONFIG_NO_HZ */
190
191/*
192 * Test whether the current CPU was interrupted from idle. Nested
193 * interrupts don't count, we must be running at the first interrupt
194 * level.
195 */
196int rcu_is_cpu_rrupt_from_idle(void)
197{
198 return rcu_dynticks_nesting <= 1;
199}
200 81
201/* 82/*
202 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -215,6 +96,16 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
215} 96}
216 97
217/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
218 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
219 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
220 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -226,7 +117,7 @@ void rcu_sched_qs(int cpu)
226 local_irq_save(flags); 117 local_irq_save(flags);
227 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
228 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
229 invoke_rcu_callbacks(); 120 invoke_rcu_kthread();
230 local_irq_restore(flags); 121 local_irq_restore(flags);
231} 122}
232 123
@@ -239,19 +130,20 @@ void rcu_bh_qs(int cpu)
239 130
240 local_irq_save(flags); 131 local_irq_save(flags);
241 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
242 invoke_rcu_callbacks(); 133 invoke_rcu_kthread();
243 local_irq_restore(flags); 134 local_irq_restore(flags);
244} 135}
245 136
246/* 137/*
247 * Check to see if the scheduling-clock interrupt came from an extended 138 * Check to see if the scheduling-clock interrupt came from an extended
248 * quiescent state, and, if so, tell RCU about it. This function must 139 * quiescent state, and, if so, tell RCU about it.
249 * be called from hardirq context. It is normally called from the
250 * scheduling-clock interrupt.
251 */ 140 */
252void rcu_check_callbacks(int cpu, int user) 141void rcu_check_callbacks(int cpu, int user)
253{ 142{
254 if (user || rcu_is_cpu_rrupt_from_idle()) 143 if (user ||
144 (idle_cpu(cpu) &&
145 !in_softirq() &&
146 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
255 rcu_sched_qs(cpu); 147 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 148 else if (!in_softirq())
257 rcu_bh_qs(cpu); 149 rcu_bh_qs(cpu);
@@ -262,27 +154,18 @@ void rcu_check_callbacks(int cpu, int user)
 262 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
263 * whose grace period has elapsed. 155 * whose grace period has elapsed.
264 */ 156 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 158{
267 char *rn = NULL;
268 struct rcu_head *next, *list; 159 struct rcu_head *next, *list;
269 unsigned long flags; 160 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 161 RCU_TRACE(int cb_count = 0);
271 162
272 /* If no RCU callbacks ready to invoke, just return. */ 163 /* If no RCU callbacks ready to invoke, just return. */
273 if (&rcp->rcucblist == rcp->donetail) { 164 if (&rcp->rcucblist == rcp->donetail)
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist),
277 need_resched(),
278 is_idle_task(current),
279 rcu_is_callbacks_kthread()));
280 return; 165 return;
281 }
282 166
283 /* Move the ready-to-invoke callbacks to a local list. */ 167 /* Move the ready-to-invoke callbacks to a local list. */
284 local_irq_save(flags); 168 local_irq_save(flags);
285 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
286 list = rcp->rcucblist; 169 list = rcp->rcucblist;
287 rcp->rcucblist = *rcp->donetail; 170 rcp->rcucblist = *rcp->donetail;
288 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
@@ -293,28 +176,49 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
293 local_irq_restore(flags); 176 local_irq_restore(flags);
294 177
295 /* Invoke the callbacks on the local list. */ 178 /* Invoke the callbacks on the local list. */
296 RCU_TRACE(rn = rcp->name);
297 while (list) { 179 while (list) {
298 next = list->next; 180 next = list->next;
299 prefetch(next); 181 prefetch(next);
300 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
301 local_bh_disable(); 183 local_bh_disable();
302 __rcu_reclaim(rn, list); 184 __rcu_reclaim(list);
303 local_bh_enable(); 185 local_bh_enable();
304 list = next; 186 list = next;
305 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
306 } 188 }
307 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
308 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
309 is_idle_task(current),
310 rcu_is_callbacks_kthread()));
311} 190}
312 191
313static void rcu_process_callbacks(struct softirq_action *unused) 192/*
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
314{ 200{
315 __rcu_process_callbacks(&rcu_sched_ctrlblk); 201 unsigned long work;
316 __rcu_process_callbacks(&rcu_bh_ctrlblk); 202 unsigned long morework;
317 rcu_preempt_process_callbacks(); 203 unsigned long flags;
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
318} 222}
319 223
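
The rcu_kthread() loop above is a small producer/consumer handoff: invoke_rcu_kthread() sets have_rcu_kthread_work and wakes the wait queue, and the kthread samples and clears the flag with interrupts disabled so that a wakeup arriving between the test and the sleep is never lost (wait_event_interruptible() re-checks the condition after queueing). A stripped-down sketch of the same pattern, with illustrative names and a placeholder payload:

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/interrupt.h>

    static DECLARE_WAIT_QUEUE_HEAD(work_wq);
    static unsigned long have_work;

    static void do_work(void)                 /* placeholder payload */
    {
    }

    static void post_work(void)               /* producer, may run in irq context */
    {
            have_work = 1;
            wake_up(&work_wq);
    }

    static int worker(void *unused)           /* consumer, started with kthread_run() */
    {
            unsigned long flags, work;

            for (;;) {
                    wait_event_interruptible(work_wq, have_work != 0);
                    local_irq_save(flags);    /* sample and clear atomically w.r.t. irqs */
                    work = have_work;
                    have_work = 0;
                    local_irq_restore(flags);
                    if (work)
                            do_work();
                    schedule_timeout_interruptible(1);  /* leave CPU for others */
            }
            return 0;                         /* not reached */
    }
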
320/* 224/*
@@ -332,10 +236,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
332 */ 236 */
333void synchronize_sched(void) 237void synchronize_sched(void)
334{ 238{
335 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
336 !lock_is_held(&rcu_lock_map) &&
337 !lock_is_held(&rcu_sched_lock_map),
338 "Illegal synchronize_sched() in RCU read-side critical section");
339 cond_resched(); 239 cond_resched();
340} 240}
341EXPORT_SYMBOL_GPL(synchronize_sched); 241EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -380,3 +280,45 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
380 __call_rcu(head, func, &rcu_bh_ctrlblk); 280 __call_rcu(head, func, &rcu_bh_ctrlblk);
381} 281}
382EXPORT_SYMBOL_GPL(call_rcu_bh); 282EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
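
On this single-CPU (tiny) implementation, rcu_barrier_bh() and rcu_barrier_sched() reduce to "post one callback and wait for it": callbacks run in posting order here, so waiting for the last-posted callback waits for all earlier ones, and the stock wakeme_after_rcu() helper simply completes the on-stack completion. The usual reason a caller needs such a barrier is module unload, where every callback posted by the module must have run before its code and data go away. A hypothetical exit path; the example_* names are assumptions, not from this patch:

    static struct kmem_cache *example_cache;      /* assumed: created at init time */
    static void example_disable_new_work(void);   /* assumed: stops posting callbacks */

    static void __exit example_exit(void)
    {
            /* Assumed: nothing posts further call_rcu_bh() callbacks past here. */
            example_disable_new_work();
            /* Wait until every already-posted callback has been invoked. */
            rcu_barrier_bh();
            /* Only now is it safe to tear down what the callbacks touch. */
            kmem_cache_destroy(example_cache);
    }
    module_exit(example_exit);
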
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309..f259c676195 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,30 +23,32 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h>
27#include <linux/debugfs.h> 26#include <linux/debugfs.h>
28#include <linux/seq_file.h> 27#include <linux/seq_file.h>
29 28
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
30/* Global control variables for rcupdate callback mechanism. */ 35/* Global control variables for rcupdate callback mechanism. */
31struct rcu_ctrlblk { 36struct rcu_ctrlblk {
32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 39 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 40 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 41};
38 42
39/* Definition for rcupdate control block. */ 43/* Definition for rcupdate control block. */
40static struct rcu_ctrlblk rcu_sched_ctrlblk = { 44static struct rcu_ctrlblk rcu_sched_ctrlblk = {
41 .donetail = &rcu_sched_ctrlblk.rcucblist, 45 .donetail = &rcu_sched_ctrlblk.rcucblist,
42 .curtail = &rcu_sched_ctrlblk.rcucblist, 46 .curtail = &rcu_sched_ctrlblk.rcucblist,
43 RCU_TRACE(.name = "rcu_sched")
44}; 47};
45 48
46static struct rcu_ctrlblk rcu_bh_ctrlblk = { 49static struct rcu_ctrlblk rcu_bh_ctrlblk = {
47 .donetail = &rcu_bh_ctrlblk.rcucblist, 50 .donetail = &rcu_bh_ctrlblk.rcucblist,
48 .curtail = &rcu_bh_ctrlblk.rcucblist, 51 .curtail = &rcu_bh_ctrlblk.rcucblist,
49 RCU_TRACE(.name = "rcu_bh")
50}; 52};
51 53
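
The three pointers in struct rcu_ctrlblk implement a single singly-linked callback list with two tail markers: new callbacks are appended at *->curtail, and ->donetail records how far into the list the already-elapsed grace period reaches, so "done" callbacks are always a prefix of the list. A sketch of the two core manipulations, with illustrative helper names (the real code does this inside __call_rcu() and rcu_qsctr_help()):

    /* Append a new callback at the tail of the combined list. */
    static void example_enqueue(struct rcu_ctrlblk *rcp, struct rcu_head *head)
    {
            head->next = NULL;
            *rcp->curtail = head;        /* link after the current last CB  */
            rcp->curtail = &head->next;  /* remember the new tail pointer   */
    }

    /* Grace period ended: everything currently queued becomes invokable. */
    static void example_note_grace_period_end(struct rcu_ctrlblk *rcp)
    {
            rcp->donetail = rcp->curtail;
    }
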
52#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -129,7 +131,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
129 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
130 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
131 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 134};
134 135
135static int rcu_preempted_readers_exp(void); 136static int rcu_preempted_readers_exp(void);
@@ -146,16 +147,6 @@ static int rcu_cpu_blocking_cur_gp(void)
146/* 147/*
147 * Check for a running RCU reader. Because there is only one CPU, 148 * Check for a running RCU reader. Because there is only one CPU,
148 * there can be but one running RCU reader at a time. ;-) 149 * there can be but one running RCU reader at a time. ;-)
149 *
150 * Returns zero if there are no running readers. Returns a positive
151 * number if there is at least one reader within its RCU read-side
152 * critical section. Returns a negative number if an outermost reader
153 * is in the midst of exiting from its RCU read-side critical section
154 *
155 * Returns zero if there are no running readers. Returns a positive
156 * number if there is at least one reader within its RCU read-side
157 * critical section. Returns a negative number if an outermost reader
158 * is in the midst of exiting from its RCU read-side critical section.
159 */ 150 */
160static int rcu_preempt_running_reader(void) 151static int rcu_preempt_running_reader(void)
161{ 152{
@@ -256,13 +247,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
256 247
257#include "rtmutex_common.h" 248#include "rtmutex_common.h"
258 249
259#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
260
261/* Controls for rcu_kthread() kthread. */
262static struct task_struct *rcu_kthread_task;
263static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
264static unsigned long have_rcu_kthread_work;
265
266/* 250/*
267 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
268 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -278,7 +262,7 @@ static int rcu_boost(void)
278 rcu_preempt_ctrlblk.exp_tasks == NULL) 262 rcu_preempt_ctrlblk.exp_tasks == NULL)
279 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
280 264
281 local_irq_save(flags); 265 raw_local_irq_save(flags);
282 266
283 /* 267 /*
284 * Recheck with irqs disabled: all tasks in need of boosting 268 * Recheck with irqs disabled: all tasks in need of boosting
@@ -287,7 +271,7 @@ static int rcu_boost(void)
287 */ 271 */
288 if (rcu_preempt_ctrlblk.boost_tasks == NULL && 272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
289 rcu_preempt_ctrlblk.exp_tasks == NULL) { 273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
290 local_irq_restore(flags); 274 raw_local_irq_restore(flags);
291 return 0; 275 return 0;
292 } 276 }
293 277
@@ -317,12 +301,13 @@ static int rcu_boost(void)
317 t = container_of(tb, struct task_struct, rcu_node_entry); 301 t = container_of(tb, struct task_struct, rcu_node_entry);
318 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
319 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
320 local_irq_restore(flags); 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
305 raw_local_irq_restore(flags);
321 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
322 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
323 308
324 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
325 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
326} 311}
327 312
328/* 313/*
@@ -349,10 +334,9 @@ static int rcu_initiate_boost(void)
349 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
350 rcu_preempt_ctrlblk.boost_tasks = 335 rcu_preempt_ctrlblk.boost_tasks =
351 rcu_preempt_ctrlblk.gp_tasks; 336 rcu_preempt_ctrlblk.gp_tasks;
352 invoke_rcu_callbacks(); 337 invoke_rcu_kthread();
353 } else { 338 } else
354 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
355 }
356 return 1; 340 return 1;
357} 341}
358 342
@@ -369,6 +353,14 @@ static void rcu_preempt_boost_start_gp(void)
369#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
370 354
371/* 355/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
372 * If there is no RCU priority boosting, we don't initiate boosting, 364 * If there is no RCU priority boosting, we don't initiate boosting,
373 * but we do indicate whether there are blocked readers blocking the 365 * but we do indicate whether there are blocked readers blocking the
374 * current grace period. 366 * current grace period.
@@ -435,7 +427,7 @@ static void rcu_preempt_cpu_qs(void)
435 427
436 /* If there are done callbacks, cause them to be invoked. */ 428 /* If there are done callbacks, cause them to be invoked. */
437 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
438 invoke_rcu_callbacks(); 430 invoke_rcu_kthread();
439} 431}
440 432
441/* 433/*
@@ -485,7 +477,7 @@ void rcu_preempt_note_context_switch(void)
485 unsigned long flags; 477 unsigned long flags;
486 478
487 local_irq_save(flags); /* must exclude scheduler_tick(). */ 479 local_irq_save(flags); /* must exclude scheduler_tick(). */
488 if (rcu_preempt_running_reader() > 0 && 480 if (rcu_preempt_running_reader() &&
489 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 481 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
490 482
491 /* Possibly blocking in an RCU read-side critical section. */ 483 /* Possibly blocking in an RCU read-side critical section. */
@@ -504,13 +496,6 @@ void rcu_preempt_note_context_switch(void)
504 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); 496 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
505 if (rcu_cpu_blocking_cur_gp()) 497 if (rcu_cpu_blocking_cur_gp())
506 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; 498 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
507 } else if (rcu_preempt_running_reader() < 0 &&
508 t->rcu_read_unlock_special) {
509 /*
510 * Complete exit from RCU read-side critical section on
511 * behalf of preempted instance of __rcu_read_unlock().
512 */
513 rcu_read_unlock_special(t);
514 } 499 }
515 500
516 /* 501 /*
@@ -527,19 +512,28 @@ void rcu_preempt_note_context_switch(void)
527} 512}
528 513
529/* 514/*
515 * Tiny-preemptible RCU implementation for rcu_read_lock().
516 * Just increment ->rcu_read_lock_nesting, shared state will be updated
517 * if we block.
518 */
519void __rcu_read_lock(void)
520{
521 current->rcu_read_lock_nesting++;
522 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
523}
524EXPORT_SYMBOL_GPL(__rcu_read_lock);
525
526/*
530 * Handle special cases during rcu_read_unlock(), such as needing to 527 * Handle special cases during rcu_read_unlock(), such as needing to
531 * notify RCU core processing or task having blocked during the RCU 528 * notify RCU core processing or task having blocked during the RCU
532 * read-side critical section. 529 * read-side critical section.
533 */ 530 */
534void rcu_read_unlock_special(struct task_struct *t) 531static void rcu_read_unlock_special(struct task_struct *t)
535{ 532{
536 int empty; 533 int empty;
537 int empty_exp; 534 int empty_exp;
538 unsigned long flags; 535 unsigned long flags;
539 struct list_head *np; 536 struct list_head *np;
540#ifdef CONFIG_RCU_BOOST
541 struct rt_mutex *rbmp = NULL;
542#endif /* #ifdef CONFIG_RCU_BOOST */
543 int special; 537 int special;
544 538
545 /* 539 /*
@@ -560,7 +554,7 @@ void rcu_read_unlock_special(struct task_struct *t)
560 rcu_preempt_cpu_qs(); 554 rcu_preempt_cpu_qs();
561 555
562 /* Hardware IRQ handlers cannot block. */ 556 /* Hardware IRQ handlers cannot block. */
563 if (in_irq() || in_serving_softirq()) { 557 if (in_irq()) {
564 local_irq_restore(flags); 558 local_irq_restore(flags);
565 return; 559 return;
566 } 560 }
@@ -605,16 +599,39 @@ void rcu_read_unlock_special(struct task_struct *t)
605 } 599 }
606#ifdef CONFIG_RCU_BOOST 600#ifdef CONFIG_RCU_BOOST
607 /* Unboost self if was boosted. */ 601 /* Unboost self if was boosted. */
608 if (t->rcu_boost_mutex != NULL) { 602 if (special & RCU_READ_UNLOCK_BOOSTED) {
609 rbmp = t->rcu_boost_mutex; 603 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
604 rt_mutex_unlock(t->rcu_boost_mutex);
610 t->rcu_boost_mutex = NULL; 605 t->rcu_boost_mutex = NULL;
611 rt_mutex_unlock(rbmp);
612 } 606 }
613#endif /* #ifdef CONFIG_RCU_BOOST */ 607#endif /* #ifdef CONFIG_RCU_BOOST */
614 local_irq_restore(flags); 608 local_irq_restore(flags);
615} 609}
616 610
617/* 611/*
612 * Tiny-preemptible RCU implementation for rcu_read_unlock().
613 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
614 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
615 * invoke rcu_read_unlock_special() to clean up after a context switch
616 * in an RCU read-side critical section and other special cases.
617 */
618void __rcu_read_unlock(void)
619{
620 struct task_struct *t = current;
621
622 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
623 --t->rcu_read_lock_nesting;
624 barrier(); /* decrement before load of ->rcu_read_unlock_special */
625 if (t->rcu_read_lock_nesting == 0 &&
626 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
627 rcu_read_unlock_special(t);
628#ifdef CONFIG_PROVE_LOCKING
629 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
630#endif /* #ifdef CONFIG_PROVE_LOCKING */
631}
632EXPORT_SYMBOL_GPL(__rcu_read_unlock);
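
The two primitives above are what the ordinary reader API expands to in the tiny-preemptible configuration: rcu_read_lock() bumps the per-task nesting counter, and the outermost rcu_read_unlock() funnels any deferred work (removal from the blocked-tasks list, unboosting, reporting a quiescent state) through rcu_read_unlock_special(). For reference, a minimal reader built on that API; struct example and example_ptr are illustrative, not part of this patch:

    struct example {
            int value;
    };
    static struct example __rcu *example_ptr;   /* updated elsewhere via rcu_assign_pointer() */

    static int example_read_value(void)
    {
            struct example *p;
            int val = -1;

            rcu_read_lock();                     /* ->rcu_read_lock_nesting++             */
            p = rcu_dereference(example_ptr);    /* safe only inside the read-side section */
            if (p)
                    val = p->value;
            rcu_read_unlock();                   /* may end in rcu_read_unlock_special()  */
            return val;
    }
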
633
634/*
618 * Check for a quiescent state from the current CPU. When a task blocks, 635 * Check for a quiescent state from the current CPU. When a task blocks,
619 * the task is recorded in the rcu_preempt_ctrlblk structure, which is 636 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
620 * checked elsewhere. This is called from the scheduling-clock interrupt. 637 * checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -631,10 +648,10 @@ static void rcu_preempt_check_callbacks(void)
631 rcu_preempt_cpu_qs(); 648 rcu_preempt_cpu_qs();
632 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 649 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
633 rcu_preempt_ctrlblk.rcb.donetail) 650 rcu_preempt_ctrlblk.rcb.donetail)
634 invoke_rcu_callbacks(); 651 invoke_rcu_kthread();
635 if (rcu_preempt_gp_in_progress() && 652 if (rcu_preempt_gp_in_progress() &&
636 rcu_cpu_blocking_cur_gp() && 653 rcu_cpu_blocking_cur_gp() &&
637 rcu_preempt_running_reader() > 0) 654 rcu_preempt_running_reader())
638 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 655 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
639} 656}
640 657
@@ -657,7 +674,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
657 */ 674 */
658static void rcu_preempt_process_callbacks(void) 675static void rcu_preempt_process_callbacks(void)
659{ 676{
660 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
661} 678}
662 679
663/* 680/*
@@ -680,6 +697,20 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
680} 697}
681EXPORT_SYMBOL_GPL(call_rcu); 698EXPORT_SYMBOL_GPL(call_rcu);
682 699
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
683/* 714/*
684 * synchronize_rcu - wait until a grace period has elapsed. 715 * synchronize_rcu - wait until a grace period has elapsed.
685 * 716 *
@@ -691,11 +722,6 @@ EXPORT_SYMBOL_GPL(call_rcu);
691 */ 722 */
692void synchronize_rcu(void) 723void synchronize_rcu(void)
693{ 724{
694 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
695 !lock_is_held(&rcu_lock_map) &&
696 !lock_is_held(&rcu_sched_lock_map),
697 "Illegal synchronize_rcu() in RCU read-side critical section");
698
699#ifdef CONFIG_DEBUG_LOCK_ALLOC 725#ifdef CONFIG_DEBUG_LOCK_ALLOC
700 if (!rcu_scheduler_active) 726 if (!rcu_scheduler_active)
701 return; 727 return;
@@ -706,10 +732,7 @@ void synchronize_rcu(void)
706 return; 732 return;
707 733
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 734 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 if (rcu_expedited) 735 rcu_barrier();
710 synchronize_rcu_expedited();
711 else
712 rcu_barrier();
713} 736}
714EXPORT_SYMBOL_GPL(synchronize_rcu); 737EXPORT_SYMBOL_GPL(synchronize_rcu);
715 738
@@ -782,9 +805,9 @@ void synchronize_rcu_expedited(void)
782 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
783 806
784 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
785 if (!rcu_preempted_readers_exp()) { 808 if (!rcu_preempted_readers_exp())
786 local_irq_restore(flags); 809 local_irq_restore(flags);
787 } else { 810 else {
788 rcu_initiate_boost(); 811 rcu_initiate_boost();
789 local_irq_restore(flags); 812 local_irq_restore(flags);
790 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
@@ -805,9 +828,27 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
805 */ 828 */
806int rcu_preempt_needs_cpu(void) 829int rcu_preempt_needs_cpu(void)
807{ 830{
831 if (!rcu_preempt_running_reader())
832 rcu_preempt_cpu_qs();
808 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 833 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
809} 834}
810 835
836/*
837 * Check for a task exiting while in a preemptible-RCU read-side
838 * critical section, clean up if so. No need to issue warnings,
839 * as debug_check_no_locks_held() already does this if lockdep
840 * is enabled.
841 */
842void exit_rcu(void)
843{
844 struct task_struct *t = current;
845
846 if (t->rcu_read_lock_nesting == 0)
847 return;
848 t->rcu_read_lock_nesting = 1;
849 __rcu_read_unlock();
850}
851
811#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 852#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
812 853
813#ifdef CONFIG_RCU_TRACE 854#ifdef CONFIG_RCU_TRACE
@@ -823,6 +864,15 @@ static void show_tiny_preempt_stats(struct seq_file *m)
823#endif /* #ifdef CONFIG_RCU_TRACE */ 864#endif /* #ifdef CONFIG_RCU_TRACE */
824 865
825/* 866/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
826 * Because preemptible RCU does not exist, it never has any callbacks 876 * Because preemptible RCU does not exist, it never has any callbacks
827 * to check. 877 * to check.
828 */ 878 */
@@ -848,112 +898,6 @@ static void rcu_preempt_process_callbacks(void)
848 898
849#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
850 900
851#ifdef CONFIG_RCU_BOOST
852
853/*
854 * Wake up rcu_kthread() to process callbacks now eligible for invocation
855 * or to boost readers.
856 */
857static void invoke_rcu_callbacks(void)
858{
859 have_rcu_kthread_work = 1;
860 if (rcu_kthread_task != NULL)
861 wake_up(&rcu_kthread_wq);
862}
863
864#ifdef CONFIG_RCU_TRACE
865
866/*
867 * Is the current CPU running the RCU-callbacks kthread?
868 * Caller must have preemption disabled.
869 */
870static bool rcu_is_callbacks_kthread(void)
871{
872 return rcu_kthread_task == current;
873}
874
875#endif /* #ifdef CONFIG_RCU_TRACE */
876
877/*
878 * This kthread invokes RCU callbacks whose grace periods have
879 * elapsed. It is awakened as needed, and takes the place of the
880 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
881 * This is a kthread, but it is never stopped, at least not until
882 * the system goes down.
883 */
884static int rcu_kthread(void *arg)
885{
886 unsigned long work;
887 unsigned long morework;
888 unsigned long flags;
889
890 for (;;) {
891 wait_event_interruptible(rcu_kthread_wq,
892 have_rcu_kthread_work != 0);
893 morework = rcu_boost();
894 local_irq_save(flags);
895 work = have_rcu_kthread_work;
896 have_rcu_kthread_work = morework;
897 local_irq_restore(flags);
898 if (work)
899 rcu_process_callbacks(NULL);
900 schedule_timeout_interruptible(1); /* Leave CPU for others. */
901 }
902
903 return 0; /* Not reached, but needed to shut gcc up. */
904}
905
906/*
907 * Spawn the kthread that invokes RCU callbacks.
908 */
909static int __init rcu_spawn_kthreads(void)
910{
911 struct sched_param sp;
912
913 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
914 sp.sched_priority = RCU_BOOST_PRIO;
915 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
916 return 0;
917}
918early_initcall(rcu_spawn_kthreads);
919
920#else /* #ifdef CONFIG_RCU_BOOST */
921
922/* Hold off callback invocation until early_initcall() time. */
923static int rcu_scheduler_fully_active __read_mostly;
924
925/*
926 * Start up softirq processing of callbacks.
927 */
928void invoke_rcu_callbacks(void)
929{
930 if (rcu_scheduler_fully_active)
931 raise_softirq(RCU_SOFTIRQ);
932}
933
934#ifdef CONFIG_RCU_TRACE
935
936/*
937 * There is no callback kthread, so this thread is never it.
938 */
939static bool rcu_is_callbacks_kthread(void)
940{
941 return false;
942}
943
944#endif /* #ifdef CONFIG_RCU_TRACE */
945
946static int __init rcu_scheduler_really_started(void)
947{
948 rcu_scheduler_fully_active = 1;
949 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
950 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
951 return 0;
952}
953early_initcall(rcu_scheduler_really_started);
954
955#endif /* #else #ifdef CONFIG_RCU_BOOST */
956
957#ifdef CONFIG_DEBUG_LOCK_ALLOC 901#ifdef CONFIG_DEBUG_LOCK_ALLOC
958#include <linux/kernel_stat.h> 902#include <linux/kernel_stat.h>
959 903
@@ -969,6 +913,12 @@ void __init rcu_scheduler_starting(void)
969 913
970#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
971 915
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
972#ifdef CONFIG_RCU_TRACE 922#ifdef CONFIG_RCU_TRACE
973 923
974#ifdef CONFIG_RCU_BOOST 924#ifdef CONFIG_RCU_BOOST
@@ -994,9 +944,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
994{ 944{
995 unsigned long flags; 945 unsigned long flags;
996 946
997 local_irq_save(flags); 947 raw_local_irq_save(flags);
998 rcp->qlen -= n; 948 rcp->qlen -= n;
999 local_irq_restore(flags); 949 raw_local_irq_restore(flags);
1000} 950}
1001 951
1002/* 952/*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85f..98f51b13bb7 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,27 +49,21 @@
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>");
53 54
54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
55static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
56static int stat_interval = 60; /* Interval between stats, in seconds. */ 57static int stat_interval; /* Interval between stats, in seconds. */
57 /* Zero means "only at end of test". */ 58 /* Defaults to "only at end of test". */
58static bool verbose; /* Print more debug info. */ 59static int verbose; /* Print more debug info. */
59static bool test_no_idle_hz = true; 60static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
60 /* Test RCU support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
68static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
69static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
70static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
71static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
72static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
73static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
74static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
75static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -79,7 +73,7 @@ module_param(nreaders, int, 0444);
79MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
80module_param(nfakewriters, int, 0444); 74module_param(nfakewriters, int, 0444);
81MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
82module_param(stat_interval, int, 0644); 76module_param(stat_interval, int, 0444);
83MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
84module_param(verbose, bool, 0444); 78module_param(verbose, bool, 0444);
85MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -97,18 +91,6 @@ module_param(fqs_holdoff, int, 0444);
97MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
98module_param(fqs_stutter, int, 0444); 92module_param(fqs_stutter, int, 0444);
99MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
100module_param(n_barrier_cbs, int, 0444);
101MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
102module_param(onoff_interval, int, 0444);
103MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
104module_param(onoff_holdoff, int, 0444);
105MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
106module_param(shutdown_secs, int, 0444);
107MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
108module_param(stall_cpu, int, 0444);
109MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
110module_param(stall_cpu_holdoff, int, 0444);
111MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
112module_param(test_boost, int, 0444); 94module_param(test_boost, int, 0444);
113MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
114module_param(test_boost_interval, int, 0444); 96module_param(test_boost_interval, int, 0444);
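
All of the knobs above are ordinary module parameters, so a typical interactive run sets them at load time, for example (values purely illustrative):

    modprobe rcutorture torture_type=rcu stat_interval=30 verbose=1 test_boost=2

When rcutorture is built into the kernel instead, the same parameters can be passed on the boot command line in the usual rcutorture.<param>=<value> form.
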
@@ -120,11 +102,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
120 102
121#define TORTURE_FLAG "-torture:" 103#define TORTURE_FLAG "-torture:"
122#define PRINTK_STRING(s) \ 104#define PRINTK_STRING(s) \
123 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) 105 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
124#define VERBOSE_PRINTK_STRING(s) \ 106#define VERBOSE_PRINTK_STRING(s) \
125 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) 107 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
126#define VERBOSE_PRINTK_ERRSTRING(s) \ 108#define VERBOSE_PRINTK_ERRSTRING(s) \
127 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) 109 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
128 110
129static char printk_buf[4096]; 111static char printk_buf[4096];
130 112
@@ -137,13 +119,6 @@ static struct task_struct *shuffler_task;
137static struct task_struct *stutter_task; 119static struct task_struct *stutter_task;
138static struct task_struct *fqs_task; 120static struct task_struct *fqs_task;
139static struct task_struct *boost_tasks[NR_CPUS]; 121static struct task_struct *boost_tasks[NR_CPUS];
140static struct task_struct *shutdown_task;
141#ifdef CONFIG_HOTPLUG_CPU
142static struct task_struct *onoff_task;
143#endif /* #ifdef CONFIG_HOTPLUG_CPU */
144static struct task_struct *stall_task;
145static struct task_struct **barrier_cbs_tasks;
146static struct task_struct *barrier_task;
147 122
148#define RCU_TORTURE_PIPE_LEN 10 123#define RCU_TORTURE_PIPE_LEN 10
149 124
@@ -169,24 +144,11 @@ static atomic_t n_rcu_torture_alloc_fail;
169static atomic_t n_rcu_torture_free; 144static atomic_t n_rcu_torture_free;
170static atomic_t n_rcu_torture_mberror; 145static atomic_t n_rcu_torture_mberror;
171static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
172static long n_rcu_torture_barrier_error;
173static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
174static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
175static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
176static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
177static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
178static long n_offline_attempts;
179static long n_offline_successes;
180static unsigned long sum_offline;
181static int min_offline = -1;
182static int max_offline;
183static long n_online_attempts;
184static long n_online_successes;
185static unsigned long sum_online;
186static int min_online = -1;
187static int max_online;
188static long n_barrier_attempts;
189static long n_barrier_successes;
190static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
191static cpumask_var_t shuffle_tmp_mask; 153static cpumask_var_t shuffle_tmp_mask;
192 154
@@ -198,8 +160,6 @@ static int stutter_pause_test;
198#define RCUTORTURE_RUNNABLE_INIT 0 160#define RCUTORTURE_RUNNABLE_INIT 0
199#endif 161#endif
200int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
201module_param(rcutorture_runnable, int, 0444);
202MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
203 163
204#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
205#define rcu_can_boost() 1 165#define rcu_can_boost() 1
@@ -207,15 +167,9 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 167#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 169
210static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
213 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
214static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
215static bool barrier_phase; /* Test phase. */
216static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
217static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
218static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
219 173
220/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ 174/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
221 175
@@ -228,9 +182,6 @@ static int fullstop = FULLSTOP_RMMOD;
228 */ 182 */
229static DEFINE_MUTEX(fullstop_mutex); 183static DEFINE_MUTEX(fullstop_mutex);
230 184
231/* Forward reference. */
232static void rcu_torture_cleanup(void);
233
234/* 185/*
235 * Detect and respond to a system shutdown. 186 * Detect and respond to a system shutdown.
236 */ 187 */
@@ -242,7 +193,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
242 if (fullstop == FULLSTOP_DONTSTOP) 193 if (fullstop == FULLSTOP_DONTSTOP)
243 fullstop = FULLSTOP_SHUTDOWN; 194 fullstop = FULLSTOP_SHUTDOWN;
244 else 195 else
245 pr_warn(/* but going down anyway, so... */ 196 printk(KERN_WARNING /* but going down anyway, so... */
246 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 197 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
247 mutex_unlock(&fullstop_mutex); 198 mutex_unlock(&fullstop_mutex);
248 return NOTIFY_DONE; 199 return NOTIFY_DONE;
@@ -255,7 +206,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
255static void rcutorture_shutdown_absorb(char *title) 206static void rcutorture_shutdown_absorb(char *title)
256{ 207{
257 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 208 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
258 pr_notice( 209 printk(KERN_NOTICE
259 "rcutorture thread %s parking due to system shutdown\n", 210 "rcutorture thread %s parking due to system shutdown\n",
260 title); 211 title);
261 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); 212 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -339,13 +290,13 @@ rcu_stutter_wait(char *title)
339 290
340struct rcu_torture_ops { 291struct rcu_torture_ops {
341 void (*init)(void); 292 void (*init)(void);
293 void (*cleanup)(void);
342 int (*readlock)(void); 294 int (*readlock)(void);
343 void (*read_delay)(struct rcu_random_state *rrsp); 295 void (*read_delay)(struct rcu_random_state *rrsp);
344 void (*readunlock)(int idx); 296 void (*readunlock)(int idx);
345 int (*completed)(void); 297 int (*completed)(void);
346 void (*deferred_free)(struct rcu_torture *p); 298 void (*deferred_free)(struct rcu_torture *p);
347 void (*sync)(void); 299 void (*sync)(void);
348 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
349 void (*cb_barrier)(void); 300 void (*cb_barrier)(void);
350 void (*fqs)(void); 301 void (*fqs)(void);
351 int (*stats)(char *page); 302 int (*stats)(char *page);
@@ -413,9 +364,8 @@ rcu_torture_cb(struct rcu_head *p)
413 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 364 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
414 rp->rtort_mbtest = 0; 365 rp->rtort_mbtest = 0;
415 rcu_torture_free(rp); 366 rcu_torture_free(rp);
416 } else { 367 } else
417 cur_ops->deferred_free(rp); 368 cur_ops->deferred_free(rp);
418 }
419} 369}
420 370
421static int rcu_no_completed(void) 371static int rcu_no_completed(void)
@@ -430,13 +380,13 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
430 380
431static struct rcu_torture_ops rcu_ops = { 381static struct rcu_torture_ops rcu_ops = {
432 .init = NULL, 382 .init = NULL,
383 .cleanup = NULL,
433 .readlock = rcu_torture_read_lock, 384 .readlock = rcu_torture_read_lock,
434 .read_delay = rcu_read_delay, 385 .read_delay = rcu_read_delay,
435 .readunlock = rcu_torture_read_unlock, 386 .readunlock = rcu_torture_read_unlock,
436 .completed = rcu_torture_completed, 387 .completed = rcu_torture_completed,
437 .deferred_free = rcu_torture_deferred_free, 388 .deferred_free = rcu_torture_deferred_free,
438 .sync = synchronize_rcu, 389 .sync = synchronize_rcu,
439 .call = call_rcu,
440 .cb_barrier = rcu_barrier, 390 .cb_barrier = rcu_barrier,
441 .fqs = rcu_force_quiescent_state, 391 .fqs = rcu_force_quiescent_state,
442 .stats = NULL, 392 .stats = NULL,
@@ -473,13 +423,13 @@ static void rcu_sync_torture_init(void)
473 423
474static struct rcu_torture_ops rcu_sync_ops = { 424static struct rcu_torture_ops rcu_sync_ops = {
475 .init = rcu_sync_torture_init, 425 .init = rcu_sync_torture_init,
426 .cleanup = NULL,
476 .readlock = rcu_torture_read_lock, 427 .readlock = rcu_torture_read_lock,
477 .read_delay = rcu_read_delay, 428 .read_delay = rcu_read_delay,
478 .readunlock = rcu_torture_read_unlock, 429 .readunlock = rcu_torture_read_unlock,
479 .completed = rcu_torture_completed, 430 .completed = rcu_torture_completed,
480 .deferred_free = rcu_sync_torture_deferred_free, 431 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = synchronize_rcu, 432 .sync = synchronize_rcu,
482 .call = NULL,
483 .cb_barrier = NULL, 433 .cb_barrier = NULL,
484 .fqs = rcu_force_quiescent_state, 434 .fqs = rcu_force_quiescent_state,
485 .stats = NULL, 435 .stats = NULL,
@@ -490,13 +440,13 @@ static struct rcu_torture_ops rcu_sync_ops = {
490 440
491static struct rcu_torture_ops rcu_expedited_ops = { 441static struct rcu_torture_ops rcu_expedited_ops = {
492 .init = rcu_sync_torture_init, 442 .init = rcu_sync_torture_init,
443 .cleanup = NULL,
493 .readlock = rcu_torture_read_lock, 444 .readlock = rcu_torture_read_lock,
494 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 445 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
495 .readunlock = rcu_torture_read_unlock, 446 .readunlock = rcu_torture_read_unlock,
496 .completed = rcu_no_completed, 447 .completed = rcu_no_completed,
497 .deferred_free = rcu_sync_torture_deferred_free, 448 .deferred_free = rcu_sync_torture_deferred_free,
498 .sync = synchronize_rcu_expedited, 449 .sync = synchronize_rcu_expedited,
499 .call = NULL,
500 .cb_barrier = NULL, 450 .cb_barrier = NULL,
501 .fqs = rcu_force_quiescent_state, 451 .fqs = rcu_force_quiescent_state,
502 .stats = NULL, 452 .stats = NULL,
@@ -530,15 +480,39 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
530 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
531} 481}
532 482
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
533static struct rcu_torture_ops rcu_bh_ops = { 507static struct rcu_torture_ops rcu_bh_ops = {
534 .init = NULL, 508 .init = NULL,
509 .cleanup = NULL,
535 .readlock = rcu_bh_torture_read_lock, 510 .readlock = rcu_bh_torture_read_lock,
536 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 511 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
537 .readunlock = rcu_bh_torture_read_unlock, 512 .readunlock = rcu_bh_torture_read_unlock,
538 .completed = rcu_bh_torture_completed, 513 .completed = rcu_bh_torture_completed,
539 .deferred_free = rcu_bh_torture_deferred_free, 514 .deferred_free = rcu_bh_torture_deferred_free,
540 .sync = synchronize_rcu_bh, 515 .sync = rcu_bh_torture_synchronize,
541 .call = call_rcu_bh,
542 .cb_barrier = rcu_barrier_bh, 516 .cb_barrier = rcu_barrier_bh,
543 .fqs = rcu_bh_force_quiescent_state, 517 .fqs = rcu_bh_force_quiescent_state,
544 .stats = NULL, 518 .stats = NULL,
@@ -548,13 +522,13 @@ static struct rcu_torture_ops rcu_bh_ops = {
548 522
549static struct rcu_torture_ops rcu_bh_sync_ops = { 523static struct rcu_torture_ops rcu_bh_sync_ops = {
550 .init = rcu_sync_torture_init, 524 .init = rcu_sync_torture_init,
525 .cleanup = NULL,
551 .readlock = rcu_bh_torture_read_lock, 526 .readlock = rcu_bh_torture_read_lock,
552 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 527 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
553 .readunlock = rcu_bh_torture_read_unlock, 528 .readunlock = rcu_bh_torture_read_unlock,
554 .completed = rcu_bh_torture_completed, 529 .completed = rcu_bh_torture_completed,
555 .deferred_free = rcu_sync_torture_deferred_free, 530 .deferred_free = rcu_sync_torture_deferred_free,
556 .sync = synchronize_rcu_bh, 531 .sync = rcu_bh_torture_synchronize,
557 .call = NULL,
558 .cb_barrier = NULL, 532 .cb_barrier = NULL,
559 .fqs = rcu_bh_force_quiescent_state, 533 .fqs = rcu_bh_force_quiescent_state,
560 .stats = NULL, 534 .stats = NULL,
@@ -562,27 +536,23 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
562 .name = "rcu_bh_sync" 536 .name = "rcu_bh_sync"
563}; 537};
564 538
565static struct rcu_torture_ops rcu_bh_expedited_ops = {
566 .init = rcu_sync_torture_init,
567 .readlock = rcu_bh_torture_read_lock,
568 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
569 .readunlock = rcu_bh_torture_read_unlock,
570 .completed = rcu_bh_torture_completed,
571 .deferred_free = rcu_sync_torture_deferred_free,
572 .sync = synchronize_rcu_bh_expedited,
573 .call = NULL,
574 .cb_barrier = NULL,
575 .fqs = rcu_bh_force_quiescent_state,
576 .stats = NULL,
577 .irq_capable = 1,
578 .name = "rcu_bh_expedited"
579};
580
581/* 539/*
582 * Definitions for srcu torture testing. 540 * Definitions for srcu torture testing.
583 */ 541 */
584 542
585DEFINE_STATIC_SRCU(srcu_ctl); 543static struct srcu_struct srcu_ctl;
544
545static void srcu_torture_init(void)
546{
547 init_srcu_struct(&srcu_ctl);
548 rcu_sync_torture_init();
549}
550
551static void srcu_torture_cleanup(void)
552{
553 synchronize_srcu(&srcu_ctl);
554 cleanup_srcu_struct(&srcu_ctl);
555}
586 556
587static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 557static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
588{ 558{
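
The srcu_torture_init()/srcu_torture_cleanup() pair added above reflects the fact that, unlike the built-in RCU flavors, an srcu_struct carries per-CPU state that must be explicitly set up and torn down. A minimal usage sketch with illustrative names:

    static struct srcu_struct example_srcu;

    static int example_setup(void)
    {
            return init_srcu_struct(&example_srcu);   /* allocates per-CPU counters */
    }

    static void example_reader(void)
    {
            int idx = srcu_read_lock(&example_srcu);
            /* ... access SRCU-protected data here ... */
            srcu_read_unlock(&example_srcu, idx);
    }

    static void example_teardown(void)
    {
            synchronize_srcu(&example_srcu);     /* wait out current readers    */
            cleanup_srcu_struct(&example_srcu);  /* then free the per-CPU state */
    }
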
@@ -614,27 +584,11 @@ static int srcu_torture_completed(void)
614 return srcu_batches_completed(&srcu_ctl); 584 return srcu_batches_completed(&srcu_ctl);
615} 585}
616 586
617static void srcu_torture_deferred_free(struct rcu_torture *rp)
618{
619 call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
620}
621
622static void srcu_torture_synchronize(void) 587static void srcu_torture_synchronize(void)
623{ 588{
624 synchronize_srcu(&srcu_ctl); 589 synchronize_srcu(&srcu_ctl);
625} 590}
626 591
627static void srcu_torture_call(struct rcu_head *head,
628 void (*func)(struct rcu_head *head))
629{
630 call_srcu(&srcu_ctl, head, func);
631}
632
633static void srcu_torture_barrier(void)
634{
635 srcu_barrier(&srcu_ctl);
636}
637
638static int srcu_torture_stats(char *page) 592static int srcu_torture_stats(char *page)
639{ 593{
640 int cnt = 0; 594 int cnt = 0;
@@ -644,7 +598,7 @@ static int srcu_torture_stats(char *page)
644 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", 598 cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
645 torture_type, TORTURE_FLAG, idx); 599 torture_type, TORTURE_FLAG, idx);
646 for_each_possible_cpu(cpu) { 600 for_each_possible_cpu(cpu) {
647 cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, 601 cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu,
648 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 602 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
649 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 603 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
650 } 604 }
@@ -653,69 +607,17 @@ static int srcu_torture_stats(char *page)
653} 607}
654 608
655static struct rcu_torture_ops srcu_ops = { 609static struct rcu_torture_ops srcu_ops = {
656 .init = rcu_sync_torture_init, 610 .init = srcu_torture_init,
611 .cleanup = srcu_torture_cleanup,
657 .readlock = srcu_torture_read_lock, 612 .readlock = srcu_torture_read_lock,
658 .read_delay = srcu_read_delay, 613 .read_delay = srcu_read_delay,
659 .readunlock = srcu_torture_read_unlock, 614 .readunlock = srcu_torture_read_unlock,
660 .completed = srcu_torture_completed, 615 .completed = srcu_torture_completed,
661 .deferred_free = srcu_torture_deferred_free,
662 .sync = srcu_torture_synchronize,
663 .call = srcu_torture_call,
664 .cb_barrier = srcu_torture_barrier,
665 .stats = srcu_torture_stats,
666 .name = "srcu"
667};
668
669static struct rcu_torture_ops srcu_sync_ops = {
670 .init = rcu_sync_torture_init,
671 .readlock = srcu_torture_read_lock,
672 .read_delay = srcu_read_delay,
673 .readunlock = srcu_torture_read_unlock,
674 .completed = srcu_torture_completed,
675 .deferred_free = rcu_sync_torture_deferred_free,
676 .sync = srcu_torture_synchronize,
677 .call = NULL,
678 .cb_barrier = NULL,
679 .stats = srcu_torture_stats,
680 .name = "srcu_sync"
681};
682
683static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
684{
685 return srcu_read_lock_raw(&srcu_ctl);
686}
687
688static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
689{
690 srcu_read_unlock_raw(&srcu_ctl, idx);
691}
692
693static struct rcu_torture_ops srcu_raw_ops = {
694 .init = rcu_sync_torture_init,
695 .readlock = srcu_torture_read_lock_raw,
696 .read_delay = srcu_read_delay,
697 .readunlock = srcu_torture_read_unlock_raw,
698 .completed = srcu_torture_completed,
699 .deferred_free = srcu_torture_deferred_free,
700 .sync = srcu_torture_synchronize,
701 .call = NULL,
702 .cb_barrier = NULL,
703 .stats = srcu_torture_stats,
704 .name = "srcu_raw"
705};
706
707static struct rcu_torture_ops srcu_raw_sync_ops = {
708 .init = rcu_sync_torture_init,
709 .readlock = srcu_torture_read_lock_raw,
710 .read_delay = srcu_read_delay,
711 .readunlock = srcu_torture_read_unlock_raw,
712 .completed = srcu_torture_completed,
713 .deferred_free = rcu_sync_torture_deferred_free, 616 .deferred_free = rcu_sync_torture_deferred_free,
714 .sync = srcu_torture_synchronize, 617 .sync = srcu_torture_synchronize,
715 .call = NULL,
716 .cb_barrier = NULL, 618 .cb_barrier = NULL,
717 .stats = srcu_torture_stats, 619 .stats = srcu_torture_stats,
718 .name = "srcu_raw_sync" 620 .name = "srcu"
719}; 621};
720 622
721static void srcu_torture_synchronize_expedited(void) 623static void srcu_torture_synchronize_expedited(void)
@@ -724,14 +626,14 @@ static void srcu_torture_synchronize_expedited(void)
724} 626}
725 627
726static struct rcu_torture_ops srcu_expedited_ops = { 628static struct rcu_torture_ops srcu_expedited_ops = {
727 .init = rcu_sync_torture_init, 629 .init = srcu_torture_init,
630 .cleanup = srcu_torture_cleanup,
728 .readlock = srcu_torture_read_lock, 631 .readlock = srcu_torture_read_lock,
729 .read_delay = srcu_read_delay, 632 .read_delay = srcu_read_delay,
730 .readunlock = srcu_torture_read_unlock, 633 .readunlock = srcu_torture_read_unlock,
731 .completed = srcu_torture_completed, 634 .completed = srcu_torture_completed,
732 .deferred_free = rcu_sync_torture_deferred_free, 635 .deferred_free = rcu_sync_torture_deferred_free,
733 .sync = srcu_torture_synchronize_expedited, 636 .sync = srcu_torture_synchronize_expedited,
734 .call = NULL,
735 .cb_barrier = NULL, 637 .cb_barrier = NULL,
736 .stats = srcu_torture_stats, 638 .stats = srcu_torture_stats,
737 .name = "srcu_expedited" 639 .name = "srcu_expedited"
@@ -757,14 +659,20 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
757 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
758} 660}
759 661
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
760static struct rcu_torture_ops sched_ops = { 667static struct rcu_torture_ops sched_ops = {
761 .init = rcu_sync_torture_init, 668 .init = rcu_sync_torture_init,
669 .cleanup = NULL,
762 .readlock = sched_torture_read_lock, 670 .readlock = sched_torture_read_lock,
763 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 671 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
764 .readunlock = sched_torture_read_unlock, 672 .readunlock = sched_torture_read_unlock,
765 .completed = rcu_no_completed, 673 .completed = rcu_no_completed,
766 .deferred_free = rcu_sched_torture_deferred_free, 674 .deferred_free = rcu_sched_torture_deferred_free,
767 .sync = synchronize_sched, 675 .sync = sched_torture_synchronize,
768 .cb_barrier = rcu_barrier_sched, 676 .cb_barrier = rcu_barrier_sched,
769 .fqs = rcu_sched_force_quiescent_state, 677 .fqs = rcu_sched_force_quiescent_state,
770 .stats = NULL, 678 .stats = NULL,
@@ -774,12 +682,13 @@ static struct rcu_torture_ops sched_ops = {
774 682
775static struct rcu_torture_ops sched_sync_ops = { 683static struct rcu_torture_ops sched_sync_ops = {
776 .init = rcu_sync_torture_init, 684 .init = rcu_sync_torture_init,
685 .cleanup = NULL,
777 .readlock = sched_torture_read_lock, 686 .readlock = sched_torture_read_lock,
778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 687 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
779 .readunlock = sched_torture_read_unlock, 688 .readunlock = sched_torture_read_unlock,
780 .completed = rcu_no_completed, 689 .completed = rcu_no_completed,
781 .deferred_free = rcu_sync_torture_deferred_free, 690 .deferred_free = rcu_sync_torture_deferred_free,
782 .sync = synchronize_sched, 691 .sync = sched_torture_synchronize,
783 .cb_barrier = NULL, 692 .cb_barrier = NULL,
784 .fqs = rcu_sched_force_quiescent_state, 693 .fqs = rcu_sched_force_quiescent_state,
785 .stats = NULL, 694 .stats = NULL,
@@ -788,6 +697,7 @@ static struct rcu_torture_ops sched_sync_ops = {
788 697
789static struct rcu_torture_ops sched_expedited_ops = { 698static struct rcu_torture_ops sched_expedited_ops = {
790 .init = rcu_sync_torture_init, 699 .init = rcu_sync_torture_init,
700 .cleanup = NULL,
791 .readlock = sched_torture_read_lock, 701 .readlock = sched_torture_read_lock,
792 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 702 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
793 .readunlock = sched_torture_read_unlock, 703 .readunlock = sched_torture_read_unlock,
@@ -844,7 +754,7 @@ static int rcu_torture_boost(void *arg)
844 do { 754 do {
845 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
846 oldstarttime = boost_starttime; 756 oldstarttime = boost_starttime;
847 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 757 while (jiffies - oldstarttime > ULONG_MAX / 2) {
848 schedule_timeout_uninterruptible(1); 758 schedule_timeout_uninterruptible(1);
849 rcu_stutter_wait("rcu_torture_boost"); 759 rcu_stutter_wait("rcu_torture_boost");
850 if (kthread_should_stop() || 760 if (kthread_should_stop() ||
@@ -855,7 +765,7 @@ static int rcu_torture_boost(void *arg)
855 /* Do one boost-test interval. */ 765 /* Do one boost-test interval. */
856 endtime = oldstarttime + test_boost_duration * HZ; 766 endtime = oldstarttime + test_boost_duration * HZ;
857 call_rcu_time = jiffies; 767 call_rcu_time = jiffies;
858 while (ULONG_CMP_LT(jiffies, endtime)) { 768 while (jiffies - endtime > ULONG_MAX / 2) {
859 /* If we don't have a callback in flight, post one. */ 769 /* If we don't have a callback in flight, post one. */
860 if (!rbi.inflight) { 770 if (!rbi.inflight) {
861 smp_mb(); /* RCU core before ->inflight = 1. */ 771 smp_mb(); /* RCU core before ->inflight = 1. */
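
Both spellings of the loop condition in the two hunks above are wrap-safe "has jiffies reached the target yet?" tests: ULONG_CMP_LT(a, b) and the open-coded (a - b > ULONG_MAX / 2) treat the unsigned difference as a signed distance, so a jiffies wrap cannot turn a short wait into a nearly infinite one. The conventional way to write the same thing uses the helpers from <linux/jiffies.h>; a sketch with an illustrative name:

    #include <linux/jiffies.h>

    /* Keep waiting until "deadline" has been reached, wrap-safely. */
    static inline bool example_not_yet(unsigned long deadline)
    {
            /* Essentially the same test as ULONG_CMP_LT(jiffies, deadline). */
            return time_before(jiffies, deadline);
    }
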
@@ -882,8 +792,7 @@ static int rcu_torture_boost(void *arg)
882 * interval. Besides, we are running at RT priority, 792 * interval. Besides, we are running at RT priority,
883 * so delays should be relatively rare. 793 * so delays should be relatively rare.
884 */ 794 */
885 while (oldstarttime == boost_starttime && 795 while (oldstarttime == boost_starttime) {
886 !kthread_should_stop()) {
887 if (mutex_trylock(&boost_mutex)) { 796 if (mutex_trylock(&boost_mutex)) {
888 boost_starttime = jiffies + 797 boost_starttime = jiffies +
889 test_boost_interval * HZ; 798 test_boost_interval * HZ;
@@ -900,11 +809,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
900 809
901 /* Clean up and exit. */ 810 /* Clean up and exit. */
902 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
903 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
904 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
905 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
906 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
907 destroy_rcu_head_on_stack(&rbi.rcu);
908 return 0; 817 return 0;
909} 818}
910 819
@@ -922,13 +831,11 @@ rcu_torture_fqs(void *arg)
922 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
923 do { 832 do {
924 fqs_resume_time = jiffies + fqs_stutter * HZ; 833 fqs_resume_time = jiffies + fqs_stutter * HZ;
925 while (ULONG_CMP_LT(jiffies, fqs_resume_time) && 834 while (jiffies - fqs_resume_time > LONG_MAX) {
926 !kthread_should_stop()) {
927 schedule_timeout_interruptible(1); 835 schedule_timeout_interruptible(1);
928 } 836 }
929 fqs_burst_remaining = fqs_duration; 837 fqs_burst_remaining = fqs_duration;
930 while (fqs_burst_remaining > 0 && 838 while (fqs_burst_remaining > 0) {
931 !kthread_should_stop()) {
932 cur_ops->fqs(); 839 cur_ops->fqs();
933 udelay(fqs_holdoff); 840 udelay(fqs_holdoff);
934 fqs_burst_remaining -= fqs_holdoff; 841 fqs_burst_remaining -= fqs_holdoff;
@@ -1005,11 +912,7 @@ rcu_torture_fakewriter(void *arg)
1005 do { 912 do {
1006 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 913 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
1007 udelay(rcu_random(&rand) & 0x3ff); 914 udelay(rcu_random(&rand) & 0x3ff);
1008 if (cur_ops->cb_barrier != NULL && 915 cur_ops->sync();
1009 rcu_random(&rand) % (nfakewriters * 8) == 0)
1010 cur_ops->cb_barrier();
1011 else
1012 cur_ops->sync();
1013 rcu_stutter_wait("rcu_torture_fakewriter"); 916 rcu_stutter_wait("rcu_torture_fakewriter");
1014 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 917 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1015 918
@@ -1020,18 +923,6 @@ rcu_torture_fakewriter(void *arg)
1020 return 0; 923 return 0;
1021} 924}
1022 925
1023void rcutorture_trace_dump(void)
1024{
1025 static atomic_t beenhere = ATOMIC_INIT(0);
1026
1027 if (atomic_read(&beenhere))
1028 return;
1029 if (atomic_xchg(&beenhere, 1) != 0)
1030 return;
1031 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1032 ftrace_dump(DUMP_ALL);
1033}
1034
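The removed rcutorture_trace_dump() is a run-at-most-once helper: the plain atomic_read() is a cheap early exit and the atomic_xchg() settles any race between concurrent callers. Userspace sketch of the same guard, with printf standing in for ftrace_dump() (dump_once is an invented name):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;

static void dump_once(void)
{
        if (atomic_load(&beenhere))
                return;                         /* cheap fast path */
        if (atomic_exchange(&beenhere, 1))
                return;                         /* another caller won the race */
        printf("dumping trace buffer\n");       /* stands in for ftrace_dump() */
}

int main(void)
{
        dump_once();
        dump_once();                            /* second call is a no-op */
        return 0;
}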
1035/* 926/*
1036 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 927 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
1037 * incrementing the corresponding element of the pipeline array. The 928 * incrementing the corresponding element of the pipeline array. The
@@ -1058,7 +949,6 @@ static void rcu_torture_timer(unsigned long unused)
1058 cur_ops->readunlock(idx); 949 cur_ops->readunlock(idx);
1059 return; 950 return;
1060 } 951 }
1061 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1062 if (p->rtort_mbtest == 0) 952 if (p->rtort_mbtest == 0)
1063 atomic_inc(&n_rcu_torture_mberror); 953 atomic_inc(&n_rcu_torture_mberror);
1064 spin_lock(&rand_lock); 954 spin_lock(&rand_lock);
@@ -1071,8 +961,6 @@ static void rcu_torture_timer(unsigned long unused)
1071 /* Should not happen, but... */ 961 /* Should not happen, but... */
1072 pipe_count = RCU_TORTURE_PIPE_LEN; 962 pipe_count = RCU_TORTURE_PIPE_LEN;
1073 } 963 }
1074 if (pipe_count > 1)
1075 rcutorture_trace_dump();
1076 __this_cpu_inc(rcu_torture_count[pipe_count]); 964 __this_cpu_inc(rcu_torture_count[pipe_count]);
1077 completed = cur_ops->completed() - completed; 965 completed = cur_ops->completed() - completed;
1078 if (completed > RCU_TORTURE_PIPE_LEN) { 966 if (completed > RCU_TORTURE_PIPE_LEN) {
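The pipe_count logic in this hunk (and in the reader hunk below) is a small histogram: each read is binned by how many grace periods the protected object has survived, and anything beyond RCU_TORTURE_PIPE_LEN is clamped into a should-not-happen bucket. A single-array userspace stand-in for the per-CPU rcu_torture_count[] (constant name reused for clarity, everything else invented):

#include <stdio.h>

#define RCU_TORTURE_PIPE_LEN 10

static unsigned long pipe_hist[RCU_TORTURE_PIPE_LEN + 1];

static void note_read(int age_in_grace_periods)
{
        if (age_in_grace_periods > RCU_TORTURE_PIPE_LEN)
                age_in_grace_periods = RCU_TORTURE_PIPE_LEN;    /* overflow bucket */
        pipe_hist[age_in_grace_periods]++;
}

int main(void)
{
        note_read(0);
        note_read(3);
        note_read(42);          /* clamped; would indicate a too-long grace period */
        for (int i = 0; i <= RCU_TORTURE_PIPE_LEN; i++)
                printf("%lu ", pipe_hist[i]);
        printf("\n");
        return 0;
}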
@@ -1122,7 +1010,6 @@ rcu_torture_reader(void *arg)
1122 schedule_timeout_interruptible(HZ); 1010 schedule_timeout_interruptible(HZ);
1123 continue; 1011 continue;
1124 } 1012 }
1125 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1126 if (p->rtort_mbtest == 0) 1013 if (p->rtort_mbtest == 0)
1127 atomic_inc(&n_rcu_torture_mberror); 1014 atomic_inc(&n_rcu_torture_mberror);
1128 cur_ops->read_delay(&rand); 1015 cur_ops->read_delay(&rand);
@@ -1132,8 +1019,6 @@ rcu_torture_reader(void *arg)
1132 /* Should not happen, but... */ 1019 /* Should not happen, but... */
1133 pipe_count = RCU_TORTURE_PIPE_LEN; 1020 pipe_count = RCU_TORTURE_PIPE_LEN;
1134 } 1021 }
1135 if (pipe_count > 1)
1136 rcutorture_trace_dump();
1137 __this_cpu_inc(rcu_torture_count[pipe_count]); 1022 __this_cpu_inc(rcu_torture_count[pipe_count]);
1138 completed = cur_ops->completed() - completed; 1023 completed = cur_ops->completed() - completed;
1139 if (completed > RCU_TORTURE_PIPE_LEN) { 1024 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1179,39 +1064,28 @@ rcu_torture_printk(char *page)
1179 } 1064 }
1180 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1065 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1181 cnt += sprintf(&page[cnt], 1066 cnt += sprintf(&page[cnt],
1182 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1068 "rtmbe: %d rtbke: %ld rtbre: %ld "
1069 "rtbf: %ld rtb: %ld nt: %ld",
1183 rcu_torture_current, 1070 rcu_torture_current,
1184 rcu_torture_current_version, 1071 rcu_torture_current_version,
1185 list_empty(&rcu_torture_freelist), 1072 list_empty(&rcu_torture_freelist),
1186 atomic_read(&n_rcu_torture_alloc), 1073 atomic_read(&n_rcu_torture_alloc),
1187 atomic_read(&n_rcu_torture_alloc_fail), 1074 atomic_read(&n_rcu_torture_alloc_fail),
1188 atomic_read(&n_rcu_torture_free)); 1075 atomic_read(&n_rcu_torture_free),
1189 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
1190 atomic_read(&n_rcu_torture_mberror), 1076 atomic_read(&n_rcu_torture_mberror),
1191 n_rcu_torture_boost_ktrerror, 1077 n_rcu_torture_boost_ktrerror,
1192 n_rcu_torture_boost_rterror); 1078 n_rcu_torture_boost_rterror,
1193 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
1194 n_rcu_torture_boost_failure, 1079 n_rcu_torture_boost_failure,
1195 n_rcu_torture_boosts, 1080 n_rcu_torture_boosts,
1196 n_rcu_torture_timers); 1081 n_rcu_torture_timers);
1197 cnt += sprintf(&page[cnt],
1198 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1199 n_online_successes, n_online_attempts,
1200 n_offline_successes, n_offline_attempts,
1201 min_online, max_online,
1202 min_offline, max_offline,
1203 sum_online, sum_offline, HZ);
1204 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1205 n_barrier_successes,
1206 n_barrier_attempts,
1207 n_rcu_torture_barrier_error);
1208 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1209 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1082 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1210 n_rcu_torture_barrier_error != 0 ||
1211 n_rcu_torture_boost_ktrerror != 0 || 1083 n_rcu_torture_boost_ktrerror != 0 ||
1212 n_rcu_torture_boost_rterror != 0 || 1084 n_rcu_torture_boost_rterror != 0 ||
1213 n_rcu_torture_boost_failure != 0 || 1085 n_rcu_torture_boost_failure != 0)
1214 i > 1) { 1086 cnt += sprintf(&page[cnt], " !!!");
1087 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
1088 if (i > 1) {
1215 cnt += sprintf(&page[cnt], "!!! "); 1089 cnt += sprintf(&page[cnt], "!!! ");
1216 atomic_inc(&n_rcu_torture_error); 1090 atomic_inc(&n_rcu_torture_error);
1217 WARN_ON_ONCE(1); 1091 WARN_ON_ONCE(1);
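rcu_torture_printk() assembles its report with the usual cnt += sprintf(&page[cnt], ...) accumulation; this hunk only changes how many calls the text is split across and which counters are included. Trivial userspace sketch of the pattern (buffer size and values made up):

#include <stdio.h>

int main(void)
{
        char page[256];
        int cnt = 0;

        cnt += sprintf(&page[cnt], "rtc: %p ver: %lu ", (void *)0, 42UL);
        cnt += sprintf(&page[cnt], "tfle: %d rta: %d", 1, 7);
        printf("%d bytes: %s\n", cnt, page);
        return 0;
}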
@@ -1249,7 +1123,7 @@ rcu_torture_stats_print(void)
1249 int cnt; 1123 int cnt;
1250 1124
1251 cnt = rcu_torture_printk(printk_buf); 1125 cnt = rcu_torture_printk(printk_buf);
1252 pr_alert("%s", printk_buf); 1126 printk(KERN_ALERT "%s", printk_buf);
1253} 1127}
1254 1128
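The pr_alert() versus printk(KERN_ALERT ...) churn in these hunks is mechanical; pr_alert() is defined in <linux/printk.h> essentially as:

#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)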
1255/* 1129/*
@@ -1362,24 +1236,18 @@ rcu_torture_stutter(void *arg)
1362static inline void 1236static inline void
1363rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1237rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1364{ 1238{
1365 pr_alert("%s" TORTURE_FLAG 1239 printk(KERN_ALERT "%s" TORTURE_FLAG
1366 "--- %s: nreaders=%d nfakewriters=%d " 1240 "--- %s: nreaders=%d nfakewriters=%d "
1367 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1241 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1368 "shuffle_interval=%d stutter=%d irqreader=%d " 1242 "shuffle_interval=%d stutter=%d irqreader=%d "
1369 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1243 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1370 "test_boost=%d/%d test_boost_interval=%d " 1244 "test_boost=%d/%d test_boost_interval=%d "
1371 "test_boost_duration=%d shutdown_secs=%d " 1245 "test_boost_duration=%d\n",
1372 "stall_cpu=%d stall_cpu_holdoff=%d " 1246 torture_type, tag, nrealreaders, nfakewriters,
1373 "n_barrier_cbs=%d " 1247 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1374 "onoff_interval=%d onoff_holdoff=%d\n", 1248 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1375 torture_type, tag, nrealreaders, nfakewriters, 1249 test_boost, cur_ops->can_boost,
1376 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1250 test_boost_interval, test_boost_duration);
1377 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1378 test_boost, cur_ops->can_boost,
1379 test_boost_interval, test_boost_duration, shutdown_secs,
1380 stall_cpu, stall_cpu_holdoff,
1381 n_barrier_cbs,
1382 onoff_interval, onoff_holdoff);
1383} 1251}
1384 1252
1385static struct notifier_block rcutorture_shutdown_nb = { 1253static struct notifier_block rcutorture_shutdown_nb = {
@@ -1400,7 +1268,6 @@ static void rcutorture_booster_cleanup(int cpu)
1400 1268
1401 /* This must be outside of the mutex, otherwise deadlock! */ 1269 /* This must be outside of the mutex, otherwise deadlock! */
1402 kthread_stop(t); 1270 kthread_stop(t);
1403 boost_tasks[cpu] = NULL;
1404} 1271}
1405 1272
1406static int rcutorture_booster_init(int cpu) 1273static int rcutorture_booster_init(int cpu)
@@ -1413,9 +1280,8 @@ static int rcutorture_booster_init(int cpu)
1413 /* Don't allow time recalculation while creating a new task. */ 1280 /* Don't allow time recalculation while creating a new task. */
1414 mutex_lock(&boost_mutex); 1281 mutex_lock(&boost_mutex);
1415 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1416 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, 1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
1417 cpu_to_node(cpu), 1284 "rcu_torture_boost");
1418 "rcu_torture_boost");
1419 if (IS_ERR(boost_tasks[cpu])) { 1285 if (IS_ERR(boost_tasks[cpu])) {
1420 retval = PTR_ERR(boost_tasks[cpu]); 1286 retval = PTR_ERR(boost_tasks[cpu]);
1421 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1430,376 +1296,6 @@ static int rcutorture_booster_init(int cpu)
1430 return 0; 1296 return 0;
1431} 1297}
1432 1298
1433/*
1434 * Cause the rcutorture test to shutdown the system after the test has
1435 * run for the time specified by the shutdown_secs module parameter.
1436 */
1437static int
1438rcu_torture_shutdown(void *arg)
1439{
1440 long delta;
1441 unsigned long jiffies_snap;
1442
1443 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1444 jiffies_snap = ACCESS_ONCE(jiffies);
1445 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1446 !kthread_should_stop()) {
1447 delta = shutdown_time - jiffies_snap;
1448 if (verbose)
1449 pr_alert("%s" TORTURE_FLAG
1450 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1451 torture_type, delta);
1452 schedule_timeout_interruptible(delta);
1453 jiffies_snap = ACCESS_ONCE(jiffies);
1454 }
1455 if (kthread_should_stop()) {
1456 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1457 return 0;
1458 }
1459
1460 /* OK, shut down the system. */
1461
1462 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1463 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1464 rcu_torture_cleanup(); /* Get the success/failure message. */
1465 kernel_power_off(); /* Shut down the system. */
1466 return 0;
1467}
1468
1469#ifdef CONFIG_HOTPLUG_CPU
1470
1471/*
1472 * Execute random CPU-hotplug operations at the interval specified
1473 * by the onoff_interval.
1474 */
1475static int __cpuinit
1476rcu_torture_onoff(void *arg)
1477{
1478 int cpu;
1479 unsigned long delta;
1480 int maxcpu = -1;
1481 DEFINE_RCU_RANDOM(rand);
1482 int ret;
1483 unsigned long starttime;
1484
1485 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1486 for_each_online_cpu(cpu)
1487 maxcpu = cpu;
1488 WARN_ON(maxcpu < 0);
1489 if (onoff_holdoff > 0) {
1490 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1491 schedule_timeout_interruptible(onoff_holdoff * HZ);
1492 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1493 }
1494 while (!kthread_should_stop()) {
1495 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1496 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1497 if (verbose)
1498 pr_alert("%s" TORTURE_FLAG
1499 "rcu_torture_onoff task: offlining %d\n",
1500 torture_type, cpu);
1501 starttime = jiffies;
1502 n_offline_attempts++;
1503 ret = cpu_down(cpu);
1504 if (ret) {
1505 if (verbose)
1506 pr_alert("%s" TORTURE_FLAG
1507 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1508 torture_type, cpu, ret);
1509 } else {
1510 if (verbose)
1511 pr_alert("%s" TORTURE_FLAG
1512 "rcu_torture_onoff task: offlined %d\n",
1513 torture_type, cpu);
1514 n_offline_successes++;
1515 delta = jiffies - starttime;
1516 sum_offline += delta;
1517 if (min_offline < 0) {
1518 min_offline = delta;
1519 max_offline = delta;
1520 }
1521 if (min_offline > delta)
1522 min_offline = delta;
1523 if (max_offline < delta)
1524 max_offline = delta;
1525 }
1526 } else if (cpu_is_hotpluggable(cpu)) {
1527 if (verbose)
1528 pr_alert("%s" TORTURE_FLAG
1529 "rcu_torture_onoff task: onlining %d\n",
1530 torture_type, cpu);
1531 starttime = jiffies;
1532 n_online_attempts++;
1533 if (cpu_up(cpu) == 0) {
1534 if (verbose)
1535 pr_alert("%s" TORTURE_FLAG
1536 "rcu_torture_onoff task: onlined %d\n",
1537 torture_type, cpu);
1538 n_online_successes++;
1539 delta = jiffies - starttime;
1540 sum_online += delta;
1541 if (min_online < 0) {
1542 min_online = delta;
1543 max_online = delta;
1544 }
1545 if (min_online > delta)
1546 min_online = delta;
1547 if (max_online < delta)
1548 max_online = delta;
1549 }
1550 }
1551 schedule_timeout_interruptible(onoff_interval * HZ);
1552 }
1553 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1554 return 0;
1555}
1556
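The duration bookkeeping in the removed rcu_torture_onoff() seeds the minimum negative so that the first sample initializes both minimum and maximum before the usual comparisons run. Compact userspace stand-in for that logic (names shortened, values made up):

#include <stdio.h>

static long min_off = -1, max_off, sum_off, n_off;

static void record(long delta)
{
        n_off++;
        sum_off += delta;
        if (min_off < 0)
                min_off = max_off = delta;      /* first sample */
        if (delta < min_off)
                min_off = delta;
        if (delta > max_off)
                max_off = delta;
}

int main(void)
{
        record(30);
        record(12);
        record(55);
        printf("min %ld max %ld avg %ld\n", min_off, max_off, sum_off / n_off);
        return 0;
}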
1557static int __cpuinit
1558rcu_torture_onoff_init(void)
1559{
1560 int ret;
1561
1562 if (onoff_interval <= 0)
1563 return 0;
1564 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1565 if (IS_ERR(onoff_task)) {
1566 ret = PTR_ERR(onoff_task);
1567 onoff_task = NULL;
1568 return ret;
1569 }
1570 return 0;
1571}
1572
1573static void rcu_torture_onoff_cleanup(void)
1574{
1575 if (onoff_task == NULL)
1576 return;
1577 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1578 kthread_stop(onoff_task);
1579 onoff_task = NULL;
1580}
1581
1582#else /* #ifdef CONFIG_HOTPLUG_CPU */
1583
1584static int
1585rcu_torture_onoff_init(void)
1586{
1587 return 0;
1588}
1589
1590static void rcu_torture_onoff_cleanup(void)
1591{
1592}
1593
1594#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1595
1596/*
1597 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1598 * induces a CPU stall for the time specified by stall_cpu.
1599 */
1600static int __cpuinit rcu_torture_stall(void *args)
1601{
1602 unsigned long stop_at;
1603
1604 VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
1605 if (stall_cpu_holdoff > 0) {
1606 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
1607 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1608 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
1609 }
1610 if (!kthread_should_stop()) {
1611 stop_at = get_seconds() + stall_cpu;
1612 /* RCU CPU stall is expected behavior in following code. */
1613 pr_alert("rcu_torture_stall start.\n");
1614 rcu_read_lock();
1615 preempt_disable();
1616 while (ULONG_CMP_LT(get_seconds(), stop_at))
1617 continue; /* Induce RCU CPU stall warning. */
1618 preempt_enable();
1619 rcu_read_unlock();
1620 pr_alert("rcu_torture_stall end.\n");
1621 }
1622 rcutorture_shutdown_absorb("rcu_torture_stall");
1623 while (!kthread_should_stop())
1624 schedule_timeout_interruptible(10 * HZ);
1625 return 0;
1626}
1627
1628/* Spawn CPU-stall kthread, if stall_cpu specified. */
1629static int __init rcu_torture_stall_init(void)
1630{
1631 int ret;
1632
1633 if (stall_cpu <= 0)
1634 return 0;
1635 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
1636 if (IS_ERR(stall_task)) {
1637 ret = PTR_ERR(stall_task);
1638 stall_task = NULL;
1639 return ret;
1640 }
1641 return 0;
1642}
1643
1644/* Clean up after the CPU-stall kthread, if one was spawned. */
1645static void rcu_torture_stall_cleanup(void)
1646{
1647 if (stall_task == NULL)
1648 return;
1649 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1650 kthread_stop(stall_task);
1651 stall_task = NULL;
1652}
1653
1654/* Callback function for RCU barrier testing. */
1655void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1656{
1657 atomic_inc(&barrier_cbs_invoked);
1658}
1659
1660/* kthread function to register callbacks used to test RCU barriers. */
1661static int rcu_torture_barrier_cbs(void *arg)
1662{
1663 long myid = (long)arg;
1664 bool lastphase = 0;
1665 struct rcu_head rcu;
1666
1667 init_rcu_head_on_stack(&rcu);
1668 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
1669 set_user_nice(current, 19);
1670 do {
1671 wait_event(barrier_cbs_wq[myid],
1672 barrier_phase != lastphase ||
1673 kthread_should_stop() ||
1674 fullstop != FULLSTOP_DONTSTOP);
1675 lastphase = barrier_phase;
1676 smp_mb(); /* ensure barrier_phase load before ->call(). */
1677 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1678 break;
1679 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1680 if (atomic_dec_and_test(&barrier_cbs_count))
1681 wake_up(&barrier_wq);
1682 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1683 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1684 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1685 while (!kthread_should_stop())
1686 schedule_timeout_interruptible(1);
1687 cur_ops->cb_barrier();
1688 destroy_rcu_head_on_stack(&rcu);
1689 return 0;
1690}
1691
1692/* kthread function to drive and coordinate RCU barrier testing. */
1693static int rcu_torture_barrier(void *arg)
1694{
1695 int i;
1696
1697 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
1698 do {
1699 atomic_set(&barrier_cbs_invoked, 0);
1700 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1701 smp_mb(); /* Ensure barrier_phase after prior assignments. */
1702 barrier_phase = !barrier_phase;
1703 for (i = 0; i < n_barrier_cbs; i++)
1704 wake_up(&barrier_cbs_wq[i]);
1705 wait_event(barrier_wq,
1706 atomic_read(&barrier_cbs_count) == 0 ||
1707 kthread_should_stop() ||
1708 fullstop != FULLSTOP_DONTSTOP);
1709 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1710 break;
1711 n_barrier_attempts++;
1712 cur_ops->cb_barrier();
1713 if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
1714 n_rcu_torture_barrier_error++;
1715 WARN_ON_ONCE(1);
1716 }
1717 n_barrier_successes++;
1718 schedule_timeout_interruptible(HZ / 10);
1719 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1720 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1721 rcutorture_shutdown_absorb("rcu_torture_barrier");
1722 while (!kthread_should_stop())
1723 schedule_timeout_interruptible(1);
1724 return 0;
1725}
1726
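The removed barrier test posts exactly one callback from each rcu_torture_barrier_cbs kthread, calls cur_ops->cb_barrier() (rcu_barrier() for the rcu flavor), and then checks that every posted callback has already been invoked. A toy single-threaded userspace analogue of that invariant (toy_call_rcu/toy_rcu_barrier are invented names; the real rcu_barrier() waits for callbacks on all CPUs):

#include <assert.h>
#include <stdio.h>

#define MAX_CBS 16

typedef void (*cb_t)(void);
static cb_t queue[MAX_CBS];
static int queued, invoked;

static void toy_call_rcu(cb_t cb)
{
        queue[queued++] = cb;
}

static void toy_rcu_barrier(void)
{
        for (int i = 0; i < queued; i++)        /* drain the single queue */
                queue[i]();
}

static void counting_cb(void)
{
        invoked++;
}

int main(void)
{
        int n_barrier_cbs = 4;

        for (int i = 0; i < n_barrier_cbs; i++)
                toy_call_rcu(counting_cb);
        toy_rcu_barrier();
        assert(invoked == n_barrier_cbs);       /* the invariant being tested */
        printf("invoked %d of %d callbacks\n", invoked, n_barrier_cbs);
        return 0;
}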
1727/* Initialize RCU barrier testing. */
1728static int rcu_torture_barrier_init(void)
1729{
1730 int i;
1731 int ret;
1732
1733 if (n_barrier_cbs == 0)
1734 return 0;
1735 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1736 pr_alert("%s" TORTURE_FLAG
1737 " Call or barrier ops missing for %s,\n",
1738 torture_type, cur_ops->name);
1739 pr_alert("%s" TORTURE_FLAG
1740 " RCU barrier testing omitted from run.\n",
1741 torture_type);
1742 return 0;
1743 }
1744 atomic_set(&barrier_cbs_count, 0);
1745 atomic_set(&barrier_cbs_invoked, 0);
1746 barrier_cbs_tasks =
1747 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
1748 GFP_KERNEL);
1749 barrier_cbs_wq =
1750 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1751 GFP_KERNEL);
1752 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
1753 return -ENOMEM;
1754 for (i = 0; i < n_barrier_cbs; i++) {
1755 init_waitqueue_head(&barrier_cbs_wq[i]);
1756 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
1757 (void *)(long)i,
1758 "rcu_torture_barrier_cbs");
1759 if (IS_ERR(barrier_cbs_tasks[i])) {
1760 ret = PTR_ERR(barrier_cbs_tasks[i]);
1761 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1762 barrier_cbs_tasks[i] = NULL;
1763 return ret;
1764 }
1765 }
1766 barrier_task = kthread_run(rcu_torture_barrier, NULL,
1767 "rcu_torture_barrier");
1768 if (IS_ERR(barrier_task)) {
1769 ret = PTR_ERR(barrier_task);
1770 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1771 barrier_task = NULL;
1772 }
1773 return 0;
1774}
1775
1776/* Clean up after RCU barrier testing. */
1777static void rcu_torture_barrier_cleanup(void)
1778{
1779 int i;
1780
1781 if (barrier_task != NULL) {
1782 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1783 kthread_stop(barrier_task);
1784 barrier_task = NULL;
1785 }
1786 if (barrier_cbs_tasks != NULL) {
1787 for (i = 0; i < n_barrier_cbs; i++) {
1788 if (barrier_cbs_tasks[i] != NULL) {
1789 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
1790 kthread_stop(barrier_cbs_tasks[i]);
1791 barrier_cbs_tasks[i] = NULL;
1792 }
1793 }
1794 kfree(barrier_cbs_tasks);
1795 barrier_cbs_tasks = NULL;
1796 }
1797 if (barrier_cbs_wq != NULL) {
1798 kfree(barrier_cbs_wq);
1799 barrier_cbs_wq = NULL;
1800 }
1801}
1802
1803static int rcutorture_cpu_notify(struct notifier_block *self, 1299static int rcutorture_cpu_notify(struct notifier_block *self,
1804 unsigned long action, void *hcpu) 1300 unsigned long action, void *hcpu)
1805{ 1301{
@@ -1831,7 +1327,7 @@ rcu_torture_cleanup(void)
1831 mutex_lock(&fullstop_mutex); 1327 mutex_lock(&fullstop_mutex);
1832 rcutorture_record_test_transition(); 1328 rcutorture_record_test_transition();
1833 if (fullstop == FULLSTOP_SHUTDOWN) { 1329 if (fullstop == FULLSTOP_SHUTDOWN) {
1834 pr_warn(/* but going down anyway, so... */ 1330 printk(KERN_WARNING /* but going down anyway, so... */
1835 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1331 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1836 mutex_unlock(&fullstop_mutex); 1332 mutex_unlock(&fullstop_mutex);
1837 schedule_timeout_uninterruptible(10); 1333 schedule_timeout_uninterruptible(10);
@@ -1842,8 +1338,6 @@ rcu_torture_cleanup(void)
1842 fullstop = FULLSTOP_RMMOD; 1338 fullstop = FULLSTOP_RMMOD;
1843 mutex_unlock(&fullstop_mutex); 1339 mutex_unlock(&fullstop_mutex);
1844 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1340 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1845 rcu_torture_barrier_cleanup();
1846 rcu_torture_stall_cleanup();
1847 if (stutter_task) { 1341 if (stutter_task) {
1848 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1342 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1849 kthread_stop(stutter_task); 1343 kthread_stop(stutter_task);
@@ -1906,12 +1400,6 @@ rcu_torture_cleanup(void)
1906 for_each_possible_cpu(i) 1400 for_each_possible_cpu(i)
1907 rcutorture_booster_cleanup(i); 1401 rcutorture_booster_cleanup(i);
1908 } 1402 }
1909 if (shutdown_task != NULL) {
1910 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1911 kthread_stop(shutdown_task);
1912 }
1913 shutdown_task = NULL;
1914 rcu_torture_onoff_cleanup();
1915 1403
1916 /* Wait for all RCU callbacks to fire. */ 1404 /* Wait for all RCU callbacks to fire. */
1917 1405
@@ -1920,12 +1408,10 @@ rcu_torture_cleanup(void)
1920 1408
1921 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1409 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1922 1410
1923 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1411 if (cur_ops->cleanup)
1412 cur_ops->cleanup();
1413 if (atomic_read(&n_rcu_torture_error))
1924 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1414 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1925 else if (n_online_successes != n_online_attempts ||
1926 n_offline_successes != n_offline_attempts)
1927 rcu_torture_print_module_parms(cur_ops,
1928 "End of test: RCU_HOTPLUG");
1929 else 1415 else
1930 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1416 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1931} 1417}
@@ -1936,12 +1422,10 @@ rcu_torture_init(void)
1936 int i; 1422 int i;
1937 int cpu; 1423 int cpu;
1938 int firsterr = 0; 1424 int firsterr = 0;
1939 int retval;
1940 static struct rcu_torture_ops *torture_ops[] = 1425 static struct rcu_torture_ops *torture_ops[] =
1941 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1942 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1427 &rcu_bh_ops, &rcu_bh_sync_ops,
1943 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1428 &srcu_ops, &srcu_expedited_ops,
1944 &srcu_raw_ops, &srcu_raw_sync_ops,
1945 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1946 1430
1947 mutex_lock(&fullstop_mutex); 1431 mutex_lock(&fullstop_mutex);
@@ -1953,17 +1437,18 @@ rcu_torture_init(void)
1953 break; 1437 break;
1954 } 1438 }
1955 if (i == ARRAY_SIZE(torture_ops)) { 1439 if (i == ARRAY_SIZE(torture_ops)) {
1956 pr_alert("rcu-torture: invalid torture type: \"%s\"\n", 1440 printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
1957 torture_type); 1441 torture_type);
1958 pr_alert("rcu-torture types:"); 1442 printk(KERN_ALERT "rcu-torture types:");
1959 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1443 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1960 pr_alert(" %s", torture_ops[i]->name); 1444 printk(KERN_ALERT " %s", torture_ops[i]->name);
1961 pr_alert("\n"); 1445 printk(KERN_ALERT "\n");
1962 mutex_unlock(&fullstop_mutex); 1446 mutex_unlock(&fullstop_mutex);
1963 return -EINVAL; 1447 return -EINVAL;
1964 } 1448 }
1965 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1449 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1966 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1450 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1451 "fqs_duration, fqs disabled.\n");
1967 fqs_duration = 0; 1452 fqs_duration = 0;
1968 } 1453 }
1969 if (cur_ops->init) 1454 if (cur_ops->init)
@@ -1994,7 +1479,6 @@ rcu_torture_init(void)
1994 atomic_set(&n_rcu_torture_free, 0); 1479 atomic_set(&n_rcu_torture_free, 0);
1995 atomic_set(&n_rcu_torture_mberror, 0); 1480 atomic_set(&n_rcu_torture_mberror, 0);
1996 atomic_set(&n_rcu_torture_error, 0); 1481 atomic_set(&n_rcu_torture_error, 0);
1997 n_rcu_torture_barrier_error = 0;
1998 n_rcu_torture_boost_ktrerror = 0; 1482 n_rcu_torture_boost_ktrerror = 0;
1999 n_rcu_torture_boost_rterror = 0; 1483 n_rcu_torture_boost_rterror = 0;
2000 n_rcu_torture_boost_failure = 0; 1484 n_rcu_torture_boost_failure = 0;
@@ -2011,15 +1495,14 @@ rcu_torture_init(void)
2011 /* Start up the kthreads. */ 1495 /* Start up the kthreads. */
2012 1496
2013 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1497 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
2014 writer_task = kthread_create(rcu_torture_writer, NULL, 1498 writer_task = kthread_run(rcu_torture_writer, NULL,
2015 "rcu_torture_writer"); 1499 "rcu_torture_writer");
2016 if (IS_ERR(writer_task)) { 1500 if (IS_ERR(writer_task)) {
2017 firsterr = PTR_ERR(writer_task); 1501 firsterr = PTR_ERR(writer_task);
2018 VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); 1502 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
2019 writer_task = NULL; 1503 writer_task = NULL;
2020 goto unwind; 1504 goto unwind;
2021 } 1505 }
2022 wake_up_process(writer_task);
2023 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1506 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
2024 GFP_KERNEL); 1507 GFP_KERNEL);
2025 if (fakewriter_tasks == NULL) { 1508 if (fakewriter_tasks == NULL) {
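The left-hand column creates the writer task and then wakes it explicitly; the right-hand column uses kthread_run(), which the kernel defines as kthread_create() plus wake_up_process(), roughly:

#define kthread_run(threadfn, data, namefmt, ...)                           \
({                                                                          \
        struct task_struct *__k                                             \
                = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__);  \
        if (!IS_ERR(__k))                                                   \
                wake_up_process(__k);                                       \
        __k;                                                                \
})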
@@ -2119,6 +1602,7 @@ rcu_torture_init(void)
2119 test_boost_duration = 2; 1602 test_boost_duration = 2;
2120 if ((test_boost == 1 && cur_ops->can_boost) || 1603 if ((test_boost == 1 && cur_ops->can_boost) ||
2121 test_boost == 2) { 1604 test_boost == 2) {
1605 int retval;
2122 1606
2123 boost_starttime = jiffies + test_boost_interval * HZ; 1607 boost_starttime = jiffies + test_boost_interval * HZ;
2124 register_cpu_notifier(&rcutorture_cpu_nb); 1608 register_cpu_notifier(&rcutorture_cpu_nb);
@@ -2132,34 +1616,7 @@ rcu_torture_init(void)
2132 } 1616 }
2133 } 1617 }
2134 } 1618 }
2135 if (shutdown_secs > 0) {
2136 shutdown_time = jiffies + shutdown_secs * HZ;
2137 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2138 "rcu_torture_shutdown");
2139 if (IS_ERR(shutdown_task)) {
2140 firsterr = PTR_ERR(shutdown_task);
2141 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2142 shutdown_task = NULL;
2143 goto unwind;
2144 }
2145 wake_up_process(shutdown_task);
2146 }
2147 i = rcu_torture_onoff_init();
2148 if (i != 0) {
2149 firsterr = i;
2150 goto unwind;
2151 }
2152 register_reboot_notifier(&rcutorture_shutdown_nb); 1619 register_reboot_notifier(&rcutorture_shutdown_nb);
2153 i = rcu_torture_stall_init();
2154 if (i != 0) {
2155 firsterr = i;
2156 goto unwind;
2157 }
2158 retval = rcu_torture_barrier_init();
2159 if (retval != 0) {
2160 firsterr = retval;
2161 goto unwind;
2162 }
2163 rcutorture_record_test_transition(); 1620 rcutorture_record_test_transition();
2164 mutex_unlock(&fullstop_mutex); 1621 mutex_unlock(&fullstop_mutex);
2165 return 0; 1622 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614..ba06207b1dd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <linux/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/export.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/percpu.h> 44#include <linux/percpu.h>
@@ -50,56 +50,39 @@
50#include <linux/wait.h> 50#include <linux/wait.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h>
54#include <linux/stop_machine.h>
55#include <linux/random.h>
56 53
57#include "rcutree.h" 54#include "rcutree.h"
58#include <trace/events/rcu.h>
59
60#include "rcu.h"
61 55
62/* Data structures. */ 56/* Data structures. */
63 57
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 59
66 60#define RCU_STATE_INITIALIZER(structname) { \
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 61 .level = { &structname.node[0] }, \
68 .level = { &sname##_state.node[0] }, \ 62 .levelcnt = { \
69 .call = cr, \ 63 NUM_RCU_LVL_0, /* root of hierarchy. */ \
70 .fqs_state = RCU_GP_IDLE, \ 64 NUM_RCU_LVL_1, \
71 .gpnum = 0UL - 300UL, \ 65 NUM_RCU_LVL_2, \
72 .completed = 0UL - 300UL, \ 66 NUM_RCU_LVL_3, \
73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 67 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 68 }, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 69 .signaled = RCU_GP_IDLE, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 70 .gpnum = -300, \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 71 .completed = -300, \
78 .name = #sname, \ 72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
79} 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
80 74 .n_force_qs = 0, \
81struct rcu_state rcu_sched_state = 75 .n_force_qs_ngp = 0, \
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 76 .name = #structname, \
77}
78
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 81
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 84
88static struct rcu_state *rcu_state; 85static struct rcu_state *rcu_state;
89LIST_HEAD(rcu_struct_flavors);
90
91/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
92static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
93module_param(rcu_fanout_leaf, int, 0444);
94int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
95static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
96 NUM_RCU_LVL_0,
97 NUM_RCU_LVL_1,
98 NUM_RCU_LVL_2,
99 NUM_RCU_LVL_3,
100 NUM_RCU_LVL_4,
101};
102int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
103 86
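The left-hand RCU_STATE_INITIALIZER(sname, cr) builds the per-flavor symbol by token pasting (sname##_state) and stringizes the flavor name with #sname, while the right-hand variant is simply handed the full structure name. A toy sketch of the pasting/stringizing pattern (struct and macro names invented):

#include <stdio.h>

struct state {
        const char *name;
        int level;
};

#define DEFINE_STATE(sname, lvl) \
        struct state sname##_state = { .name = #sname, .level = (lvl) }

DEFINE_STATE(rcu_sched, 0);     /* defines rcu_sched_state, .name = "rcu_sched" */
DEFINE_STATE(rcu_bh, 0);        /* defines rcu_bh_state,    .name = "rcu_bh"    */

int main(void)
{
        printf("%s %s\n", rcu_sched_state.name, rcu_bh_state.name);
        return 0;
}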
104/* 87/*
105 * The rcu_scheduler_active variable transitions from zero to one just 88 * The rcu_scheduler_active variable transitions from zero to one just
@@ -135,15 +118,18 @@ static int rcu_scheduler_fully_active __read_mostly;
135 */ 118 */
136static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
137DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
138DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
139DEFINE_PER_CPU(char, rcu_cpu_has_work); 123DEFINE_PER_CPU(char, rcu_cpu_has_work);
140 124
141#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
142 126
143static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
144static void invoke_rcu_core(void); 128static void invoke_rcu_core(void);
145static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
146 130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
147/* 133/*
148 * Track the rcutorture test sequence number and the update version 134 * Track the rcutorture test sequence number and the update version
149 * number within a given test. The rcutorture_testseq is incremented 135 * number within a given test. The rcutorture_testseq is incremented
@@ -170,67 +156,55 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
170 * Note a quiescent state. Because we do not need to know 156 * Note a quiescent state. Because we do not need to know
171 * how many quiescent states passed, just if there was at least 157 * how many quiescent states passed, just if there was at least
172 * one since the start of the grace period, this just sets a flag. 158 * one since the start of the grace period, this just sets a flag.
173 * The caller must have disabled preemption.
174 */ 159 */
175void rcu_sched_qs(int cpu) 160void rcu_sched_qs(int cpu)
176{ 161{
177 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
178 163
179 if (rdp->passed_quiesce == 0) 164 rdp->passed_quiesc_completed = rdp->gpnum - 1;
180 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 165 barrier();
181 rdp->passed_quiesce = 1; 166 rdp->passed_quiesc = 1;
182} 167}
183 168
184void rcu_bh_qs(int cpu) 169void rcu_bh_qs(int cpu)
185{ 170{
186 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
187 172
188 if (rdp->passed_quiesce == 0) 173 rdp->passed_quiesc_completed = rdp->gpnum - 1;
189 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 174 barrier();
190 rdp->passed_quiesce = 1; 175 rdp->passed_quiesc = 1;
191} 176}
192 177
193/* 178/*
194 * Note a context switch. This is a quiescent state for RCU-sched, 179 * Note a context switch. This is a quiescent state for RCU-sched,
195 * and requires special handling for preemptible RCU. 180 * and requires special handling for preemptible RCU.
196 * The caller must have disabled preemption.
197 */ 181 */
198void rcu_note_context_switch(int cpu) 182void rcu_note_context_switch(int cpu)
199{ 183{
200 trace_rcu_utilization("Start context switch");
201 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
202 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
203 trace_rcu_utilization("End context switch");
204} 186}
205EXPORT_SYMBOL_GPL(rcu_note_context_switch); 187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
206 188
189#ifdef CONFIG_NO_HZ
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 191 .dynticks_nesting = 1,
209 .dynticks = ATOMIC_INIT(1), 192 .dynticks = ATOMIC_INIT(1),
210}; 193};
194#endif /* #ifdef CONFIG_NO_HZ */
211 195
212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 196static int blimit = 10; /* Maximum callbacks per softirq. */
213static long qhimark = 10000; /* If this many pending, ignore blimit. */ 197static int qhimark = 10000; /* If this many pending, ignore blimit. */
214static long qlowmark = 100; /* Once only this many pending, use blimit. */ 198static int qlowmark = 100; /* Once only this many pending, use blimit. */
215
216module_param(blimit, long, 0444);
217module_param(qhimark, long, 0444);
218module_param(qlowmark, long, 0444);
219 199
220int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 200module_param(blimit, int, 0);
221int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 201module_param(qhimark, int, 0);
202module_param(qlowmark, int, 0);
222 203
204int rcu_cpu_stall_suppress __read_mostly;
223module_param(rcu_cpu_stall_suppress, int, 0644); 205module_param(rcu_cpu_stall_suppress, int, 0644);
224module_param(rcu_cpu_stall_timeout, int, 0644);
225 206
226static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
227static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
228
229module_param(jiffies_till_first_fqs, ulong, 0644);
230module_param(jiffies_till_next_fqs, ulong, 0644);
231
232static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
233static void force_quiescent_state(struct rcu_state *rsp);
234static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
235 209
236/* 210/*
@@ -256,7 +230,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
256 */ 230 */
257void rcu_bh_force_quiescent_state(void) 231void rcu_bh_force_quiescent_state(void)
258{ 232{
259 force_quiescent_state(&rcu_bh_state); 233 force_quiescent_state(&rcu_bh_state, 0);
260} 234}
261EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
262 236
@@ -290,7 +264,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress);
290 */ 264 */
291void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
292{ 266{
293 force_quiescent_state(&rcu_sched_state); 267 force_quiescent_state(&rcu_sched_state, 0);
294} 268}
295EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 269EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
296 270
@@ -300,8 +274,7 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
300static int 274static int
301cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 275cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
302{ 276{
303 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && 277 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
304 rdp->nxttail[RCU_DONE_TAIL] != NULL;
305} 278}
306 279
307/* 280/*
@@ -310,12 +283,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
310static int 283static int
311cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 284cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
312{ 285{
313 struct rcu_head **ntp; 286 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
314
315 ntp = rdp->nxttail[RCU_DONE_TAIL +
316 (ACCESS_ONCE(rsp->completed) != rdp->completed)];
317 return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
318 !rcu_gp_in_progress(rsp);
319} 287}
320 288
321/* 289/*
@@ -326,294 +294,103 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
326 return &rsp->node[0]; 294 return &rsp->node[0];
327} 295}
328 296
329/* 297#ifdef CONFIG_SMP
330 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
331 *
332 * If the new value of the ->dynticks_nesting counter now is zero,
333 * we really have entered idle, and must do the appropriate accounting.
334 * The caller must have disabled interrupts.
335 */
336static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
337 bool user)
338{
339 trace_rcu_dyntick("Start", oldval, 0);
340 if (!user && !is_idle_task(current)) {
341 struct task_struct *idle = idle_task(smp_processor_id());
342
343 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
344 ftrace_dump(DUMP_ORIG);
345 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
346 current->pid, current->comm,
347 idle->pid, idle->comm); /* must be idle task! */
348 }
349 rcu_prepare_for_idle(smp_processor_id());
350 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
351 smp_mb__before_atomic_inc(); /* See above. */
352 atomic_inc(&rdtp->dynticks);
353 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
354 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
355
356 /*
357 * It is illegal to enter an extended quiescent state while
358 * in an RCU read-side critical section.
359 */
360 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
361 "Illegal idle entry in RCU read-side critical section.");
362 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
363 "Illegal idle entry in RCU-bh read-side critical section.");
364 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
365 "Illegal idle entry in RCU-sched read-side critical section.");
366}
367 298
368/* 299/*
369 * Enter an RCU extended quiescent state, which can be either the 300 * If the specified CPU is offline, tell the caller that it is in
370 * idle loop or adaptive-tickless usermode execution. 301 * a quiescent state. Otherwise, whack it with a reschedule IPI.
302 * Grace periods can end up waiting on an offline CPU when that
303 * CPU is in the process of coming online -- it will be added to the
304 * rcu_node bitmasks before it actually makes it online. The same thing
305 * can happen while a CPU is in the process of coming online. Because this
306 * race is quite rare, we check for it after detecting that the grace
307 * period has been delayed rather than checking each and every CPU
308 * each and every time we start a new grace period.
371 */ 309 */
372static void rcu_eqs_enter(bool user) 310static int rcu_implicit_offline_qs(struct rcu_data *rdp)
373{ 311{
374 long long oldval; 312 /*
375 struct rcu_dynticks *rdtp; 313 * If the CPU is offline, it is in a quiescent state. We can
314 * trust its state not to change because interrupts are disabled.
315 */
316 if (cpu_is_offline(rdp->cpu)) {
317 rdp->offline_fqs++;
318 return 1;
319 }
376 320
377 rdtp = &__get_cpu_var(rcu_dynticks); 321 /* If preemptible RCU, no point in sending reschedule IPI. */
378 oldval = rdtp->dynticks_nesting; 322 if (rdp->preemptible)
379 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 323 return 0;
380 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 324
381 rdtp->dynticks_nesting = 0; 325 /* The CPU is online, so send it a reschedule IPI. */
326 if (rdp->cpu != smp_processor_id())
327 smp_send_reschedule(rdp->cpu);
382 else 328 else
383 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 329 set_need_resched();
384 rcu_eqs_enter_common(rdtp, oldval, user); 330 rdp->resched_ipi++;
331 return 0;
385} 332}
386 333
387/** 334#endif /* #ifdef CONFIG_SMP */
388 * rcu_idle_enter - inform RCU that current CPU is entering idle
389 *
390 * Enter idle mode, in other words, -leave- the mode in which RCU
391 * read-side critical sections can occur. (Though RCU read-side
392 * critical sections can occur in irq handlers in idle, a possibility
393 * handled by irq_enter() and irq_exit().)
394 *
395 * We crowbar the ->dynticks_nesting field to zero to allow for
396 * the possibility of usermode upcalls having messed up our count
397 * of interrupt nesting level during the prior busy period.
398 */
399void rcu_idle_enter(void)
400{
401 unsigned long flags;
402 335
403 local_irq_save(flags); 336#ifdef CONFIG_NO_HZ
404 rcu_eqs_enter(false);
405 local_irq_restore(flags);
406}
407EXPORT_SYMBOL_GPL(rcu_idle_enter);
408
409#ifdef CONFIG_RCU_USER_QS
410/**
411 * rcu_user_enter - inform RCU that we are resuming userspace.
412 *
413 * Enter RCU idle mode right before resuming userspace. No use of RCU
414 * is permitted between this call and rcu_user_exit(). This way the
415 * CPU doesn't need to maintain the tick for RCU maintenance purposes
416 * when the CPU runs in userspace.
417 */
418void rcu_user_enter(void)
419{
420 rcu_eqs_enter(1);
421}
422 337
423/** 338/**
424 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace 339 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
425 * after the current irq returns.
426 * 340 *
427 * This is similar to rcu_user_enter() but in the context of a non-nesting 341 * Enter nohz mode, in other words, -leave- the mode in which RCU
428 * irq. After this call, RCU enters into idle mode when the interrupt 342 * read-side critical sections can occur. (Though RCU read-side
429 * returns. 343 * critical sections can occur in irq handlers in nohz mode, a possibility
344 * handled by rcu_irq_enter() and rcu_irq_exit()).
430 */ 345 */
431void rcu_user_enter_after_irq(void) 346void rcu_enter_nohz(void)
432{ 347{
433 unsigned long flags; 348 unsigned long flags;
434 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
435 350
436 local_irq_save(flags); 351 local_irq_save(flags);
437 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
438 /* Ensure this irq is interrupting a non-idle RCU state. */ 353 if (--rdtp->dynticks_nesting) {
439 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); 354 local_irq_restore(flags);
440 rdtp->dynticks_nesting = 1; 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
441 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
442} 370}
443#endif /* CONFIG_RCU_USER_QS */
444 371
445/** 372/*
446 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 373 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
447 *
448 * Exit from an interrupt handler, which might possibly result in entering
449 * idle mode, in other words, leaving the mode in which read-side critical
450 * sections can occur.
451 *
452 * This code assumes that the idle loop never does anything that might
453 * result in unbalanced calls to irq_enter() and irq_exit(). If your
454 * architecture violates this assumption, RCU will give you what you
455 * deserve, good and hard. But very infrequently and irreproducibly.
456 *
457 * Use things like work queues to work around this limitation.
458 * 374 *
459 * You have been warned. 375 * Exit nohz mode, in other words, -enter- the mode in which RCU
376 * read-side critical sections normally occur.
460 */ 377 */
461void rcu_irq_exit(void) 378void rcu_exit_nohz(void)
462{ 379{
463 unsigned long flags; 380 unsigned long flags;
464 long long oldval;
465 struct rcu_dynticks *rdtp; 381 struct rcu_dynticks *rdtp;
466 382
467 local_irq_save(flags); 383 local_irq_save(flags);
468 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
469 oldval = rdtp->dynticks_nesting; 385 if (rdtp->dynticks_nesting++) {
470 rdtp->dynticks_nesting--; 386 local_irq_restore(flags);
471 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 387 return;
472 if (rdtp->dynticks_nesting) 388 }
473 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
474 else
475 rcu_eqs_enter_common(rdtp, oldval, true);
476 local_irq_restore(flags);
477}
478
479/*
480 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
481 *
482 * If the new value of the ->dynticks_nesting counter was previously zero,
483 * we really have exited idle, and must do the appropriate accounting.
484 * The caller must have disabled interrupts.
485 */
486static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
487 int user)
488{
489 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
490 atomic_inc(&rdtp->dynticks); 390 atomic_inc(&rdtp->dynticks);
491 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
492 smp_mb__after_atomic_inc(); /* See above. */ 392 smp_mb__after_atomic_inc(); /* See above. */
493 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
494 rcu_cleanup_after_idle(smp_processor_id());
495 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
496 if (!user && !is_idle_task(current)) {
497 struct task_struct *idle = idle_task(smp_processor_id());
498
499 trace_rcu_dyntick("Error on exit: not idle task",
500 oldval, rdtp->dynticks_nesting);
501 ftrace_dump(DUMP_ORIG);
502 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
503 current->pid, current->comm,
504 idle->pid, idle->comm); /* must be idle task! */
505 }
506}
507
508/*
509 * Exit an RCU extended quiescent state, which can be either the
510 * idle loop or adaptive-tickless usermode execution.
511 */
512static void rcu_eqs_exit(bool user)
513{
514 struct rcu_dynticks *rdtp;
515 long long oldval;
516
517 rdtp = &__get_cpu_var(rcu_dynticks);
518 oldval = rdtp->dynticks_nesting;
519 WARN_ON_ONCE(oldval < 0);
520 if (oldval & DYNTICK_TASK_NEST_MASK)
521 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
522 else
523 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
524 rcu_eqs_exit_common(rdtp, oldval, user);
525}
526
527/**
528 * rcu_idle_exit - inform RCU that current CPU is leaving idle
529 *
530 * Exit idle mode, in other words, -enter- the mode in which RCU
531 * read-side critical sections can occur.
532 *
533 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
534 * allow for the possibility of usermode upcalls messing up our count
535 * of interrupt nesting level during the busy period that is just
536 * now starting.
537 */
538void rcu_idle_exit(void)
539{
540 unsigned long flags;
541
542 local_irq_save(flags);
543 rcu_eqs_exit(false);
544 local_irq_restore(flags);
545}
546EXPORT_SYMBOL_GPL(rcu_idle_exit);
547
548#ifdef CONFIG_RCU_USER_QS
549/**
550 * rcu_user_exit - inform RCU that we are exiting userspace.
551 *
552 * Exit RCU idle mode while entering the kernel because it can
553 * run a RCU read side critical section anytime.
554 */
555void rcu_user_exit(void)
556{
557 rcu_eqs_exit(1);
558}
559
560/**
561 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
562 * idle mode after the current non-nesting irq returns.
563 *
564 * This is similar to rcu_user_exit() but in the context of an irq.
565 * This is called when the irq has interrupted a userspace RCU idle mode
566 * context. When the current non-nesting interrupt returns after this call,
567 * the CPU won't restore the RCU idle mode.
568 */
569void rcu_user_exit_after_irq(void)
570{
571 unsigned long flags;
572 struct rcu_dynticks *rdtp;
573
574 local_irq_save(flags);
575 rdtp = &__get_cpu_var(rcu_dynticks);
576 /* Ensure we are interrupting an RCU idle mode. */
577 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
578 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
579 local_irq_restore(flags);
580}
581#endif /* CONFIG_RCU_USER_QS */
582
583/**
584 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
585 *
586 * Enter an interrupt handler, which might possibly result in exiting
587 * idle mode, in other words, entering the mode in which read-side critical
588 * sections can occur.
589 *
590 * Note that the Linux kernel is fully capable of entering an interrupt
591 * handler that it never exits, for example when doing upcalls to
592 * user mode! This code assumes that the idle loop never does upcalls to
593 * user mode. If your architecture does do upcalls from the idle loop (or
594 * does anything else that results in unbalanced calls to the irq_enter()
595 * and irq_exit() functions), RCU will give you what you deserve, good
596 * and hard. But very infrequently and irreproducibly.
597 *
598 * Use things like work queues to work around this limitation.
599 *
600 * You have been warned.
601 */
602void rcu_irq_enter(void)
603{
604 unsigned long flags;
605 struct rcu_dynticks *rdtp;
606 long long oldval;
607
608 local_irq_save(flags);
609 rdtp = &__get_cpu_var(rcu_dynticks);
610 oldval = rdtp->dynticks_nesting;
611 rdtp->dynticks_nesting++;
612 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
613 if (oldval)
614 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
615 else
616 rcu_eqs_exit_common(rdtp, oldval, true);
617 local_irq_restore(flags); 394 local_irq_restore(flags);
618} 395}
619 396
@@ -661,77 +438,30 @@ void rcu_nmi_exit(void)
661} 438}
662 439
663/** 440/**
664 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 441 * rcu_irq_enter - inform RCU of entry to hard irq context
665 *
666 * If the current CPU is in its idle loop and is neither in an interrupt
667 * or NMI handler, return true.
668 */
669int rcu_is_cpu_idle(void)
670{
671 int ret;
672
673 preempt_disable();
674 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
675 preempt_enable();
676 return ret;
677}
678EXPORT_SYMBOL(rcu_is_cpu_idle);
679
680#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
681
682/*
683 * Is the current CPU online? Disable preemption to avoid false positives
684 * that could otherwise happen due to the current CPU number being sampled,
685 * this task being preempted, its old CPU being taken offline, resuming
686 * on some other CPU, then determining that its old CPU is now offline.
687 * It is OK to use RCU on an offline processor during initial boot, hence
688 * the check for rcu_scheduler_fully_active. Note also that it is OK
689 * for a CPU coming online to use RCU for one jiffy prior to marking itself
690 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
691 * offline to continue to use RCU for one jiffy after marking itself
692 * offline in the cpu_online_mask. This leniency is necessary given the
693 * non-atomic nature of the online and offline processing, for example,
694 * the fact that a CPU enters the scheduler after completing the CPU_DYING
695 * notifiers.
696 *
697 * This is also why RCU internally marks CPUs online during the
698 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
699 * 442 *
700 * Disable checking if in an NMI handler because we cannot safely report 443 * If the CPU was idle with dynamic ticks active, this updates the
701 * errors from NMI handlers anyway. 444 * rdtp->dynticks to let the RCU handling know that the CPU is active.
702 */ 445 */
703bool rcu_lockdep_current_cpu_online(void) 446void rcu_irq_enter(void)
704{ 447{
705 struct rcu_data *rdp; 448 rcu_exit_nohz();
706 struct rcu_node *rnp;
707 bool ret;
708
709 if (in_nmi())
710 return 1;
711 preempt_disable();
712 rdp = &__get_cpu_var(rcu_sched_data);
713 rnp = rdp->mynode;
714 ret = (rdp->grpmask & rnp->qsmaskinit) ||
715 !rcu_scheduler_fully_active;
716 preempt_enable();
717 return ret;
718} 449}
719EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
720
721#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
722 450
723/** 451/**
724 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 452 * rcu_irq_exit - inform RCU of exit from hard irq context
725 * 453 *
726 * If the current CPU is idle or running at a first-level (not nested) 454 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
727 * interrupt from idle, return true. The caller must have at least 455 * to let the RCU handling be aware that the CPU is going back to idle
728 * disabled preemption. 456 * with no ticks.
729 */ 457 */
730int rcu_is_cpu_rrupt_from_idle(void) 458void rcu_irq_exit(void)
731{ 459{
732 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 460 rcu_enter_nohz();
733} 461}
734 462
463#ifdef CONFIG_SMP
464
735/* 465/*
736 * Snapshot the specified CPU's dynticks counter so that we can later 466 * Snapshot the specified CPU's dynticks counter so that we can later
737 * credit them with an implicit quiescent state. Return 1 if this CPU 467 * credit them with an implicit quiescent state. Return 1 if this CPU
@@ -740,22 +470,22 @@ int rcu_is_cpu_rrupt_from_idle(void)
740static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
741{ 471{
742 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
743 return (rdp->dynticks_snap & 0x1) == 0; 473 return 0;
744} 474}
745 475
746/* 476/*
747 * Return true if the specified CPU has passed through a quiescent 477 * Return true if the specified CPU has passed through a quiescent
748 * state by virtue of being in or having passed through a dynticks 478 * state by virtue of being in or having passed through a dynticks
749 * idle state since the last call to dyntick_save_progress_counter() 479 * idle state since the last call to dyntick_save_progress_counter()
750 * for this same CPU, or by virtue of having been offline. 480 * for this same CPU.
751 */ 481 */
752static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
753{ 483{
754 unsigned int curr; 484 unsigned long curr;
755 unsigned int snap; 485 unsigned long snap;
756 486
757 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
758 snap = (unsigned int)rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
759 489
760 /* 490 /*
761 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -765,79 +495,41 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
765 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
766 * of the current RCU grace period. 496 * of the current RCU grace period.
767 */ 497 */
768 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
769 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
770 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
771 return 1; 500 return 1;
772 } 501 }
773 502
774 /* 503 /* Go check for the CPU being offline. */
775 * Check for the CPU being offline, but only if the grace period 504 return rcu_implicit_offline_qs(rdp);
776 * is old enough. We don't need to worry about the CPU changing
777 * state: If we see it offline even once, it has been through a
778 * quiescent state.
779 *
780 * The reason for insisting that the grace period be at least
781 * one jiffy old is that CPUs that are not quite online and that
782 * have just gone offline can still execute RCU read-side critical
783 * sections.
784 */
785 if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
786 return 0; /* Grace period is not old enough. */
787 barrier();
788 if (cpu_is_offline(rdp->cpu)) {
789 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
790 rdp->offline_fqs++;
791 return 1;
792 }
793 return 0;
794} 505}
795 506
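The test above reads: credit the remote CPU with a quiescent state if its dynticks counter is currently even (it is idle right now) or has advanced by at least two since dyntick_save_progress_counter() sampled it (it passed through idle in the meantime). A self-contained user-space sketch of just that comparison follows; the wrap-safe macro is defined locally as a stand-in for the kernel's UINT_CMP_GE()/ULONG_CMP_GE() helpers.

#include <assert.h>
#include <limits.h>
#include <stdbool.h>

/* Wrap-safe "a >= b" for unsigned counters; local to this sketch. */
#define CMP_GE(a, b)    (UINT_MAX / 2 >= (unsigned int)((a) - (b)))

static bool dynticks_qs(unsigned int curr, unsigned int snap)
{
        return (curr & 0x1) == 0 || CMP_GE(curr, snap + 2);
}

int main(void)
{
        assert(dynticks_qs(4, 4));              /* even: idle right now */
        assert(!dynticks_qs(5, 5));             /* odd, unchanged: no QS yet */
        assert(dynticks_qs(7, 5));              /* advanced by 2: went idle */
        assert(dynticks_qs(1, UINT_MAX));       /* still correct across wrap */
        return 0;
}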
796static int jiffies_till_stall_check(void) 507#endif /* #ifdef CONFIG_SMP */
797{
798 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
799 508
800 /* 509#else /* #ifdef CONFIG_NO_HZ */
801 * Limit check must be consistent with the Kconfig limits
802 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
803 */
804 if (till_stall_check < 3) {
805 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
806 till_stall_check = 3;
807 } else if (till_stall_check > 300) {
808 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
809 till_stall_check = 300;
810 }
811 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
812}
813 510
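jiffies_till_stall_check() above clamps the module-settable timeout into the 3..300 second window implied by the CONFIG_RCU_CPU_STALL_TIMEOUT limits and then converts it to jiffies. A user-space sketch of the same clamping (it omits writing the clamped value back to the module parameter), with HZ and the fuzz delta as assumed example constants rather than the kernel's values:

#include <stdio.h>

#define HZ                      1000    /* assumed tick rate for the example */
#define STALL_DELAY_DELTA       0       /* stands in for RCU_STALL_DELAY_DELTA */

static int rcu_cpu_stall_timeout = 60;  /* seconds, as a module parameter */

static int jiffies_till_stall_check(void)
{
        int till_stall_check = rcu_cpu_stall_timeout;

        if (till_stall_check < 3)
                till_stall_check = 3;
        else if (till_stall_check > 300)
                till_stall_check = 300;
        return till_stall_check * HZ + STALL_DELAY_DELTA;
}

int main(void)
{
        printf("first stall check after %d jiffies\n",
               jiffies_till_stall_check());
        return 0;
}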
814static void record_gp_stall_check_time(struct rcu_state *rsp) 511#ifdef CONFIG_SMP
512
513static int dyntick_save_progress_counter(struct rcu_data *rdp)
815{ 514{
816 rsp->gp_start = jiffies; 515 return 0;
817 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
818} 516}
819 517
820/* 518static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
821 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
822 * for architectures that do not implement trigger_all_cpu_backtrace().
823 * The NMI-triggered stack traces are more accurate because they are
824 * printed by the target CPU.
825 */
826static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
827{ 519{
828 int cpu; 520 return rcu_implicit_offline_qs(rdp);
829 unsigned long flags; 521}
830 struct rcu_node *rnp;
831 522
832 rcu_for_each_leaf_node(rsp, rnp) { 523#endif /* #ifdef CONFIG_SMP */
833 raw_spin_lock_irqsave(&rnp->lock, flags); 524
834 if (rnp->qsmask != 0) { 525#endif /* #else #ifdef CONFIG_NO_HZ */
835 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 526
836 if (rnp->qsmask & (1UL << cpu)) 527int rcu_cpu_stall_suppress __read_mostly;
837 dump_cpu_task(rnp->grplo + cpu); 528
838 } 529static void record_gp_stall_check_time(struct rcu_state *rsp)
839 raw_spin_unlock_irqrestore(&rnp->lock, flags); 530{
840 } 531 rsp->gp_start = jiffies;
532 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
841} 533}
842 534
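Both rcu_dump_cpu_stacks() above and the stall printouts that follow scan the leaf rcu_node structures the same way: each leaf covers CPUs grplo..grphi, and bit (cpu - grplo) of ->qsmask stays set while that CPU still owes a quiescent state. A small user-space sketch of that scan, with the structure cut down to the two fields that matter and clearly not kernel code:

#include <stdio.h>

struct leaf_node {
        unsigned long qsmask;           /* outstanding quiescent states */
        int grplo, grphi;               /* CPU range covered by this leaf */
};

static void report_holdouts(const struct leaf_node *rnp)
{
        int cpu;

        if (!rnp->qsmask)
                return;
        for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
                if (rnp->qsmask & (1UL << cpu))
                        printf("CPU %d has not yet reported\n",
                               rnp->grplo + cpu);
}

int main(void)
{
        struct leaf_node rnp = { .qsmask = 0x5, .grplo = 8, .grphi = 15 };

        report_holdouts(&rnp);          /* reports CPUs 8 and 10 */
        return 0;
}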
843static void print_other_cpu_stall(struct rcu_state *rsp) 535static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -845,9 +537,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
845 int cpu; 537 int cpu;
846 long delta; 538 long delta;
847 unsigned long flags; 539 unsigned long flags;
848 int ndetected = 0;
849 struct rcu_node *rnp = rcu_get_root(rsp); 540 struct rcu_node *rnp = rcu_get_root(rsp);
850 long totqlen = 0;
851 541
852 /* Only let one CPU complain about others per time interval. */ 542 /* Only let one CPU complain about others per time interval. */
853 543
@@ -857,7 +547,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
857 raw_spin_unlock_irqrestore(&rnp->lock, flags); 547 raw_spin_unlock_irqrestore(&rnp->lock, flags);
858 return; 548 return;
859 } 549 }
860 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 550 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
551
552 /*
553 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining.
555 */
556 rcu_print_task_stall(rnp);
861 raw_spin_unlock_irqrestore(&rnp->lock, flags); 557 raw_spin_unlock_irqrestore(&rnp->lock, flags);
862 558
863 /* 559 /*
@@ -865,77 +561,47 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
865 * See Documentation/RCU/stallwarn.txt for info on how to debug 561 * See Documentation/RCU/stallwarn.txt for info on how to debug
866 * RCU CPU stall warnings. 562 * RCU CPU stall warnings.
867 */ 563 */
868 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", 564 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
869 rsp->name); 565 rsp->name);
870 print_cpu_stall_info_begin();
871 rcu_for_each_leaf_node(rsp, rnp) { 566 rcu_for_each_leaf_node(rsp, rnp) {
872 raw_spin_lock_irqsave(&rnp->lock, flags); 567 raw_spin_lock_irqsave(&rnp->lock, flags);
873 ndetected += rcu_print_task_stall(rnp); 568 rcu_print_task_stall(rnp);
874 if (rnp->qsmask != 0) {
875 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
876 if (rnp->qsmask & (1UL << cpu)) {
877 print_cpu_stall_info(rsp,
878 rnp->grplo + cpu);
879 ndetected++;
880 }
881 }
882 raw_spin_unlock_irqrestore(&rnp->lock, flags); 569 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0)
571 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu))
574 printk(" %d", rnp->grplo + cpu);
883 } 575 }
576 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace();
884 579
885 /* 580 /* If so configured, complain about tasks blocking the grace period. */
886 * Now rat on any tasks that got kicked up to the root rcu_node
887 * due to CPU offlining.
888 */
889 rnp = rcu_get_root(rsp);
890 raw_spin_lock_irqsave(&rnp->lock, flags);
891 ndetected += rcu_print_task_stall(rnp);
892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
893
894 print_cpu_stall_info_end();
895 for_each_possible_cpu(cpu)
896 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
897 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
898 smp_processor_id(), (long)(jiffies - rsp->gp_start),
899 rsp->gpnum, rsp->completed, totqlen);
900 if (ndetected == 0)
901 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
902 else if (!trigger_all_cpu_backtrace())
903 rcu_dump_cpu_stacks(rsp);
904
905 /* Complain about tasks blocking the grace period. */
906 581
907 rcu_print_detail_task_stall(rsp); 582 rcu_print_detail_task_stall(rsp);
908 583
909 force_quiescent_state(rsp); /* Kick them all. */ 584 force_quiescent_state(rsp, 0); /* Kick them all. */
910} 585}
911 586
912static void print_cpu_stall(struct rcu_state *rsp) 587static void print_cpu_stall(struct rcu_state *rsp)
913{ 588{
914 int cpu;
915 unsigned long flags; 589 unsigned long flags;
916 struct rcu_node *rnp = rcu_get_root(rsp); 590 struct rcu_node *rnp = rcu_get_root(rsp);
917 long totqlen = 0;
918 591
919 /* 592 /*
920 * OK, time to rat on ourselves... 593 * OK, time to rat on ourselves...
921 * See Documentation/RCU/stallwarn.txt for info on how to debug 594 * See Documentation/RCU/stallwarn.txt for info on how to debug
922 * RCU CPU stall warnings. 595 * RCU CPU stall warnings.
923 */ 596 */
924 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); 597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
925 print_cpu_stall_info_begin(); 598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
926 print_cpu_stall_info(rsp, smp_processor_id()); 599 trigger_all_cpu_backtrace();
927 print_cpu_stall_info_end();
928 for_each_possible_cpu(cpu)
929 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
930 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
931 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
932 if (!trigger_all_cpu_backtrace())
933 dump_stack();
934 600
935 raw_spin_lock_irqsave(&rnp->lock, flags); 601 raw_spin_lock_irqsave(&rnp->lock, flags);
936 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
937 rsp->jiffies_stall = jiffies + 603 rsp->jiffies_stall =
938 3 * jiffies_till_stall_check() + 3; 604 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
939 raw_spin_unlock_irqrestore(&rnp->lock, flags); 605 raw_spin_unlock_irqrestore(&rnp->lock, flags);
940 606
941 set_need_resched(); /* kick ourselves to get things going. */ 607 set_need_resched(); /* kick ourselves to get things going. */
@@ -952,8 +618,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
952 j = ACCESS_ONCE(jiffies); 618 j = ACCESS_ONCE(jiffies);
953 js = ACCESS_ONCE(rsp->jiffies_stall); 619 js = ACCESS_ONCE(rsp->jiffies_stall);
954 rnp = rdp->mynode; 620 rnp = rdp->mynode;
955 if (rcu_gp_in_progress(rsp) && 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
956 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
957 622
958 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
959 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
@@ -983,10 +648,9 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
983 */ 648 */
984void rcu_cpu_stall_reset(void) 649void rcu_cpu_stall_reset(void)
985{ 650{
986 struct rcu_state *rsp; 651 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
987 652 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
988 for_each_rcu_flavor(rsp) 653 rcu_preempt_stall_reset();
989 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
990} 654}
991 655
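Setting ->jiffies_stall to jiffies + ULONG_MAX / 2, as rcu_cpu_stall_reset() does in both versions above, works because the stall checks use wraparound-safe comparisons: a deadline half the counter space away is the farthest representable future time, so stall warnings are effectively silenced until the deadline is rearmed. A quick user-space demonstration, with the comparison macro defined locally in the same spirit as the kernel's ULONG_CMP_GE():

#include <assert.h>
#include <limits.h>

/* Wrap-safe "a >= b" for unsigned long counters (local to this sketch). */
#define CMP_GE(a, b)    (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
        unsigned long now = 123456UL;                   /* pretend jiffies */
        unsigned long deadline = now + ULONG_MAX / 2;   /* "never", in effect */

        assert(!CMP_GE(now, deadline));                 /* not stalled now */
        assert(!CMP_GE(now + ULONG_MAX / 4, deadline)); /* nor much later */
        return 0;
}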
992static struct notifier_block rcu_panic_block = { 656static struct notifier_block rcu_panic_block = {
@@ -1014,10 +678,11 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
1014 * go looking for one. 678 * go looking for one.
1015 */ 679 */
1016 rdp->gpnum = rnp->gpnum; 680 rdp->gpnum = rnp->gpnum;
1017 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 681 if (rnp->qsmask & rdp->grpmask) {
1018 rdp->passed_quiesce = 0; 682 rdp->qs_pending = 1;
1019 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 683 rdp->passed_quiesc = 0;
1020 zero_cpu_stall_ticks(rdp); 684 } else
685 rdp->qs_pending = 0;
1021 } 686 }
1022} 687}
1023 688
@@ -1058,19 +723,6 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1058} 723}
1059 724
1060/* 725/*
1061 * Initialize the specified rcu_data structure's callback list to empty.
1062 */
1063static void init_callback_list(struct rcu_data *rdp)
1064{
1065 int i;
1066
1067 rdp->nxtlist = NULL;
1068 for (i = 0; i < RCU_NEXT_SIZE; i++)
1069 rdp->nxttail[i] = &rdp->nxtlist;
1070 init_nocb_callback_list(rdp);
1071}
1072
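init_callback_list() above resets the segmented callback list: ->nxtlist is one singly linked list, and ->nxttail[] stores, for each segment (DONE, WAIT, NEXT_READY, NEXT), a pointer to the pointer that terminates that segment, so an empty list has every tail aimed back at &->nxtlist. A self-contained sketch of that layout and of enqueueing at the NEXT segment, with trimmed-down illustrative types:

#include <assert.h>
#include <stddef.h>

enum { RCU_DONE_TAIL, RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL, RCU_NEXT_TAIL,
       RCU_NEXT_SIZE };

struct cb {
        struct cb *next;
};

struct cb_list {
        struct cb *nxtlist;
        struct cb **nxttail[RCU_NEXT_SIZE];
};

static void init_callback_list(struct cb_list *l)
{
        int i;

        l->nxtlist = NULL;
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                l->nxttail[i] = &l->nxtlist;
}

/* Enqueue a new callback at the very end (the NEXT segment). */
static void enqueue(struct cb_list *l, struct cb *head)
{
        head->next = NULL;
        *l->nxttail[RCU_NEXT_TAIL] = head;
        l->nxttail[RCU_NEXT_TAIL] = &head->next;
}

int main(void)
{
        struct cb_list l;
        struct cb a, b;

        init_callback_list(&l);
        enqueue(&l, &a);
        enqueue(&l, &b);
        assert(l.nxtlist == &a && a.next == &b && b.next == NULL);
        assert(l.nxttail[RCU_DONE_TAIL] == &l.nxtlist);   /* nothing done yet */
        return 0;
}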
1073/*
1074 * Advance this CPU's callbacks, but only if the current grace period 726 * Advance this CPU's callbacks, but only if the current grace period
1075 * has ended. This may be called only from the CPU to whom the rdp 727 * has ended. This may be called only from the CPU to whom the rdp
1076 * belongs. In addition, the corresponding leaf rcu_node structure's 728 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1089,7 +741,6 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1089 741
1090 /* Remember that we saw this grace-period completion. */ 742 /* Remember that we saw this grace-period completion. */
1091 rdp->completed = rnp->completed; 743 rdp->completed = rnp->completed;
1092 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1093 744
1094 /* 745 /*
1095 * If we were in an extended quiescent state, we may have 746 * If we were in an extended quiescent state, we may have
@@ -1097,13 +748,10 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1097 * our behalf. Catch up with this state to avoid noting 748 * our behalf. Catch up with this state to avoid noting
1098 * spurious new grace periods. If another grace period 749 * spurious new grace periods. If another grace period
1099 * has started, then rnp->gpnum will have advanced, so 750 * has started, then rnp->gpnum will have advanced, so
1100 * we will detect this later on. Of course, any quiescent 751 * we will detect this later on.
1101 * states we found for the old GP are now invalid.
1102 */ 752 */
1103 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { 753 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
1104 rdp->gpnum = rdp->completed; 754 rdp->gpnum = rdp->completed;
1105 rdp->passed_quiesce = 0;
1106 }
1107 755
1108 /* 756 /*
1109 * If RCU does not need a quiescent state from this CPU, 757 * If RCU does not need a quiescent state from this CPU,
@@ -1147,272 +795,120 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1147 /* Prior grace period ended, so advance callbacks for current CPU. */ 795 /* Prior grace period ended, so advance callbacks for current CPU. */
1148 __rcu_process_gp_end(rsp, rnp, rdp); 796 __rcu_process_gp_end(rsp, rnp, rdp);
1149 797
798 /*
799 * Because this CPU just now started the new grace period, we know
800 * that all of its callbacks will be covered by this upcoming grace
801 * period, even the ones that were registered arbitrarily recently.
802 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
803 *
804 * Other CPUs cannot be sure exactly when the grace period started.
805 * Therefore, their recently registered callbacks must pass through
806 * an additional RCU_NEXT_READY stage, so that they will be handled
807 * by the next RCU grace period.
808 */
809 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
810 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
811
1150 /* Set state so that this CPU will detect the next quiescent state. */ 812 /* Set state so that this CPU will detect the next quiescent state. */
1151 __note_new_gpnum(rsp, rnp, rdp); 813 __note_new_gpnum(rsp, rnp, rdp);
1152} 814}
1153 815
1154/* 816/*
1155 * Initialize a new grace period. 817 * Start a new RCU grace period if warranted, re-initializing the hierarchy
818 * in preparation for detecting the next grace period. The caller must hold
819 * the root node's ->lock, which is released before return. Hard irqs must
820 * be disabled.
1156 */ 821 */
1157static int rcu_gp_init(struct rcu_state *rsp) 822static void
823rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
824 __releases(rcu_get_root(rsp)->lock)
1158{ 825{
1159 struct rcu_data *rdp; 826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1160 struct rcu_node *rnp = rcu_get_root(rsp); 827 struct rcu_node *rnp = rcu_get_root(rsp);
1161 828
1162 raw_spin_lock_irq(&rnp->lock); 829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
1163 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 830 if (cpu_needs_another_gp(rsp, rdp))
831 rsp->fqs_need_gp = 1;
832 if (rnp->completed == rsp->completed) {
833 raw_spin_unlock_irqrestore(&rnp->lock, flags);
834 return;
835 }
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1164 837
1165 if (rcu_gp_in_progress(rsp)) { 838 /*
1166 /* Grace period already in progress, don't start another. */ 839 * Propagate new ->completed value to rcu_node structures
1167 raw_spin_unlock_irq(&rnp->lock); 840 * so that other CPUs don't have to wait until the start
1168 return 0; 841 * of the next grace period to process their callbacks.
842 */
843 rcu_for_each_node_breadth_first(rsp, rnp) {
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return;
1169 } 850 }
1170 851
1171 /* Advance to a new grace period and initialize state. */ 852 /* Advance to a new grace period and initialize state. */
1172 rsp->gpnum++; 853 rsp->gpnum++;
1173 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1174 record_gp_stall_check_time(rsp); 857 record_gp_stall_check_time(rsp);
1175 raw_spin_unlock_irq(&rnp->lock); 858
859 /* Special-case the common single-level case. */
860 if (NUM_RCU_NODES == 1) {
861 rcu_preempt_check_blocked_tasks(rnp);
862 rnp->qsmask = rnp->qsmaskinit;
863 rnp->gpnum = rsp->gpnum;
864 rnp->completed = rsp->completed;
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return;
870 }
871
872 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
873
1176 874
1177 /* Exclude any concurrent CPU-hotplug operations. */ 875 /* Exclude any concurrent CPU-hotplug operations. */
1178 mutex_lock(&rsp->onoff_mutex); 876 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1179 877
1180 /* 878 /*
1181 * Set the quiescent-state-needed bits in all the rcu_node 879 * Set the quiescent-state-needed bits in all the rcu_node
1182 * structures for all currently online CPUs in breadth-first order, 880 * structures for all currently online CPUs in breadth-first
1183 * starting from the root rcu_node structure, relying on the layout 881 * order, starting from the root rcu_node structure. This
1184 * of the tree within the rsp->node[] array. Note that other CPUs 882 * operation relies on the layout of the hierarchy within the
1185 * will access only the leaves of the hierarchy, thus seeing that no 883 * rsp->node[] array. Note that other CPUs will access only
884 * the leaves of the hierarchy, which still indicate that no
1186 * grace period is in progress, at least until the corresponding 885 * grace period is in progress, at least until the corresponding
1187 * leaf node has been initialized. In addition, we have excluded 886 * leaf node has been initialized. In addition, we have excluded
1188 * CPU-hotplug operations. 887 * CPU-hotplug operations.
1189 * 888 *
1190 * The grace period cannot complete until the initialization 889 * Note that the grace period cannot complete until we finish
1191 * process finishes, because this kthread handles both. 890 * the initialization process, as there will be at least one
891 * qsmask bit set in the root node until that time, namely the
892 * one corresponding to this CPU, due to the fact that we have
893 * irqs disabled.
1192 */ 894 */
1193 rcu_for_each_node_breadth_first(rsp, rnp) { 895 rcu_for_each_node_breadth_first(rsp, rnp) {
1194 raw_spin_lock_irq(&rnp->lock); 896 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1195 rdp = this_cpu_ptr(rsp->rda);
1196 rcu_preempt_check_blocked_tasks(rnp); 897 rcu_preempt_check_blocked_tasks(rnp);
1197 rnp->qsmask = rnp->qsmaskinit; 898 rnp->qsmask = rnp->qsmaskinit;
1198 rnp->gpnum = rsp->gpnum; 899 rnp->gpnum = rsp->gpnum;
1199 WARN_ON_ONCE(rnp->completed != rsp->completed);
1200 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
1201 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
1202 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1203 rcu_preempt_boost_start_gp(rnp); 903 rcu_preempt_boost_start_gp(rnp);
1204 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1205 rnp->level, rnp->grplo,
1206 rnp->grphi, rnp->qsmask);
1207 raw_spin_unlock_irq(&rnp->lock);
1208#ifdef CONFIG_PROVE_RCU_DELAY
1209 if ((random32() % (rcu_num_nodes * 8)) == 0)
1210 schedule_timeout_uninterruptible(2);
1211#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1212 cond_resched();
1213 } 905 }
1214 906
1215 mutex_unlock(&rsp->onoff_mutex);
1216 return 1;
1217}
1218
1219/*
1220 * Do one round of quiescent-state forcing.
1221 */
1222int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1223{
1224 int fqs_state = fqs_state_in;
1225 struct rcu_node *rnp = rcu_get_root(rsp);
1226
1227 rsp->n_force_qs++;
1228 if (fqs_state == RCU_SAVE_DYNTICK) {
1229 /* Collect dyntick-idle snapshots. */
1230 force_qs_rnp(rsp, dyntick_save_progress_counter);
1231 fqs_state = RCU_FORCE_QS;
1232 } else {
1233 /* Handle dyntick-idle and offline CPUs. */
1234 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1235 }
1236 /* Clear flag to prevent immediate re-entry. */
1237 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1238 raw_spin_lock_irq(&rnp->lock);
1239 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1240 raw_spin_unlock_irq(&rnp->lock);
1241 }
1242 return fqs_state;
1243}
1244
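Quiescent-state forcing is two-phased in both versions shown in this diff: the first pass snapshots each holdout CPU's dynticks counter (dyntick_save_progress_counter), and later passes compare against those snapshots and handle offline CPUs (rcu_implicit_dynticks_qs). In rcu_gp_fqs() above that phase lives in the fqs_state variable; in the force_quiescent_state() variant later in the diff it is the RCU_SAVE_DYNTICK/RCU_FORCE_QS switch on ->signaled. A toy state machine showing just the phase handling, with made-up helper names:

#include <stdio.h>

enum fqs_state { SAVE_DYNTICK, FORCE_QS };

static void snapshot_holdouts(void) { puts("collect dyntick snapshots"); }
static void recheck_holdouts(void)  { puts("compare against the snapshots"); }

static enum fqs_state do_fqs(enum fqs_state state)
{
        if (state == SAVE_DYNTICK) {
                snapshot_holdouts();
                return FORCE_QS;        /* next round compares */
        }
        recheck_holdouts();
        return FORCE_QS;                /* keep comparing until the GP ends */
}

int main(void)
{
        enum fqs_state state = SAVE_DYNTICK;

        state = do_fqs(state);          /* first forcing pass */
        do_fqs(state);                  /* any later pass */
        return 0;
}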
1245/*
1246 * Clean up after the old grace period.
1247 */
1248static void rcu_gp_cleanup(struct rcu_state *rsp)
1249{
1250 unsigned long gp_duration;
1251 struct rcu_data *rdp;
1252 struct rcu_node *rnp = rcu_get_root(rsp);
1253
1254 raw_spin_lock_irq(&rnp->lock);
1255 gp_duration = jiffies - rsp->gp_start;
1256 if (gp_duration > rsp->gp_max)
1257 rsp->gp_max = gp_duration;
1258
1259 /*
1260 * We know the grace period is complete, but to everyone else
1261 * it appears to still be ongoing. But it is also the case
1262 * that to everyone else it looks like there is nothing that
1263 * they can do to advance the grace period. It is therefore
1264 * safe for us to drop the lock in order to mark the grace
1265 * period as completed in all of the rcu_node structures.
1266 */
1267 raw_spin_unlock_irq(&rnp->lock);
1268
1269 /*
1270 * Propagate new ->completed value to rcu_node structures so
1271 * that other CPUs don't have to wait until the start of the next
1272 * grace period to process their callbacks. This also avoids
1273 * some nasty RCU grace-period initialization races by forcing
1274 * the end of the current grace period to be completely recorded in
1275 * all of the rcu_node structures before the beginning of the next
1276 * grace period is recorded in any of the rcu_node structures.
1277 */
1278 rcu_for_each_node_breadth_first(rsp, rnp) {
1279 raw_spin_lock_irq(&rnp->lock);
1280 rnp->completed = rsp->gpnum;
1281 raw_spin_unlock_irq(&rnp->lock);
1282 cond_resched();
1283 }
1284 rnp = rcu_get_root(rsp); 907 rnp = rcu_get_root(rsp);
1285 raw_spin_lock_irq(&rnp->lock); 908 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1286 909 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
1287 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 910 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1288 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 911 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1289 rsp->fqs_state = RCU_GP_IDLE;
1290 rdp = this_cpu_ptr(rsp->rda);
1291 if (cpu_needs_another_gp(rsp, rdp))
1292 rsp->gp_flags = 1;
1293 raw_spin_unlock_irq(&rnp->lock);
1294}
1295
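The rcu_for_each_node_breadth_first() walks used here rely on the rcu_node tree being laid out breadth-first inside a single array, so "visit every node, root before leaves" is simply a linear scan of that array. A trivial user-space sketch of propagating ->completed that way; the node count and grace-period number are made up:

#include <stdio.h>

#define NUM_NODES 5                     /* say, 1 root plus 4 leaves */

struct rnode {
        unsigned long completed;
};

int main(void)
{
        struct rnode node[NUM_NODES] = { { 0 } };
        unsigned long gpnum = 42;       /* grace period being retired */
        int i;

        /* rcu_for_each_node_breadth_first() is this loop in disguise. */
        for (i = 0; i < NUM_NODES; i++)
                node[i].completed = gpnum;

        printf("all %d nodes now show completed=%lu\n",
               NUM_NODES, node[0].completed);
        return 0;
}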
1296/*
1297 * Body of kthread that handles grace periods.
1298 */
1299static int __noreturn rcu_gp_kthread(void *arg)
1300{
1301 int fqs_state;
1302 unsigned long j;
1303 int ret;
1304 struct rcu_state *rsp = arg;
1305 struct rcu_node *rnp = rcu_get_root(rsp);
1306
1307 for (;;) {
1308
1309 /* Handle grace-period start. */
1310 for (;;) {
1311 wait_event_interruptible(rsp->gp_wq,
1312 rsp->gp_flags &
1313 RCU_GP_FLAG_INIT);
1314 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
1315 rcu_gp_init(rsp))
1316 break;
1317 cond_resched();
1318 flush_signals(current);
1319 }
1320
1321 /* Handle quiescent-state forcing. */
1322 fqs_state = RCU_SAVE_DYNTICK;
1323 j = jiffies_till_first_fqs;
1324 if (j > HZ) {
1325 j = HZ;
1326 jiffies_till_first_fqs = HZ;
1327 }
1328 for (;;) {
1329 rsp->jiffies_force_qs = jiffies + j;
1330 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1331 (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
1332 (!ACCESS_ONCE(rnp->qsmask) &&
1333 !rcu_preempt_blocked_readers_cgp(rnp)),
1334 j);
1335 /* If grace period done, leave loop. */
1336 if (!ACCESS_ONCE(rnp->qsmask) &&
1337 !rcu_preempt_blocked_readers_cgp(rnp))
1338 break;
1339 /* If time for quiescent-state forcing, do it. */
1340 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
1341 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1342 cond_resched();
1343 } else {
1344 /* Deal with stray signal. */
1345 cond_resched();
1346 flush_signals(current);
1347 }
1348 j = jiffies_till_next_fqs;
1349 if (j > HZ) {
1350 j = HZ;
1351 jiffies_till_next_fqs = HZ;
1352 } else if (j < 1) {
1353 j = 1;
1354 jiffies_till_next_fqs = 1;
1355 }
1356 }
1357
1358 /* Handle grace-period end. */
1359 rcu_gp_cleanup(rsp);
1360 }
1361}
1362
1363/*
1364 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1365 * in preparation for detecting the next grace period. The caller must hold
1366 * the root node's ->lock, which is released before return. Hard irqs must
1367 * be disabled.
1368 *
1369 * Note that it is legal for a dying CPU (which is marked as offline) to
1370 * invoke this function. This can happen when the dying CPU reports its
1371 * quiescent state.
1372 */
1373static void
1374rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1375 __releases(rcu_get_root(rsp)->lock)
1376{
1377 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1378 struct rcu_node *rnp = rcu_get_root(rsp);
1379
1380 if (!rsp->gp_kthread ||
1381 !cpu_needs_another_gp(rsp, rdp)) {
1382 /*
1383 * Either we have not yet spawned the grace-period
1384 * task, this CPU does not need another grace period,
1385 * or a grace period is already in progress.
1386 * Either way, don't start a new grace period.
1387 */
1388 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1389 return;
1390 }
1391
1392 /*
1393 * Because there is no grace period in progress right now,
1394 * any callbacks we have up to this point will be satisfied
1395 * by the next grace period. So promote all callbacks to be
1396 * handled after the end of the next grace period. If the
1397 * CPU is not yet aware of the end of the previous grace period,
1398 * we need to allow for the callback advancement that will
1399 * occur when it does become aware. Deadlock prevents us from
1400 * making it aware at this point: We cannot acquire a leaf
1401 * rcu_node ->lock while holding the root rcu_node ->lock.
1402 */
1403 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1404 if (rdp->completed == rsp->completed)
1405 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1406
1407 rsp->gp_flags = RCU_GP_FLAG_INIT;
1408 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1409
1410 /* Ensure that CPU is aware of completion of last grace period. */
1411 rcu_process_gp_end(rsp, rdp);
1412 local_irq_restore(flags);
1413
1414 /* Wake up rcu_gp_kthread() to start the grace period. */
1415 wake_up(&rsp->gp_wq);
1416} 912}
1417 913
1418/* 914/*
@@ -1425,9 +921,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1425static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1426 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
1427{ 923{
924 unsigned long gp_duration;
925
1428 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1429 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 927
1430 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum;
937 rsp->signaled = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
1431} 939}
1432 940
1433/* 941/*
@@ -1454,10 +962,6 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1454 return; 962 return;
1455 } 963 }
1456 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
1457 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1458 mask, rnp->qsmask, rnp->level,
1459 rnp->grplo, rnp->grphi,
1460 !!rnp->gp_tasks);
1461 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
1462 966
1463 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
@@ -1496,7 +1000,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1496 * based on quiescent states detected in an earlier grace period! 1000 * based on quiescent states detected in an earlier grace period!
1497 */ 1001 */
1498static void 1002static void
1499rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
1500{ 1004{
1501 unsigned long flags; 1005 unsigned long flags;
1502 unsigned long mask; 1006 unsigned long mask;
@@ -1504,16 +1008,17 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1504 1008
1505 rnp = rdp->mynode; 1009 rnp = rdp->mynode;
1506 raw_spin_lock_irqsave(&rnp->lock, flags); 1010 raw_spin_lock_irqsave(&rnp->lock, flags);
1507 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 1011 if (lastcomp != rnp->completed) {
1508 rnp->completed == rnp->gpnum) {
1509 1012
1510 /* 1013 /*
1511 * The grace period in which this quiescent state was 1014 * Someone beat us to it for this grace period, so leave.
1512 * recorded has ended, so don't report it upwards. 1015 * The race with GP start is resolved by the fact that we
1513 * We will instead need a new quiescent state that lies 1016 * hold the leaf rcu_node lock, so that the per-CPU bits
1514 * within the current grace period. 1017 * cannot yet be initialized -- so we would simply find our
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1515 */ 1020 */
1516 rdp->passed_quiesce = 0; /* need qs for new gp. */ 1021 rdp->passed_quiesc = 0; /* try again later! */
1517 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1022 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1518 return; 1023 return;
1519 } 1024 }
@@ -1557,165 +1062,67 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1557 * Was there a quiescent state since the beginning of the grace 1062 * Was there a quiescent state since the beginning of the grace
1558 * period? If no, then exit and wait for the next call. 1063 * period? If no, then exit and wait for the next call.
1559 */ 1064 */
1560 if (!rdp->passed_quiesce) 1065 if (!rdp->passed_quiesc)
1561 return; 1066 return;
1562 1067
1563 /* 1068 /*
1564 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1565 * judge of that). 1070 * judge of that).
1566 */ 1071 */
1567 rcu_report_qs_rdp(rdp->cpu, rsp, rdp); 1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
1568} 1073}
1569 1074
1570#ifdef CONFIG_HOTPLUG_CPU 1075#ifdef CONFIG_HOTPLUG_CPU
1571 1076
1572/* 1077/*
1573 * Send the specified CPU's RCU callbacks to the orphanage. The 1078 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1574 * specified CPU must be offline, and the caller must hold the 1079 * Synchronization is not required because this function executes
1575 * ->orphan_lock. 1080 * in stop_machine() context.
1576 */ 1081 */
1577static void 1082static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1578rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1579 struct rcu_node *rnp, struct rcu_data *rdp)
1580{
1581 /* No-CBs CPUs do not have orphanable callbacks. */
1582 if (is_nocb_cpu(rdp->cpu))
1583 return;
1584
1585 /*
1586 * Orphan the callbacks. First adjust the counts. This is safe
1587 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1588 * cannot be running now. Thus no memory barrier is required.
1589 */
1590 if (rdp->nxtlist != NULL) {
1591 rsp->qlen_lazy += rdp->qlen_lazy;
1592 rsp->qlen += rdp->qlen;
1593 rdp->n_cbs_orphaned += rdp->qlen;
1594 rdp->qlen_lazy = 0;
1595 ACCESS_ONCE(rdp->qlen) = 0;
1596 }
1597
1598 /*
1599 * Next, move those callbacks still needing a grace period to
1600 * the orphanage, where some other CPU will pick them up.
1601 * Some of the callbacks might have gone partway through a grace
1602 * period, but that is too bad. They get to start over because we
1603 * cannot assume that grace periods are synchronized across CPUs.
1604 * We don't bother updating the ->nxttail[] array yet, instead
1605 * we just reset the whole thing later on.
1606 */
1607 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
1608 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1609 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1610 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1611 }
1612
1613 /*
1614 * Then move the ready-to-invoke callbacks to the orphanage,
1615 * where some other CPU will pick them up. These will not be
1616 * required to pass though another grace period: They are done.
1617 */
1618 if (rdp->nxtlist != NULL) {
1619 *rsp->orphan_donetail = rdp->nxtlist;
1620 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1621 }
1622
1623 /* Finally, initialize the rcu_data structure's list to empty. */
1624 init_callback_list(rdp);
1625}
1626
1627/*
1628 * Adopt the RCU callbacks from the specified rcu_state structure's
1629 * orphanage. The caller must hold the ->orphan_lock.
1630 */
1631static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1632{ 1083{
1633 int i; 1084 int i;
1634 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1085 /* current DYING CPU is cleared in the cpu_online_mask */
1635 1086 int receive_cpu = cpumask_any(cpu_online_mask);
1636 /* No-CBs CPUs are handled specially. */ 1087 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1637 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) 1088 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1638 return;
1639
1640 /* Do the accounting first. */
1641 rdp->qlen_lazy += rsp->qlen_lazy;
1642 rdp->qlen += rsp->qlen;
1643 rdp->n_cbs_adopted += rsp->qlen;
1644 if (rsp->qlen_lazy != rsp->qlen)
1645 rcu_idle_count_callbacks_posted();
1646 rsp->qlen_lazy = 0;
1647 rsp->qlen = 0;
1648
1649 /*
1650 * We do not need a memory barrier here because the only way we
1651 * can get here if there is an rcu_barrier() in flight is if
1652 * we are the task doing the rcu_barrier().
1653 */
1654
1655 /* First adopt the ready-to-invoke callbacks. */
1656 if (rsp->orphan_donelist != NULL) {
1657 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1658 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1659 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1660 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1661 rdp->nxttail[i] = rsp->orphan_donetail;
1662 rsp->orphan_donelist = NULL;
1663 rsp->orphan_donetail = &rsp->orphan_donelist;
1664 }
1665 1089
1666 /* And then adopt the callbacks that still need a grace period. */ 1090 if (rdp->nxtlist == NULL)
1667 if (rsp->orphan_nxtlist != NULL) { 1091 return; /* irqs disabled, so comparison is stable. */
1668 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1669 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1670 rsp->orphan_nxtlist = NULL;
1671 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1672 }
1673}
1674 1092
1675/* 1093 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1676 * Trace the fact that this CPU is going offline. 1094 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1677 */ 1095 receive_rdp->qlen += rdp->qlen;
1678static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1096 receive_rdp->n_cbs_adopted += rdp->qlen;
1679{ 1097 rdp->n_cbs_orphaned += rdp->qlen;
1680 RCU_TRACE(unsigned long mask);
1681 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1682 RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1683 1098
1684 RCU_TRACE(mask = rdp->grpmask); 1099 rdp->nxtlist = NULL;
1685 trace_rcu_grace_period(rsp->name, 1100 for (i = 0; i < RCU_NEXT_SIZE; i++)
1686 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1101 rdp->nxttail[i] = &rdp->nxtlist;
1687 "cpuofl"); 1102 rdp->qlen = 0;
1688} 1103}
1689 1104
1690/* 1105/*
1691 * The CPU has been completely removed, and some other CPU is reporting 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1692 * this fact from process context. Do the remainder of the cleanup, 1107 * and move all callbacks from the outgoing CPU to the current one.
1693 * including orphaning the outgoing CPU's RCU callbacks, and also 1108 * There can only be one CPU hotplug operation at a time, so no other
1694 * adopting them. There can only be one CPU hotplug operation at a time, 1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1695 * so no other CPU can be attempting to update rcu_cpu_kthread_task.
1696 */ 1110 */
1697static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1698{ 1112{
1699 unsigned long flags; 1113 unsigned long flags;
1700 unsigned long mask; 1114 unsigned long mask;
1701 int need_report = 0; 1115 int need_report = 0;
1702 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1703 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 1117 struct rcu_node *rnp;
1704
1705 /* Adjust any no-longer-needed kthreads. */
1706 rcu_boost_kthread_setaffinity(rnp, -1);
1707 1118
1708 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1119 rcu_stop_cpu_kthread(cpu);
1709 1120
1710 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1711 mutex_lock(&rsp->onoff_mutex); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1712 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1713
1714 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1715 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1716 rcu_adopt_orphan_cbs(rsp);
1717 1123
1718 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1124 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1125 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1719 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1126 mask = rdp->grpmask; /* rnp->grplo is constant. */
1720 do { 1127 do {
1721 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1128 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -1736,33 +1143,40 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1736 /* 1143 /*
1737 * We still hold the leaf rcu_node structure lock here, and 1144 * We still hold the leaf rcu_node structure lock here, and
1738 * irqs are still disabled. The reason for this subterfuge is 1145 * irqs are still disabled. The reason for this subterfuge is
1739 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock 1146 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
1740 * held leads to deadlock. 1147 * held leads to deadlock.
1741 */ 1148 */
1742 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ 1149 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1743 rnp = rdp->mynode; 1150 rnp = rdp->mynode;
1744 if (need_report & RCU_OFL_TASKS_NORM_GP) 1151 if (need_report & RCU_OFL_TASKS_NORM_GP)
1745 rcu_report_unblock_qs_rnp(rnp, flags); 1152 rcu_report_unblock_qs_rnp(rnp, flags);
1746 else 1153 else
1747 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1748 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1749 rcu_report_exp_rnp(rsp, rnp, true); 1156 rcu_report_exp_rnp(rsp, rnp);
1750 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 1157 rcu_node_kthread_setaffinity(rnp, -1);
1751 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 1158}
1752 cpu, rdp->qlen, rdp->nxtlist); 1159
1753 init_callback_list(rdp); 1160/*
1754 /* Disallow further callbacks on this CPU. */ 1161 * Remove the specified CPU from the RCU hierarchy and move any pending
1755 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 1162 * callbacks that it might have to the current CPU. This code assumes
1756 mutex_unlock(&rsp->onoff_mutex); 1163 * that at least one CPU in the system will remain running at all times.
1164 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1165 */
1166static void rcu_offline_cpu(int cpu)
1167{
1168 __rcu_offline_cpu(cpu, &rcu_sched_state);
1169 __rcu_offline_cpu(cpu, &rcu_bh_state);
1170 rcu_preempt_offline_cpu(cpu);
1757} 1171}
1758 1172
1759#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1173#else /* #ifdef CONFIG_HOTPLUG_CPU */
1760 1174
1761static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 1175static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1762{ 1176{
1763} 1177}
1764 1178
1765static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 1179static void rcu_offline_cpu(int cpu)
1766{ 1180{
1767} 1181}
1768 1182
@@ -1776,70 +1190,52 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1776{ 1190{
1777 unsigned long flags; 1191 unsigned long flags;
1778 struct rcu_head *next, *list, **tail; 1192 struct rcu_head *next, *list, **tail;
1779 long bl, count, count_lazy; 1193 int count;
1780 int i;
1781 1194
1782 /* If no callbacks are ready, just return.*/ 1195 /* If no callbacks are ready, just return.*/
1783 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1196 if (!cpu_has_callbacks_ready_to_invoke(rdp))
1784 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1785 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1786 need_resched(), is_idle_task(current),
1787 rcu_is_callbacks_kthread());
1788 return; 1197 return;
1789 }
1790 1198
1791 /* 1199 /*
1792 * Extract the list of ready callbacks, disabling to prevent 1200 * Extract the list of ready callbacks, disabling to prevent
1793 * races with call_rcu() from interrupt handlers. 1201 * races with call_rcu() from interrupt handlers.
1794 */ 1202 */
1795 local_irq_save(flags); 1203 local_irq_save(flags);
1796 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1797 bl = rdp->blimit;
1798 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
1799 list = rdp->nxtlist; 1204 list = rdp->nxtlist;
1800 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1801 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1802 tail = rdp->nxttail[RCU_DONE_TAIL]; 1207 tail = rdp->nxttail[RCU_DONE_TAIL];
1803 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) 1208 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
1804 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 1209 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
1805 rdp->nxttail[i] = &rdp->nxtlist; 1210 rdp->nxttail[count] = &rdp->nxtlist;
1806 local_irq_restore(flags); 1211 local_irq_restore(flags);
1807 1212
1808 /* Invoke callbacks. */ 1213 /* Invoke callbacks. */
1809 count = count_lazy = 0; 1214 count = 0;
1810 while (list) { 1215 while (list) {
1811 next = list->next; 1216 next = list->next;
1812 prefetch(next); 1217 prefetch(next);
1813 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1814 if (__rcu_reclaim(rsp->name, list)) 1219 __rcu_reclaim(list);
1815 count_lazy++;
1816 list = next; 1220 list = next;
1817 /* Stop only if limit reached and CPU has something to do. */ 1221 if (++count >= rdp->blimit)
1818 if (++count >= bl &&
1819 (need_resched() ||
1820 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1821 break; 1222 break;
1822 } 1223 }
1823 1224
1824 local_irq_save(flags); 1225 local_irq_save(flags);
1825 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1826 is_idle_task(current),
1827 rcu_is_callbacks_kthread());
1828 1226
1829 /* Update count, and requeue any remaining callbacks. */ 1227 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count;
1229 rdp->n_cbs_invoked += count;
1830 if (list != NULL) { 1230 if (list != NULL) {
1831 *tail = rdp->nxtlist; 1231 *tail = rdp->nxtlist;
1832 rdp->nxtlist = list; 1232 rdp->nxtlist = list;
1833 for (i = 0; i < RCU_NEXT_SIZE; i++) 1233 for (count = 0; count < RCU_NEXT_SIZE; count++)
1834 if (&rdp->nxtlist == rdp->nxttail[i]) 1234 if (&rdp->nxtlist == rdp->nxttail[count])
1835 rdp->nxttail[i] = tail; 1235 rdp->nxttail[count] = tail;
1836 else 1236 else
1837 break; 1237 break;
1838 } 1238 }
1839 smp_mb(); /* List handling before counting for rcu_barrier(). */
1840 rdp->qlen_lazy -= count_lazy;
1841 ACCESS_ONCE(rdp->qlen) -= count;
1842 rdp->n_cbs_invoked += count;
1843 1239
1844 /* Reinstate batch limit if we have worked down the excess. */ 1240 /* Reinstate batch limit if we have worked down the excess. */
1845 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 1241 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
@@ -1851,11 +1247,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1851 rdp->n_force_qs_snap = rsp->n_force_qs; 1247 rdp->n_force_qs_snap = rsp->n_force_qs;
1852 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1248 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1853 rdp->qlen_last_fqs_check = rdp->qlen; 1249 rdp->qlen_last_fqs_check = rdp->qlen;
1854 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
1855 1250
1856 local_irq_restore(flags); 1251 local_irq_restore(flags);
1857 1252
1858 /* Re-invoke RCU core processing if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1859 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1860 invoke_rcu_core(); 1255 invoke_rcu_core();
1861} 1256}
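rcu_do_batch() above invokes at most ->blimit callbacks per pass, and elsewhere in this file the limit is opened up to LONG_MAX when the queue length passes qhimark and reinstated once the backlog drops to qlowmark. The sketch below compresses that policy into one place purely for illustration: the thresholds are assumed values, and in the real code the checks are split between rcu_do_batch() and the call_rcu() path.

#include <limits.h>
#include <stdio.h>

static long blimit = 10;                /* default batch limit */
static const long qhimark = 10000;      /* "queue is huge" threshold */
static const long qlowmark = 100;       /* "backlog worked down" threshold */

static long do_batch(long qlen)
{
        long invoked = qlen < blimit ? qlen : blimit;

        qlen -= invoked;
        printf("invoked %ld callbacks, %ld remain\n", invoked, qlen);
        if (blimit == LONG_MAX && qlen <= qlowmark)
                blimit = 10;            /* reinstate the normal limit */
        return qlen;
}

int main(void)
{
        long qlen = 20000;

        if (qlen > qhimark)
                blimit = LONG_MAX;      /* emergency: drain aggressively */
        qlen = do_batch(qlen);          /* drains everything this pass */
        do_batch(qlen);
        return 0;
}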
@@ -1863,17 +1258,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1863/* 1258/*
1864 * Check to see if this CPU is in a non-context-switch quiescent state 1259 * Check to see if this CPU is in a non-context-switch quiescent state
1865 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1866 * Also schedule RCU core processing. 1261 * Also schedule the RCU softirq handler.
1867 * 1262 *
1868 * This function must be called from hardirq context. It is normally 1263 * This function must be called with hardirqs disabled. It is normally
1869 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1870 * false, there is no point in invoking rcu_check_callbacks(). 1265 * false, there is no point in invoking rcu_check_callbacks().
1871 */ 1266 */
1872void rcu_check_callbacks(int cpu, int user) 1267void rcu_check_callbacks(int cpu, int user)
1873{ 1268{
1874 trace_rcu_utilization("Start scheduler-tick"); 1269 if (user ||
1875 increment_cpu_stall_ticks(); 1270 (idle_cpu(cpu) && rcu_scheduler_active &&
1876 if (user || rcu_is_cpu_rrupt_from_idle()) { 1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1877 1272
1878 /* 1273 /*
1879 * Get here if this CPU took its interrupt from user 1274 * Get here if this CPU took its interrupt from user
@@ -1904,9 +1299,10 @@ void rcu_check_callbacks(int cpu, int user)
1904 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1905 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1906 invoke_rcu_core(); 1301 invoke_rcu_core();
1907 trace_rcu_utilization("End scheduler-tick");
1908} 1302}
1909 1303
1304#ifdef CONFIG_SMP
1305
1910/* 1306/*
1911 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1912 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
@@ -1923,7 +1319,6 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1923 struct rcu_node *rnp; 1319 struct rcu_node *rnp;
1924 1320
1925 rcu_for_each_leaf_node(rsp, rnp) { 1321 rcu_for_each_leaf_node(rsp, rnp) {
1926 cond_resched();
1927 mask = 0; 1322 mask = 0;
1928 raw_spin_lock_irqsave(&rnp->lock, flags); 1323 raw_spin_lock_irqsave(&rnp->lock, flags);
1929 if (!rcu_gp_in_progress(rsp)) { 1324 if (!rcu_gp_in_progress(rsp)) {
@@ -1960,55 +1355,99 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1960 * Force quiescent states on reluctant CPUs, and also detect which 1355 * Force quiescent states on reluctant CPUs, and also detect which
1961 * CPUs are in dyntick-idle mode. 1356 * CPUs are in dyntick-idle mode.
1962 */ 1357 */
1963static void force_quiescent_state(struct rcu_state *rsp) 1358static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1964{ 1359{
1965 unsigned long flags; 1360 unsigned long flags;
1966 bool ret; 1361 struct rcu_node *rnp = rcu_get_root(rsp);
1967 struct rcu_node *rnp; 1362
1968 struct rcu_node *rnp_old = NULL; 1363 if (!rcu_gp_in_progress(rsp))
1969 1364 return; /* No grace period in progress, nothing to force. */
1970 /* Funnel through hierarchy to reduce memory contention. */ 1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1971 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; 1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1972 for (; rnp != NULL; rnp = rnp->parent) { 1367 return; /* Someone else is already on the job. */
1973 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
1974 !raw_spin_trylock(&rnp->fqslock);
1975 if (rnp_old != NULL)
1976 raw_spin_unlock(&rnp_old->fqslock);
1977 if (ret) {
1978 rsp->n_force_qs_lh++;
1979 return;
1980 }
1981 rnp_old = rnp;
1982 } 1368 }
1983 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1984 1370 goto unlock_fqs_ret; /* no emergency and done recently. */
1985 /* Reached the root of the rcu_node tree, acquire lock. */ 1371 rsp->n_force_qs++;
1986 raw_spin_lock_irqsave(&rnp_old->lock, flags); 1372 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1987 raw_spin_unlock(&rnp_old->fqslock); 1373 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1988 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1374 if (!rcu_gp_in_progress(rsp)) {
1989 rsp->n_force_qs_lh++; 1375 rsp->n_force_qs_ngp++;
1990 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 1376 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1991 return; /* Someone beat us to it. */ 1377 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1992 } 1378 }
1993 rsp->gp_flags |= RCU_GP_FLAG_FQS; 1379 rsp->fqs_active = 1;
1994 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 1380 switch (rsp->signaled) {
1995 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 1381 case RCU_GP_IDLE:
1382 case RCU_GP_INIT:
1383
1384 break; /* grace period idle or initializing, ignore. */
1385
1386 case RCU_SAVE_DYNTICK:
1387 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1388 break; /* So gcc recognizes the dead code. */
1389
1390 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1391
1392 /* Record dyntick-idle state. */
1393 force_qs_rnp(rsp, dyntick_save_progress_counter);
1394 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1395 if (rcu_gp_in_progress(rsp))
1396 rsp->signaled = RCU_FORCE_QS;
1397 break;
1398
1399 case RCU_FORCE_QS:
1400
1401 /* Check dyntick-idle state, send IPI to laggarts. */
1402 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1403 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1404
1405 /* Leave state in case more forcing is required. */
1406
1407 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1408 break;
1409 }
1410 rsp->fqs_active = 0;
1411 if (rsp->fqs_need_gp) {
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1415 return;
1416 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1420}
1421
1422#else /* #ifdef CONFIG_SMP */
1423
1424static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1425{
1426 set_need_resched();
1996} 1427}
1997 1428
1429#endif /* #else #ifdef CONFIG_SMP */
1430
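The force_quiescent_state() on the old side of this hunk (the version being removed) uses funnel locking: each caller walks from its leaf rcu_node toward the root, trylocking each level's ->fqslock and dropping the one below it, so contending callers are shed early instead of all piling onto the root. A user-space sketch of that pattern, with pthread mutexes standing in for the kernel's raw spinlock trylocks and a two-level tree assumed:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
        pthread_mutex_t fqslock;
        struct node *parent;            /* NULL at the root */
};

/* Returns true if this caller wins the race to reach the root. */
static bool funnel_to_root(struct node *leaf)
{
        struct node *rnp, *rnp_old = NULL;

        for (rnp = leaf; rnp; rnp = rnp->parent) {
                bool failed = pthread_mutex_trylock(&rnp->fqslock) != 0;

                if (rnp_old)
                        pthread_mutex_unlock(&rnp_old->fqslock);
                if (failed)
                        return false;   /* someone else is already funnelling */
                rnp_old = rnp;
        }
        /* Only the root's fqslock is held here; do the work, then drop it. */
        pthread_mutex_unlock(&rnp_old->fqslock);
        return true;
}

int main(void)
{
        static struct node root = { PTHREAD_MUTEX_INITIALIZER, NULL };
        static struct node leaf = { PTHREAD_MUTEX_INITIALIZER, &root };

        printf("reached the root: %s\n",
               funnel_to_root(&leaf) ? "yes" : "no");
        return 0;
}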
1998/* 1431/*
1999 * This does the RCU core processing work for the specified rcu_state 1432 * This does the RCU processing work from softirq context for the
2000 * and rcu_data structures. This may be called only from the CPU to 1433 * specified rcu_state and rcu_data structures. This may be called
2001 * whom the rdp belongs. 1434 * only from the CPU to whom the rdp belongs.
2002 */ 1435 */
2003static void 1436static void
2004__rcu_process_callbacks(struct rcu_state *rsp) 1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2005{ 1438{
2006 unsigned long flags; 1439 unsigned long flags;
2007 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2008 1440
2009 WARN_ON_ONCE(rdp->beenonline == 0); 1441 WARN_ON_ONCE(rdp->beenonline == 0);
2010 1442
2011 /* 1443 /*
1444 * If an RCU GP has gone long enough, go check for dyntick
1445 * idle CPUs and, if needed, send resched IPIs.
1446 */
1447 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1448 force_quiescent_state(rsp, 1);
1449
1450 /*
2012 * Advance callbacks in response to end of earlier grace 1451 * Advance callbacks in response to end of earlier grace
2013 * period that some other CPU ended. 1452 * period that some other CPU ended.
2014 */ 1453 */
@@ -2029,26 +1468,24 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2029} 1468}
2030 1469
2031/* 1470/*
2032 * Do RCU core processing for the current CPU. 1471 * Do softirq processing for the current CPU.
2033 */ 1472 */
2034static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
2035{ 1474{
2036 struct rcu_state *rsp; 1475 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks();
2037 1479
2038 if (cpu_is_offline(smp_processor_id())) 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
2039 return; 1481 rcu_needs_cpu_flush();
2040 trace_rcu_utilization("Start RCU core");
2041 for_each_rcu_flavor(rsp)
2042 __rcu_process_callbacks(rsp);
2043 trace_rcu_utilization("End RCU core");
2044} 1482}
2045 1483
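The two sides of this hunk differ in how rcu_process_callbacks() visits the RCU flavors: one walks a registered list with for_each_rcu_flavor(), the other calls the rcu_sched, rcu_bh and rcu_preempt handlers by name. A minimal sketch of the list-plus-iteration-macro idiom, with invented demo_* names:

#include <stdio.h>

/* Illustrative only.  One side of the diff walks every RCU flavor with
 * for_each_rcu_flavor(); this sketch shows the same "registered list +
 * iteration macro" idiom with hypothetical demo_* names. */
struct demo_flavor {
	const char *name;
	struct demo_flavor *next;
};

static struct demo_flavor *demo_flavors;   /* head of the flavor list */

#define for_each_demo_flavor(f) \
	for ((f) = demo_flavors; (f) != NULL; (f) = (f)->next)

static void demo_register(struct demo_flavor *f)
{
	f->next = demo_flavors;
	demo_flavors = f;
}

static void demo_process_callbacks(struct demo_flavor *f)
{
	printf("processing callbacks for %s\n", f->name);
}

int main(void)
{
	static struct demo_flavor sched   = { "rcu_sched" };
	static struct demo_flavor bh      = { "rcu_bh" };
	static struct demo_flavor preempt = { "rcu_preempt" };
	struct demo_flavor *f;

	demo_register(&sched);
	demo_register(&bh);
	demo_register(&preempt);

	for_each_demo_flavor(f)            /* analogous to for_each_rcu_flavor() */
		demo_process_callbacks(f);
	return 0;
}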
2046/* 1484/*
2047 * Schedule RCU callback invocation. If the specified type of RCU
2048 * does not support RCU priority boosting, just do a direct call,
2049 * otherwise wake up the per-CPU kernel kthread. Note that because we
2050 * are running on the current CPU with interrupts disabled, the
2051 * rcu_cpu_kthread_task cannot disappear out from under us.
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
2052 */ 1489 */
2053static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2054{ 1491{
@@ -2066,22 +1503,38 @@ static void invoke_rcu_core(void)
2066 raise_softirq(RCU_SOFTIRQ); 1503 raise_softirq(RCU_SOFTIRQ);
2067} 1504}
2068 1505
2069/* 1506static void
2070 * Handle any core-RCU processing required by a call_rcu() invocation. 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2071 */ 1508 struct rcu_state *rsp)
2072static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2073 struct rcu_head *head, unsigned long flags)
2074{ 1509{
1510 unsigned long flags;
1511 struct rcu_data *rdp;
1512
1513 debug_rcu_head_queue(head);
1514 head->func = func;
1515 head->next = NULL;
1516
1517 smp_mb(); /* Ensure RCU update seen before callback registry. */
1518
2075 /* 1519 /*
2076 * If called from an extended quiescent state, invoke the RCU
2077 * core in order to force a re-evaluation of RCU's idleness.
1520 * Opportunistically note grace-period endings and beginnings.
1521 * Note that we might see a beginning right after we see an
1522 * end, but never vice versa, since this CPU has to pass through
1523 * a quiescent state betweentimes.
2078 */ 1524 */
2079 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 1525 local_irq_save(flags);
2080 invoke_rcu_core(); 1526 rdp = this_cpu_ptr(rsp->rda);
1527
1528 /* Add the callback to our list. */
1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
2081 1532
2082 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 1533 /* If interrupts were disabled, don't dive into RCU core. */
2083 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 1534 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags);
2084 return; 1536 return;
1537 }
2085 1538
2086 /* 1539 /*
2087 * Force the grace period if too many callbacks or too long waiting. 1540 * Force the grace period if too many callbacks or too long waiting.
@@ -2108,69 +1561,12 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2108 rdp->blimit = LONG_MAX; 1561 rdp->blimit = LONG_MAX;
2109 if (rsp->n_force_qs == rdp->n_force_qs_snap && 1562 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
2110 *rdp->nxttail[RCU_DONE_TAIL] != head) 1563 *rdp->nxttail[RCU_DONE_TAIL] != head)
2111 force_quiescent_state(rsp); 1564 force_quiescent_state(rsp, 0);
2112 rdp->n_force_qs_snap = rsp->n_force_qs; 1565 rdp->n_force_qs_snap = rsp->n_force_qs;
2113 rdp->qlen_last_fqs_check = rdp->qlen; 1566 rdp->qlen_last_fqs_check = rdp->qlen;
2114 } 1567 }
2115 } 1568 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
2116} 1569 force_quiescent_state(rsp, 1);
2117
2118/*
2119 * Helper function for call_rcu() and friends. The cpu argument will
2120 * normally be -1, indicating "currently running CPU". It may specify
2121 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2122 * is expected to specify a CPU.
2123 */
2124static void
2125__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2126 struct rcu_state *rsp, int cpu, bool lazy)
2127{
2128 unsigned long flags;
2129 struct rcu_data *rdp;
2130
2131 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2132 debug_rcu_head_queue(head);
2133 head->func = func;
2134 head->next = NULL;
2135
2136 /*
2137 * Opportunistically note grace-period endings and beginnings.
2138 * Note that we might see a beginning right after we see an
2139 * end, but never vice versa, since this CPU has to pass through
2140 * a quiescent state betweentimes.
2141 */
2142 local_irq_save(flags);
2143 rdp = this_cpu_ptr(rsp->rda);
2144
2145 /* Add the callback to our list. */
2146 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2147 int offline;
2148
2149 if (cpu != -1)
2150 rdp = per_cpu_ptr(rsp->rda, cpu);
2151 offline = !__call_rcu_nocb(rdp, head, lazy);
2152 WARN_ON_ONCE(offline);
2153 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2154 local_irq_restore(flags);
2155 return;
2156 }
2157 ACCESS_ONCE(rdp->qlen)++;
2158 if (lazy)
2159 rdp->qlen_lazy++;
2160 else
2161 rcu_idle_count_callbacks_posted();
2162 smp_mb(); /* Count before adding callback for rcu_barrier(). */
2163 *rdp->nxttail[RCU_NEXT_TAIL] = head;
2164 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
2165
2166 if (__is_kfree_rcu_offset((unsigned long)func))
2167 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
2168 rdp->qlen_lazy, rdp->qlen);
2169 else
2170 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
2171
2172 /* Go handle any RCU core processing required. */
2173 __call_rcu_core(rsp, rdp, head, flags);
2174 local_irq_restore(flags); 1570 local_irq_restore(flags);
2175} 1571}
2176 1572
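Both versions of __call_rcu() enqueue in the same basic way: the per-CPU callback list is a singly linked list of rcu_head structures, and nxttail[] records where each segment of that list ends as a pointer to a ->next field, so adding at RCU_NEXT_TAIL is two pointer stores with no list traversal. A self-contained sketch of that tail-pointer idiom (the demo_* names are hypothetical):

#include <stdio.h>
#include <stddef.h>

/* Illustrative only: enqueue through a tail pointer-to-pointer, as in
 * *rdp->nxttail[RCU_NEXT_TAIL] = head;
 *  rdp->nxttail[RCU_NEXT_TAIL] = &head->next;                        */
struct demo_head {
	struct demo_head *next;
	void (*func)(struct demo_head *h);
};

struct demo_cpu_data {
	struct demo_head *list;        /* like rdp->nxtlist */
	struct demo_head **tail;       /* like rdp->nxttail[RCU_NEXT_TAIL] */
	long qlen;
};

static void demo_init(struct demo_cpu_data *d)
{
	d->list = NULL;
	d->tail = &d->list;            /* empty list: tail points at the head pointer */
	d->qlen = 0;
}

static void demo_call(struct demo_cpu_data *d, struct demo_head *h,
		      void (*func)(struct demo_head *))
{
	h->func = func;
	h->next = NULL;
	*d->tail = h;                  /* link at the end, O(1) */
	d->tail = &h->next;            /* remember where the new end is */
	d->qlen++;
}

static void demo_invoke_all(struct demo_cpu_data *d)
{
	struct demo_head *h;

	while ((h = d->list) != NULL) {
		d->list = h->next;
		h->func(h);
		d->qlen--;
	}
	d->tail = &d->list;            /* list is empty again */
}

static void say_hi(struct demo_head *h) { printf("callback %p ran\n", (void *)h); }

int main(void)
{
	struct demo_cpu_data d;
	struct demo_head a, b;

	demo_init(&d);
	demo_call(&d, &a, say_hi);
	demo_call(&d, &b, say_hi);
	demo_invoke_all(&d);
	return 0;
}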
@@ -2179,39 +1575,19 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2179 */ 1575 */
2180void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1576void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2181{ 1577{
2182 __call_rcu(head, func, &rcu_sched_state, -1, 0); 1578 __call_rcu(head, func, &rcu_sched_state);
2183} 1579}
2184EXPORT_SYMBOL_GPL(call_rcu_sched); 1580EXPORT_SYMBOL_GPL(call_rcu_sched);
2185 1581
2186/* 1582/*
2187 * Queue an RCU callback for invocation after a quicker grace period.
1583 * Queue an RCU for invocation after a quicker grace period.
2188 */ 1584 */
2189void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1585void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2190{ 1586{
2191 __call_rcu(head, func, &rcu_bh_state, -1, 0); 1587 __call_rcu(head, func, &rcu_bh_state);
2192} 1588}
2193EXPORT_SYMBOL_GPL(call_rcu_bh); 1589EXPORT_SYMBOL_GPL(call_rcu_bh);
2194 1590
2195/*
2196 * Because a context switch is a grace period for RCU-sched and RCU-bh,
2197 * any blocking grace-period wait automatically implies a grace period
2198 * if there is only one CPU online at any point time during execution
2199 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
2200 * occasionally incorrectly indicate that there are multiple CPUs online
2201 * when there was in fact only one the whole time, as this just adds
2202 * some overhead: RCU still operates correctly.
2203 */
2204static inline int rcu_blocking_is_gp(void)
2205{
2206 int ret;
2207
2208 might_sleep(); /* Check for RCU read-side critical section. */
2209 preempt_disable();
2210 ret = num_online_cpus() <= 1;
2211 preempt_enable();
2212 return ret;
2213}
2214
2215/** 1591/**
2216 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 1592 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
2217 * 1593 *
@@ -2224,28 +1600,10 @@ static inline int rcu_blocking_is_gp(void)
2224 * rcu_read_lock_sched(). 1600 * rcu_read_lock_sched().
2225 * 1601 *
2226 * This means that all preempt_disable code sequences, including NMI and 1602 * This means that all preempt_disable code sequences, including NMI and
2227 * non-threaded hardware-interrupt handlers, in progress on entry will
2228 * have completed before this primitive returns. However, this does not
2229 * guarantee that softirq handlers will have completed, since in some
2230 * kernels, these handlers can run in process context, and can block.
1603 * hardware-interrupt handlers, in progress on entry will have completed
1604 * before this primitive returns. However, this does not guarantee that
1605 * softirq handlers will have completed, since in some kernels, these
1606 * handlers can run in process context, and can block.
2231 *
2232 * Note that this guarantee implies further memory-ordering guarantees.
2233 * On systems with more than one CPU, when synchronize_sched() returns,
2234 * each CPU is guaranteed to have executed a full memory barrier since the
2235 * end of its last RCU-sched read-side critical section whose beginning
2236 * preceded the call to synchronize_sched(). In addition, each CPU having
2237 * an RCU read-side critical section that extends beyond the return from
2238 * synchronize_sched() is guaranteed to have executed a full memory barrier
2239 * after the beginning of synchronize_sched() and before the beginning of
2240 * that RCU read-side critical section. Note that these guarantees include
2241 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2242 * that are executing in the kernel.
2243 *
2244 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2245 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2246 * to have executed a full memory barrier during the execution of
2247 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2248 * again only if the system has more than one CPU).
2249 * 1607 *
2250 * This primitive provides the guarantees made by the (now removed) 1608 * This primitive provides the guarantees made by the (now removed)
2251 * synchronize_kernel() API. In contrast, synchronize_rcu() only 1609 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2255,16 +1613,18 @@ static inline int rcu_blocking_is_gp(void)
2255 */ 1613 */
2256void synchronize_sched(void) 1614void synchronize_sched(void)
2257{ 1615{
2258 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 1616 struct rcu_synchronize rcu;
2259 !lock_is_held(&rcu_lock_map) && 1617
2260 !lock_is_held(&rcu_sched_lock_map),
2261 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2262 if (rcu_blocking_is_gp()) 1618 if (rcu_blocking_is_gp())
2263 return; 1619 return;
2264 if (rcu_expedited) 1620
2265 synchronize_sched_expedited(); 1621 init_rcu_head_on_stack(&rcu.head);
2266 else 1622 init_completion(&rcu.completion);
2267 wait_rcu_gp(call_rcu_sched); 1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
2268} 1628}
2269EXPORT_SYMBOL_GPL(synchronize_sched); 1629EXPORT_SYMBOL_GPL(synchronize_sched);
2270 1630
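The restored synchronize_sched() above waits for its own callback: it places a completion on the stack, queues wakeme_after_rcu() via call_rcu_sched(), and blocks in wait_for_completion(). A user-space sketch of the same pattern, assuming POSIX threads and semaphores; a helper thread stands in for the callback machinery and all demo_* names are invented.

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* Illustrative only: the "wait for my own callback" pattern used by the
 * older synchronize_sched()/synchronize_rcu_bh(): put a completion on
 * the stack, queue a callback that completes it, then block until it
 * has run.                                                             */
struct demo_synchronize {
	sem_t completion;                  /* like struct rcu_synchronize */
};

static void *demo_callback_thread(void *arg)
{
	struct demo_synchronize *s = arg;

	/* ...a real grace period would elapse here... */
	sem_post(&s->completion);          /* like wakeme_after_rcu() */
	return NULL;
}

static void demo_synchronize(void)
{
	struct demo_synchronize s;         /* lives on this stack frame */
	pthread_t t;

	sem_init(&s.completion, 0, 0);
	pthread_create(&t, NULL, demo_callback_thread, &s);  /* "call_rcu_sched()" */
	sem_wait(&s.completion);           /* wait_for_completion() */
	pthread_join(&t, NULL);
	sem_destroy(&s.completion);
}

int main(void)
{
	demo_synchronize();
	printf("grace period (simulated) complete\n");
	return 0;
}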
@@ -2276,181 +1636,23 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2276 * read-side critical sections have completed. RCU read-side critical 1636 * read-side critical sections have completed. RCU read-side critical
2277 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 1637 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2278 * and may be nested. 1638 * and may be nested.
2279 *
2280 * See the description of synchronize_sched() for more detailed information
2281 * on memory ordering guarantees.
2282 */ 1639 */
2283void synchronize_rcu_bh(void) 1640void synchronize_rcu_bh(void)
2284{ 1641{
2285 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 1642 struct rcu_synchronize rcu;
2286 !lock_is_held(&rcu_lock_map) && 1643
2287 !lock_is_held(&rcu_sched_lock_map),
2288 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2289 if (rcu_blocking_is_gp()) 1644 if (rcu_blocking_is_gp())
2290 return; 1645 return;
2291 if (rcu_expedited)
2292 synchronize_rcu_bh_expedited();
2293 else
2294 wait_rcu_gp(call_rcu_bh);
2295}
2296EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2297 1646
2298static int synchronize_sched_expedited_cpu_stop(void *data) 1647 init_rcu_head_on_stack(&rcu.head);
2299{ 1648 init_completion(&rcu.completion);
2300 /* 1649 /* Will wake me after RCU finished. */
2301 * There must be a full memory barrier on each affected CPU 1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
2302 * between the time that try_stop_cpus() is called and the 1651 /* Wait for it. */
2303 * time that it returns. 1652 wait_for_completion(&rcu.completion);
2304 * 1653 destroy_rcu_head_on_stack(&rcu.head);
2305 * In the current initial implementation of cpu_stop, the
2306 * above condition is already met when the control reaches
2307 * this point and the following smp_mb() is not strictly
2308 * necessary. Do smp_mb() anyway for documentation and
2309 * robustness against future implementation changes.
2310 */
2311 smp_mb(); /* See above comment block. */
2312 return 0;
2313} 1654}
2314 1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2315/**
2316 * synchronize_sched_expedited - Brute-force RCU-sched grace period
2317 *
2318 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
2319 * approach to force the grace period to end quickly. This consumes
2320 * significant time on all CPUs and is unfriendly to real-time workloads,
2321 * so is thus not recommended for any sort of common-case code. In fact,
2322 * if you are using synchronize_sched_expedited() in a loop, please
2323 * restructure your code to batch your updates, and then use a single
2324 * synchronize_sched() instead.
2325 *
2326 * Note that it is illegal to call this function while holding any lock
2327 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
2328 * to call this function from a CPU-hotplug notifier. Failing to observe
2329 * these restriction will result in deadlock.
2330 *
2331 * This implementation can be thought of as an application of ticket
2332 * locking to RCU, with sync_sched_expedited_started and
2333 * sync_sched_expedited_done taking on the roles of the halves
2334 * of the ticket-lock word. Each task atomically increments
2335 * sync_sched_expedited_started upon entry, snapshotting the old value,
2336 * then attempts to stop all the CPUs. If this succeeds, then each
2337 * CPU will have executed a context switch, resulting in an RCU-sched
2338 * grace period. We are then done, so we use atomic_cmpxchg() to
2339 * update sync_sched_expedited_done to match our snapshot -- but
2340 * only if someone else has not already advanced past our snapshot.
2341 *
2342 * On the other hand, if try_stop_cpus() fails, we check the value
2343 * of sync_sched_expedited_done. If it has advanced past our
2344 * initial snapshot, then someone else must have forced a grace period
2345 * some time after we took our snapshot. In this case, our work is
2346 * done for us, and we can simply return. Otherwise, we try again,
2347 * but keep our initial snapshot for purposes of checking for someone
2348 * doing our work for us.
2349 *
2350 * If we fail too many times in a row, we fall back to synchronize_sched().
2351 */
2352void synchronize_sched_expedited(void)
2353{
2354 long firstsnap, s, snap;
2355 int trycount = 0;
2356 struct rcu_state *rsp = &rcu_sched_state;
2357
2358 /*
2359 * If we are in danger of counter wrap, just do synchronize_sched().
2360 * By allowing sync_sched_expedited_started to advance no more than
2361 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2362 * that more than 3.5 billion CPUs would be required to force a
2363 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2364 * course be required on a 64-bit system.
2365 */
2366 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2367 (ulong)atomic_long_read(&rsp->expedited_done) +
2368 ULONG_MAX / 8)) {
2369 synchronize_sched();
2370 atomic_long_inc(&rsp->expedited_wrap);
2371 return;
2372 }
2373
2374 /*
2375 * Take a ticket. Note that atomic_inc_return() implies a
2376 * full memory barrier.
2377 */
2378 snap = atomic_long_inc_return(&rsp->expedited_start);
2379 firstsnap = snap;
2380 get_online_cpus();
2381 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2382
2383 /*
2384 * Each pass through the following loop attempts to force a
2385 * context switch on each CPU.
2386 */
2387 while (try_stop_cpus(cpu_online_mask,
2388 synchronize_sched_expedited_cpu_stop,
2389 NULL) == -EAGAIN) {
2390 put_online_cpus();
2391 atomic_long_inc(&rsp->expedited_tryfail);
2392
2393 /* Check to see if someone else did our work for us. */
2394 s = atomic_long_read(&rsp->expedited_done);
2395 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2396 /* ensure test happens before caller kfree */
2397 smp_mb__before_atomic_inc(); /* ^^^ */
2398 atomic_long_inc(&rsp->expedited_workdone1);
2399 return;
2400 }
2401
2402 /* No joy, try again later. Or just synchronize_sched(). */
2403 if (trycount++ < 10) {
2404 udelay(trycount * num_online_cpus());
2405 } else {
2406 wait_rcu_gp(call_rcu_sched);
2407 atomic_long_inc(&rsp->expedited_normal);
2408 return;
2409 }
2410
2411 /* Recheck to see if someone else did our work for us. */
2412 s = atomic_long_read(&rsp->expedited_done);
2413 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2414 /* ensure test happens before caller kfree */
2415 smp_mb__before_atomic_inc(); /* ^^^ */
2416 atomic_long_inc(&rsp->expedited_workdone2);
2417 return;
2418 }
2419
2420 /*
2421 * Refetching sync_sched_expedited_started allows later
2422 * callers to piggyback on our grace period. We retry
2423 * after they started, so our grace period works for them,
2424 * and they started after our first try, so their grace
2425 * period works for us.
2426 */
2427 get_online_cpus();
2428 snap = atomic_long_read(&rsp->expedited_start);
2429 smp_mb(); /* ensure read is before try_stop_cpus(). */
2430 }
2431 atomic_long_inc(&rsp->expedited_stoppedcpus);
2432
2433 /*
2434 * Everyone up to our most recent fetch is covered by our grace
2435 * period. Update the counter, but only if our work is still
2436 * relevant -- which it won't be if someone who started later
2437 * than we did already did their update.
2438 */
2439 do {
2440 atomic_long_inc(&rsp->expedited_done_tries);
2441 s = atomic_long_read(&rsp->expedited_done);
2442 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2443 /* ensure test happens before caller kfree */
2444 smp_mb__before_atomic_inc(); /* ^^^ */
2445 atomic_long_inc(&rsp->expedited_done_lost);
2446 break;
2447 }
2448 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2449 atomic_long_inc(&rsp->expedited_done_exit);
2450
2451 put_online_cpus();
2452}
2453EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
2454 1656
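The deleted synchronize_sched_expedited() comment above describes a ticket-lock-like scheme built from a "started" and a "done" counter. The sketch below reproduces just that counter logic with C11 atomics; demo_try_stop_cpus() is a stand-in that fails twice and then succeeds, and all demo_* names are invented.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: the started/done ticket scheme from the comment
 * above.  Real code races against other callers; this single-threaded
 * sketch only shows the bookkeeping.                                   */
static atomic_long demo_started;
static atomic_long demo_done;

static bool demo_try_stop_cpus(void)
{
	static int attempts;
	return ++attempts >= 3;            /* pretend the third attempt succeeds */
}

static void demo_expedited(void)
{
	long firstsnap, snap, s;

	snap = atomic_fetch_add(&demo_started, 1) + 1;   /* take a ticket */
	firstsnap = snap;

	while (!demo_try_stop_cpus()) {
		/* Did a later caller's grace period already cover our ticket? */
		if (atomic_load(&demo_done) >= firstsnap) {
			printf("someone else did our work (done=%ld)\n",
			       atomic_load(&demo_done));
			return;
		}
		/* Refresh the ticket so callers that arrived during our retry
		 * can piggyback on the grace period we are about to force.    */
		snap = atomic_load(&demo_started);
	}

	/* We forced the "grace period": advance the done counter to our
	 * snapshot, unless someone already pushed it further along.        */
	s = atomic_load(&demo_done);
	while (s < snap &&
	       !atomic_compare_exchange_weak(&demo_done, &s, snap))
		;                          /* s is reloaded by a failed CAS */
	printf("expedited grace period done, counter now %ld\n",
	       atomic_load(&demo_done));
}

int main(void)
{
	demo_expedited();
	return 0;
}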
2455/* 1657/*
2456 * Check to see if there is any immediate RCU-related work to be done 1658 * Check to see if there is any immediate RCU-related work to be done
@@ -2469,10 +1671,19 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2469 check_cpu_stall(rsp, rdp); 1671 check_cpu_stall(rsp, rdp);
2470 1672
2471 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1673 /* Is the RCU core waiting for a quiescent state from this CPU? */
2472 if (rcu_scheduler_fully_active && 1674 if (rdp->qs_pending && !rdp->passed_quiesc) {
2473 rdp->qs_pending && !rdp->passed_quiesce) { 1675
1676 /*
1677 * If force_quiescent_state() coming soon and this CPU
1678 * needs a quiescent state, and this is either RCU-sched
1679 * or RCU-bh, force a local reschedule.
1680 */
2474 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
2475 } else if (rdp->qs_pending && rdp->passed_quiesce) { 1682 if (!rdp->preemptible &&
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies))
1685 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) {
2476 rdp->n_rp_report_qs++; 1687 rdp->n_rp_report_qs++;
2477 return 1; 1688 return 1;
2478 } 1689 }
@@ -2501,6 +1712,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2501 return 1; 1712 return 1;
2502 } 1713 }
2503 1714
1715 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1716 if (rcu_gp_in_progress(rsp) &&
1717 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1718 rdp->n_rp_need_fqs++;
1719 return 1;
1720 }
1721
2504 /* nothing to do */ 1722 /* nothing to do */
2505 rdp->n_rp_need_nothing++; 1723 rdp->n_rp_need_nothing++;
2506 return 0; 1724 return 0;
@@ -2513,12 +1731,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2513 */ 1731 */
2514static int rcu_pending(int cpu) 1732static int rcu_pending(int cpu)
2515{ 1733{
2516 struct rcu_state *rsp; 1734 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
2517 1735 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
2518 for_each_rcu_flavor(rsp) 1736 rcu_preempt_pending(cpu);
2519 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2520 return 1;
2521 return 0;
2522} 1737}
2523 1738
2524/* 1739/*
@@ -2526,43 +1741,23 @@ static int rcu_pending(int cpu)
2526 * by the current CPU, even if none need be done immediately, returning 1741 * by the current CPU, even if none need be done immediately, returning
2527 * 1 if so. 1742 * 1 if so.
2528 */ 1743 */
2529static int rcu_cpu_has_callbacks(int cpu) 1744static int rcu_needs_cpu_quick_check(int cpu)
2530{ 1745{
2531 struct rcu_state *rsp;
2532
2533 /* RCU callbacks either ready or pending? */ 1746 /* RCU callbacks either ready or pending? */
2534 for_each_rcu_flavor(rsp) 1747 return per_cpu(rcu_sched_data, cpu).nxtlist ||
2535 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 1748 per_cpu(rcu_bh_data, cpu).nxtlist ||
2536 return 1; 1749 rcu_preempt_needs_cpu(cpu);
2537 return 0;
2538} 1750}
2539 1751
2540/* 1752static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
2541 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 1753static atomic_t rcu_barrier_cpu_count;
2542 * the compiler is expected to optimize this away. 1754static DEFINE_MUTEX(rcu_barrier_mutex);
2543 */ 1755static struct completion rcu_barrier_completion;
2544static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2545 int cpu, unsigned long done)
2546{
2547 trace_rcu_barrier(rsp->name, s, cpu,
2548 atomic_read(&rsp->barrier_cpu_count), done);
2549}
2550 1756
2551/* 1757static void rcu_barrier_callback(struct rcu_head *notused)
2552 * RCU callback function for _rcu_barrier(). If we are last, wake
2553 * up the task executing _rcu_barrier().
2554 */
2555static void rcu_barrier_callback(struct rcu_head *rhp)
2556{ 1758{
2557 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); 1759 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2558 struct rcu_state *rsp = rdp->rsp; 1760 complete(&rcu_barrier_completion);
2559
2560 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2561 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2562 complete(&rsp->barrier_completion);
2563 } else {
2564 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2565 }
2566} 1761}
2567 1762
2568/* 1763/*
@@ -2570,116 +1765,45 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
2570 */ 1765 */
2571static void rcu_barrier_func(void *type) 1766static void rcu_barrier_func(void *type)
2572{ 1767{
2573 struct rcu_state *rsp = type; 1768 int cpu = smp_processor_id();
2574 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1769 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1770 void (*call_rcu_func)(struct rcu_head *head,
1771 void (*func)(struct rcu_head *head));
2575 1772
2576 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 1773 atomic_inc(&rcu_barrier_cpu_count);
2577 atomic_inc(&rsp->barrier_cpu_count); 1774 call_rcu_func = type;
2578 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 1775 call_rcu_func(head, rcu_barrier_callback);
2579} 1776}
2580 1777
2581/* 1778/*
2582 * Orchestrate the specified type of RCU barrier, waiting for all 1779 * Orchestrate the specified type of RCU barrier, waiting for all
2583 * RCU callbacks of the specified type to complete. 1780 * RCU callbacks of the specified type to complete.
2584 */ 1781 */
2585static void _rcu_barrier(struct rcu_state *rsp) 1782static void _rcu_barrier(struct rcu_state *rsp,
1783 void (*call_rcu_func)(struct rcu_head *head,
1784 void (*func)(struct rcu_head *head)))
2586{ 1785{
2587 int cpu; 1786 BUG_ON(in_interrupt());
2588 struct rcu_data *rdp;
2589 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2590 unsigned long snap_done;
2591
2592 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2593
2594 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 1787 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2595 mutex_lock(&rsp->barrier_mutex); 1788 mutex_lock(&rcu_barrier_mutex);
2596 1789 init_completion(&rcu_barrier_completion);
2597 /* 1790 /*
2598 * Ensure that all prior references, including to ->n_barrier_done,
2599 * are ordered before the _rcu_barrier() machinery.
1791 * Initialize rcu_barrier_cpu_count to 1, then invoke
1792 * rcu_barrier_func() on each CPU, so that each CPU also has
1793 * incremented rcu_barrier_cpu_count. Only then is it safe to
1794 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1795 * might complete its grace period before all of the other CPUs
1796 * did their increment, causing this function to return too
1797 * early. Note that on_each_cpu() disables irqs, which prevents
1798 * any CPUs from coming online or going offline until each online
1799 * CPU has queued its RCU-barrier callback.
2600 */ 1800 */
2601 smp_mb(); /* See above block comment. */ 1801 atomic_set(&rcu_barrier_cpu_count, 1);
2602 1802 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
2603 /* 1803 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
2604 * Recheck ->n_barrier_done to see if others did our work for us. 1804 complete(&rcu_barrier_completion);
2605 * This means checking ->n_barrier_done for an even-to-odd-to-even 1805 wait_for_completion(&rcu_barrier_completion);
2606 * transition. The "if" expression below therefore rounds the old 1806 mutex_unlock(&rcu_barrier_mutex);
2607 * value up to the next even number and adds two before comparing.
2608 */
2609 snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2610 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2611 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2612 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2613 smp_mb(); /* caller's subsequent code after above check. */
2614 mutex_unlock(&rsp->barrier_mutex);
2615 return;
2616 }
2617
2618 /*
2619 * Increment ->n_barrier_done to avoid duplicate work. Use
2620 * ACCESS_ONCE() to prevent the compiler from speculating
2621 * the increment to precede the early-exit check.
2622 */
2623 ACCESS_ONCE(rsp->n_barrier_done)++;
2624 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2625 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2626 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2627
2628 /*
2629 * Initialize the count to one rather than to zero in order to
2630 * avoid a too-soon return to zero in case of a short grace period
2631 * (or preemption of this task). Exclude CPU-hotplug operations
2632 * to ensure that no offline CPU has callbacks queued.
2633 */
2634 init_completion(&rsp->barrier_completion);
2635 atomic_set(&rsp->barrier_cpu_count, 1);
2636 get_online_cpus();
2637
2638 /*
2639 * Force each CPU with callbacks to register a new callback.
2640 * When that callback is invoked, we will know that all of the
2641 * corresponding CPU's preceding callbacks have been invoked.
2642 */
2643 for_each_possible_cpu(cpu) {
2644 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2645 continue;
2646 rdp = per_cpu_ptr(rsp->rda, cpu);
2647 if (is_nocb_cpu(cpu)) {
2648 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2649 rsp->n_barrier_done);
2650 atomic_inc(&rsp->barrier_cpu_count);
2651 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2652 rsp, cpu, 0);
2653 } else if (ACCESS_ONCE(rdp->qlen)) {
2654 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2655 rsp->n_barrier_done);
2656 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2657 } else {
2658 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2659 rsp->n_barrier_done);
2660 }
2661 }
2662 put_online_cpus();
2663
2664 /*
2665 * Now that we have an rcu_barrier_callback() callback on each
2666 * CPU, and thus each counted, remove the initial count.
2667 */
2668 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2669 complete(&rsp->barrier_completion);
2670
2671 /* Increment ->n_barrier_done to prevent duplicate work. */
2672 smp_mb(); /* Keep increment after above mechanism. */
2673 ACCESS_ONCE(rsp->n_barrier_done)++;
2674 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2675 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2676 smp_mb(); /* Keep increment before caller's subsequent code. */
2677
2678 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2679 wait_for_completion(&rsp->barrier_completion);
2680
2681 /* Other rcu_barrier() invocations can now safely proceed. */
2682 mutex_unlock(&rsp->barrier_mutex);
2683} 1807}
2684 1808
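The comment inside the restored _rcu_barrier() explains why rcu_barrier_cpu_count starts at 1: the extra reference keeps early callbacks from driving the count to zero before every CPU has registered one. A pthread-based sketch of that counting scheme, with a semaphore standing in for the completion and invented demo_* names:

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative only: the _rcu_barrier() counting scheme.  The count
 * starts at 1 so it cannot hit zero before every CPU (thread, here)
 * has a registered barrier callback; the orchestrator drops that
 * initial reference last and then waits.                              */
#define DEMO_NCPU 4

static atomic_int demo_count;
static sem_t demo_completion;

static void demo_barrier_callback(void)        /* like rcu_barrier_callback() */
{
	if (atomic_fetch_sub(&demo_count, 1) == 1)
		sem_post(&demo_completion);        /* last one completes the barrier */
}

static void *demo_cpu(void *unused)
{
	(void)unused;
	/* ...this CPU's earlier callbacks would be invoked here... */
	demo_barrier_callback();                   /* then its barrier callback runs */
	return NULL;
}

static void demo_barrier(void)
{
	pthread_t t[DEMO_NCPU];
	int i;

	sem_init(&demo_completion, 0, 0);
	atomic_store(&demo_count, 1);              /* orchestrator's own reference */

	/* on_each_cpu() returns only after every CPU took its reference;
	 * model that by taking all references before the threads start.  */
	for (i = 0; i < DEMO_NCPU; i++)
		atomic_fetch_add(&demo_count, 1);
	for (i = 0; i < DEMO_NCPU; i++)
		pthread_create(&t[i], NULL, demo_cpu, NULL);

	demo_barrier_callback();                   /* drop the initial reference */
	sem_wait(&demo_completion);                /* every callback has run */
	for (i = 0; i < DEMO_NCPU; i++)
		pthread_join(&t[i], NULL);
	sem_destroy(&demo_completion);
}

int main(void)
{
	demo_barrier();
	printf("all previously queued callbacks have been invoked\n");
	return 0;
}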
2685/** 1809/**
@@ -2687,7 +1811,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
2687 */ 1811 */
2688void rcu_barrier_bh(void) 1812void rcu_barrier_bh(void)
2689{ 1813{
2690 _rcu_barrier(&rcu_bh_state); 1814 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
2691} 1815}
2692EXPORT_SYMBOL_GPL(rcu_barrier_bh); 1816EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2693 1817
@@ -2696,7 +1820,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2696 */ 1820 */
2697void rcu_barrier_sched(void) 1821void rcu_barrier_sched(void)
2698{ 1822{
2699 _rcu_barrier(&rcu_sched_state); 1823 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
2700} 1824}
2701EXPORT_SYMBOL_GPL(rcu_barrier_sched); 1825EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2702 1826
@@ -2707,24 +1831,21 @@ static void __init
2707rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 1831rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2708{ 1832{
2709 unsigned long flags; 1833 unsigned long flags;
1834 int i;
2710 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1835 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2711 struct rcu_node *rnp = rcu_get_root(rsp); 1836 struct rcu_node *rnp = rcu_get_root(rsp);
2712 1837
2713 /* Set up local state, ensuring consistent view of global state. */ 1838 /* Set up local state, ensuring consistent view of global state. */
2714 raw_spin_lock_irqsave(&rnp->lock, flags); 1839 raw_spin_lock_irqsave(&rnp->lock, flags);
2715 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1840 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2716 init_callback_list(rdp); 1841 rdp->nxtlist = NULL;
2717 rdp->qlen_lazy = 0; 1842 for (i = 0; i < RCU_NEXT_SIZE; i++)
2718 ACCESS_ONCE(rdp->qlen) = 0; 1843 rdp->nxttail[i] = &rdp->nxtlist;
1844 rdp->qlen = 0;
1845#ifdef CONFIG_NO_HZ
2719 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2720 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 1847#endif /* #ifdef CONFIG_NO_HZ */
2721 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2722#ifdef CONFIG_RCU_USER_QS
2723 WARN_ON_ONCE(rdp->dynticks->in_user);
2724#endif
2725 rdp->cpu = cpu; 1848 rdp->cpu = cpu;
2726 rdp->rsp = rsp;
2727 rcu_boot_init_nocb_percpu_data(rdp);
2728 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1849 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2729} 1850}
2730 1851
@@ -2742,23 +1863,25 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2742 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1863 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2743 struct rcu_node *rnp = rcu_get_root(rsp); 1864 struct rcu_node *rnp = rcu_get_root(rsp);
2744 1865
2745 /* Exclude new grace periods. */
2746 mutex_lock(&rsp->onoff_mutex);
2747
2748 /* Set up local state, ensuring consistent view of global state. */ 1866 /* Set up local state, ensuring consistent view of global state. */
2749 raw_spin_lock_irqsave(&rnp->lock, flags); 1867 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
2750 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
2751 rdp->preemptible = preemptible; 1871 rdp->preemptible = preemptible;
2752 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
2753 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
2754 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
2755 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2756 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2757 atomic_set(&rdp->dynticks->dynticks,
2758 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2759 rcu_prepare_for_idle_init(cpu);
2760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1875 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2761 1876
1877 /*
1878 * A new grace period might start here. If so, we won't be part
1879 * of it, but that is OK, as we are currently in a quiescent state.
1880 */
1881
1882 /* Exclude any attempts to start a new GP on large systems. */
1883 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1884
2762 /* Add CPU to rcu_node bitmasks. */ 1885 /* Add CPU to rcu_node bitmasks. */
2763 rnp = rdp->mynode; 1886 rnp = rdp->mynode;
2764 mask = rdp->grpmask; 1887 mask = rdp->grpmask;
@@ -2768,32 +1891,22 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2768 rnp->qsmaskinit |= mask; 1891 rnp->qsmaskinit |= mask;
2769 mask = rnp->grpmask; 1892 mask = rnp->grpmask;
2770 if (rnp == rdp->mynode) { 1893 if (rnp == rdp->mynode) {
2771 /* 1894 rdp->gpnum = rnp->completed; /* if GP in progress... */
2772 * If there is a grace period in progress, we will
2773 * set up to wait for it next time we run the
2774 * RCU core code.
2775 */
2776 rdp->gpnum = rnp->completed;
2777 rdp->completed = rnp->completed; 1895 rdp->completed = rnp->completed;
2778 rdp->passed_quiesce = 0; 1896 rdp->passed_quiesc_completed = rnp->completed - 1;
2779 rdp->qs_pending = 0;
2780 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
2781 } 1897 }
2782 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2783 rnp = rnp->parent; 1899 rnp = rnp->parent;
2784 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1900 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2785 local_irq_restore(flags);
2786 1901
2787 mutex_unlock(&rsp->onoff_mutex); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2788} 1903}
2789 1904
2790static void __cpuinit rcu_prepare_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
2791{ 1906{
2792 struct rcu_state *rsp; 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
2793 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
2794 for_each_rcu_flavor(rsp) 1909 rcu_preempt_init_percpu_data(cpu);
2795 rcu_init_percpu_data(cpu, rsp,
2796 strcmp(rsp->name, "rcu_preempt") == 0);
2797} 1910}
2798 1911
2799/* 1912/*
@@ -2805,10 +1918,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2805 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
2806 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2807 struct rcu_node *rnp = rdp->mynode; 1920 struct rcu_node *rnp = rdp->mynode;
2808 struct rcu_state *rsp;
2809 int ret = NOTIFY_OK;
2810 1921
2811 trace_rcu_utilization("Start CPU hotplug");
2812 switch (action) { 1922 switch (action) {
2813 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
2814 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
@@ -2817,13 +1927,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2817 break; 1927 break;
2818 case CPU_ONLINE: 1928 case CPU_ONLINE:
2819 case CPU_DOWN_FAILED: 1929 case CPU_DOWN_FAILED:
2820 rcu_boost_kthread_setaffinity(rnp, -1); 1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
2821 break; 1932 break;
2822 case CPU_DOWN_PREPARE: 1933 case CPU_DOWN_PREPARE:
2823 if (nocb_cpu_expendable(cpu)) 1934 rcu_node_kthread_setaffinity(rnp, cpu);
2824 rcu_boost_kthread_setaffinity(rnp, cpu); 1935 rcu_cpu_kthread_setrt(cpu, 0);
2825 else
2826 ret = NOTIFY_BAD;
2827 break; 1936 break;
2828 case CPU_DYING: 1937 case CPU_DYING:
2829 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
@@ -2832,46 +1941,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2832 * touch any data without introducing corruption. We send the 1941 * touch any data without introducing corruption. We send the
2833 * dying CPU's callbacks to an arbitrarily chosen online CPU. 1942 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2834 */ 1943 */
2835 for_each_rcu_flavor(rsp) 1944 rcu_send_cbs_to_online(&rcu_bh_state);
2836 rcu_cleanup_dying_cpu(rsp); 1945 rcu_send_cbs_to_online(&rcu_sched_state);
2837 rcu_cleanup_after_idle(cpu); 1946 rcu_preempt_send_cbs_to_online();
2838 break; 1947 break;
2839 case CPU_DEAD: 1948 case CPU_DEAD:
2840 case CPU_DEAD_FROZEN: 1949 case CPU_DEAD_FROZEN:
2841 case CPU_UP_CANCELED: 1950 case CPU_UP_CANCELED:
2842 case CPU_UP_CANCELED_FROZEN: 1951 case CPU_UP_CANCELED_FROZEN:
2843 for_each_rcu_flavor(rsp) 1952 rcu_offline_cpu(cpu);
2844 rcu_cleanup_dead_cpu(cpu, rsp);
2845 break; 1953 break;
2846 default: 1954 default:
2847 break; 1955 break;
2848 } 1956 }
2849 trace_rcu_utilization("End CPU hotplug"); 1957 return NOTIFY_OK;
2850 return ret;
2851}
2852
2853/*
2854 * Spawn the kthread that handles this RCU flavor's grace periods.
2855 */
2856static int __init rcu_spawn_gp_kthread(void)
2857{
2858 unsigned long flags;
2859 struct rcu_node *rnp;
2860 struct rcu_state *rsp;
2861 struct task_struct *t;
2862
2863 for_each_rcu_flavor(rsp) {
2864 t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
2865 BUG_ON(IS_ERR(t));
2866 rnp = rcu_get_root(rsp);
2867 raw_spin_lock_irqsave(&rnp->lock, flags);
2868 rsp->gp_kthread = t;
2869 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2870 rcu_spawn_nocb_kthreads(rsp);
2871 }
2872 return 0;
2873} 1958}
2874early_initcall(rcu_spawn_gp_kthread);
2875 1959
2876/* 1960/*
2877 * This function is invoked towards the end of the scheduler's initialization 1961 * This function is invoked towards the end of the scheduler's initialization
@@ -2897,9 +1981,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2897{ 1981{
2898 int i; 1982 int i;
2899 1983
2900 for (i = rcu_num_lvls - 1; i > 0; i--) 1984 for (i = NUM_RCU_LVLS - 1; i > 0; i--)
2901 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 1985 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2902 rsp->levelspread[0] = rcu_fanout_leaf; 1986 rsp->levelspread[0] = RCU_FANOUT_LEAF;
2903} 1987}
2904#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 1988#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2905static void __init rcu_init_levelspread(struct rcu_state *rsp) 1989static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2908,8 +1992,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2908 int cprv; 1992 int cprv;
2909 int i; 1993 int i;
2910 1994
2911 cprv = nr_cpu_ids; 1995 cprv = NR_CPUS;
2912 for (i = rcu_num_lvls - 1; i >= 0; i--) { 1996 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
2913 ccur = rsp->levelcnt[i]; 1997 ccur = rsp->levelcnt[i];
2914 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 1998 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2915 cprv = ccur; 1999 cprv = ccur;
@@ -2923,14 +2007,10 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2923static void __init rcu_init_one(struct rcu_state *rsp, 2007static void __init rcu_init_one(struct rcu_state *rsp,
2924 struct rcu_data __percpu *rda) 2008 struct rcu_data __percpu *rda)
2925{ 2009{
2926 static char *buf[] = { "rcu_node_0", 2010 static char *buf[] = { "rcu_node_level_0",
2927 "rcu_node_1", 2011 "rcu_node_level_1",
2928 "rcu_node_2", 2012 "rcu_node_level_2",
2929 "rcu_node_3" }; /* Match MAX_RCU_LVLS */ 2013 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
2930 static char *fqs[] = { "rcu_node_fqs_0",
2931 "rcu_node_fqs_1",
2932 "rcu_node_fqs_2",
2933 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
2934 int cpustride = 1; 2014 int cpustride = 1;
2935 int i; 2015 int i;
2936 int j; 2016 int j;
@@ -2940,26 +2020,20 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2940 2020
2941 /* Initialize the level-tracking arrays. */ 2021 /* Initialize the level-tracking arrays. */
2942 2022
2943 for (i = 0; i < rcu_num_lvls; i++) 2023 for (i = 1; i < NUM_RCU_LVLS; i++)
2944 rsp->levelcnt[i] = num_rcu_lvl[i];
2945 for (i = 1; i < rcu_num_lvls; i++)
2946 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2024 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2947 rcu_init_levelspread(rsp); 2025 rcu_init_levelspread(rsp);
2948 2026
2949 /* Initialize the elements themselves, starting from the leaves. */ 2027 /* Initialize the elements themselves, starting from the leaves. */
2950 2028
2951 for (i = rcu_num_lvls - 1; i >= 0; i--) { 2029 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
2952 cpustride *= rsp->levelspread[i]; 2030 cpustride *= rsp->levelspread[i];
2953 rnp = rsp->level[i]; 2031 rnp = rsp->level[i];
2954 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2032 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
2955 raw_spin_lock_init(&rnp->lock); 2033 raw_spin_lock_init(&rnp->lock);
2956 lockdep_set_class_and_name(&rnp->lock, 2034 lockdep_set_class_and_name(&rnp->lock,
2957 &rcu_node_class[i], buf[i]); 2035 &rcu_node_class[i], buf[i]);
2958 raw_spin_lock_init(&rnp->fqslock); 2036 rnp->gpnum = 0;
2959 lockdep_set_class_and_name(&rnp->fqslock,
2960 &rcu_fqs_class[i], fqs[i]);
2961 rnp->gpnum = rsp->gpnum;
2962 rnp->completed = rsp->completed;
2963 rnp->qsmask = 0; 2037 rnp->qsmask = 0;
2964 rnp->qsmaskinit = 0; 2038 rnp->qsmaskinit = 0;
2965 rnp->grplo = j * cpustride; 2039 rnp->grplo = j * cpustride;
@@ -2982,76 +2056,13 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2982 } 2056 }
2983 2057
2984 rsp->rda = rda; 2058 rsp->rda = rda;
2985 init_waitqueue_head(&rsp->gp_wq); 2059 rnp = rsp->level[NUM_RCU_LVLS - 1];
2986 rnp = rsp->level[rcu_num_lvls - 1];
2987 for_each_possible_cpu(i) { 2060 for_each_possible_cpu(i) {
2988 while (i > rnp->grphi) 2061 while (i > rnp->grphi)
2989 rnp++; 2062 rnp++;
2990 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2063 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2991 rcu_boot_init_percpu_data(i, rsp); 2064 rcu_boot_init_percpu_data(i, rsp);
2992 } 2065 }
2993 list_add(&rsp->flavors, &rcu_struct_flavors);
2994}
2995
2996/*
2997 * Compute the rcu_node tree geometry from kernel parameters. This cannot
2998 * replace the definitions in rcutree.h because those are needed to size
2999 * the ->node array in the rcu_state structure.
3000 */
3001static void __init rcu_init_geometry(void)
3002{
3003 int i;
3004 int j;
3005 int n = nr_cpu_ids;
3006 int rcu_capacity[MAX_RCU_LVLS + 1];
3007
3008 /* If the compile-time values are accurate, just leave. */
3009 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3010 nr_cpu_ids == NR_CPUS)
3011 return;
3012
3013 /*
3014 * Compute number of nodes that can be handled an rcu_node tree
3015 * with the given number of levels. Setting rcu_capacity[0] makes
3016 * some of the arithmetic easier.
3017 */
3018 rcu_capacity[0] = 1;
3019 rcu_capacity[1] = rcu_fanout_leaf;
3020 for (i = 2; i <= MAX_RCU_LVLS; i++)
3021 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
3022
3023 /*
3024 * The boot-time rcu_fanout_leaf parameter is only permitted
3025 * to increase the leaf-level fanout, not decrease it. Of course,
3026 * the leaf-level fanout cannot exceed the number of bits in
3027 * the rcu_node masks. Finally, the tree must be able to accommodate
3028 * the configured number of CPUs. Complain and fall back to the
3029 * compile-time values if these limits are exceeded.
3030 */
3031 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
3032 rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
3033 n > rcu_capacity[MAX_RCU_LVLS]) {
3034 WARN_ON(1);
3035 return;
3036 }
3037
3038 /* Calculate the number of rcu_nodes at each level of the tree. */
3039 for (i = 1; i <= MAX_RCU_LVLS; i++)
3040 if (n <= rcu_capacity[i]) {
3041 for (j = 0; j <= i; j++)
3042 num_rcu_lvl[j] =
3043 DIV_ROUND_UP(n, rcu_capacity[i - j]);
3044 rcu_num_lvls = i;
3045 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
3046 num_rcu_lvl[j] = 0;
3047 break;
3048 }
3049
3050 /* Calculate the total number of rcu_node structures. */
3051 rcu_num_nodes = 0;
3052 for (i = 0; i <= MAX_RCU_LVLS; i++)
3053 rcu_num_nodes += num_rcu_lvl[i];
3054 rcu_num_nodes -= n;
3055} 2066}
3056 2067
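The removed rcu_init_geometry() sizes the rcu_node tree at boot from nr_cpu_ids, the leaf fanout, and CONFIG_RCU_FANOUT: it first computes how many CPUs a tree of each depth can cover, then takes ceilings level by level. A stand-alone sketch of the same arithmetic, with parameters hard-coded for illustration and demo_* names invented:

#include <stdio.h>

#define DEMO_MAX_LVLS 4

/* Illustrative only: size an rcu_node-style tree for n CPUs, given the
 * leaf fanout and the interior fanout, roughly as rcu_init_geometry()
 * does (the kernel's extra leaf-CPU pseudo-level is omitted here).     */
static void demo_geometry(int n, int fanout_leaf, int fanout)
{
	long capacity[DEMO_MAX_LVLS + 1];
	int levels = 0, nodes = 0;
	int i, j;

	/* capacity[i]: how many CPUs an i-level tree can cover */
	capacity[0] = 1;
	capacity[1] = fanout_leaf;
	for (i = 2; i <= DEMO_MAX_LVLS; i++)
		capacity[i] = capacity[i - 1] * fanout;

	for (i = 1; i <= DEMO_MAX_LVLS; i++) {
		if (n > capacity[i])
			continue;              /* need more levels */
		levels = i;
		for (j = 0; j < i; j++) {
			/* nodes at depth j = ceil(n / capacity[i - j]) */
			int lvl = (int)((n + capacity[i - j] - 1) / capacity[i - j]);
			printf("level %d: %d node(s)\n", j, lvl);
			nodes += lvl;
		}
		break;
	}
	printf("%d CPUs, fanout %d/%d -> %d level(s), %d rcu_node(s)\n",
	       n, fanout_leaf, fanout, levels, nodes);
}

int main(void)
{
	demo_geometry(64, 16, 16);     /* e.g. one root plus four leaves = 5 nodes */
	demo_geometry(4096, 16, 64);
	return 0;
}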
3057void __init rcu_init(void) 2068void __init rcu_init(void)
@@ -3059,11 +2070,9 @@ void __init rcu_init(void)
3059 int cpu; 2070 int cpu;
3060 2071
3061 rcu_bootup_announce(); 2072 rcu_bootup_announce();
3062 rcu_init_geometry();
3063 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3064 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3065 __rcu_init_preempt(); 2075 __rcu_init_preempt();
3066 rcu_init_nocb();
3067 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3068 2077
3069 /* 2078 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093..01b2ccda26f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -29,41 +29,45 @@
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30 30
31/* 31/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * CONFIG_RCU_FANOUT_LEAF.
34 * In theory, it should be possible to add more levels straightforwardly. 33 * In theory, it should be possible to add more levels straightforwardly.
35 * In practice, this did work well going from three levels to four. 34 * In practice, this did work well going from three levels to four.
36 * Of course, your mileage may vary. 35 * Of course, your mileage may vary.
37 */ 36 */
38#define MAX_RCU_LVLS 4 37#define MAX_RCU_LVLS 4
39#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) 38#if CONFIG_RCU_FANOUT > 16
39#define RCU_FANOUT_LEAF 16
40#else /* #if CONFIG_RCU_FANOUT > 16 */
41#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
42#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
43#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
40#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) 44#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
41#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) 45#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 46#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
43 47
44#if NR_CPUS <= RCU_FANOUT_1 48#if NR_CPUS <= RCU_FANOUT_1
45# define RCU_NUM_LVLS 1 49# define NUM_RCU_LVLS 1
46# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
47# define NUM_RCU_LVL_1 (NR_CPUS) 51# define NUM_RCU_LVL_1 (NR_CPUS)
48# define NUM_RCU_LVL_2 0 52# define NUM_RCU_LVL_2 0
49# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
50# define NUM_RCU_LVL_4 0 54# define NUM_RCU_LVL_4 0
51#elif NR_CPUS <= RCU_FANOUT_2 55#elif NR_CPUS <= RCU_FANOUT_2
52# define RCU_NUM_LVLS 2 56# define NUM_RCU_LVLS 2
53# define NUM_RCU_LVL_0 1 57# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 58# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
55# define NUM_RCU_LVL_2 (NR_CPUS) 59# define NUM_RCU_LVL_2 (NR_CPUS)
56# define NUM_RCU_LVL_3 0 60# define NUM_RCU_LVL_3 0
57# define NUM_RCU_LVL_4 0 61# define NUM_RCU_LVL_4 0
58#elif NR_CPUS <= RCU_FANOUT_3 62#elif NR_CPUS <= RCU_FANOUT_3
59# define RCU_NUM_LVLS 3 63# define NUM_RCU_LVLS 3
60# define NUM_RCU_LVL_0 1 64# define NUM_RCU_LVL_0 1
61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 65# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 66# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
63# define NUM_RCU_LVL_3 (NR_CPUS) 67# define NUM_RCU_LVL_3 (NR_CPUS)
64# define NUM_RCU_LVL_4 0 68# define NUM_RCU_LVL_4 0
65#elif NR_CPUS <= RCU_FANOUT_4 69#elif NR_CPUS <= RCU_FANOUT_4
66# define RCU_NUM_LVLS 4 70# define NUM_RCU_LVLS 4
67# define NUM_RCU_LVL_0 1 71# define NUM_RCU_LVL_0 1
68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 72# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 73# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,36 +80,13 @@
76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 80#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 81#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
78 82
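As a worked example of the compile-time geometry above, taking the variant that derives RCU_FANOUT_LEAF from CONFIG_RCU_FANOUT: with CONFIG_RCU_FANOUT=16, RCU_FANOUT_LEAF=16, so RCU_FANOUT_1=16 and RCU_FANOUT_2=256. For NR_CPUS=64 the second branch applies, giving NUM_RCU_LVLS=2, NUM_RCU_LVL_0=1, NUM_RCU_LVL_1=DIV_ROUND_UP(64,16)=4 and NUM_RCU_LVL_2=64; hence RCU_SUM=69 and NUM_RCU_NODES=69-64=5, that is, one root plus four leaf rcu_node structures.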
79extern int rcu_num_lvls;
80extern int rcu_num_nodes;
81
82/* 83/*
83 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
84 */ 85 */
85struct rcu_dynticks { 86struct rcu_dynticks {
86 long long dynticks_nesting; /* Track irq/process nesting level. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
87 /* Process level is worth LLONG_MAX/2. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 90};
110 91
111/* RCU's kthread states for tracing. */ 92/* RCU's kthread states for tracing. */
@@ -200,7 +181,12 @@ struct rcu_node {
200 /* Refused to boost: not sure why, though. */ 181 /* Refused to boost: not sure why, though. */
201 /* This can happen due to race conditions. */ 182 /* This can happen due to race conditions. */
202#endif /* #ifdef CONFIG_RCU_BOOST */ 183#endif /* #ifdef CONFIG_RCU_BOOST */
203 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
204} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
205 191
206/* 192/*
@@ -209,7 +195,7 @@ struct rcu_node {
209 */ 195 */
210#define rcu_for_each_node_breadth_first(rsp, rnp) \ 196#define rcu_for_each_node_breadth_first(rsp, rnp) \
211 for ((rnp) = &(rsp)->node[0]; \ 197 for ((rnp) = &(rsp)->node[0]; \
212 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 198 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
213 199
214/* 200/*
215 * Do a breadth-first scan of the non-leaf rcu_node structures for the 201 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -218,7 +204,7 @@ struct rcu_node {
218 */ 204 */
219#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 205#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
220 for ((rnp) = &(rsp)->node[0]; \ 206 for ((rnp) = &(rsp)->node[0]; \
221 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) 207 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
222 208
223/* 209/*
224 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 210 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -227,8 +213,8 @@ struct rcu_node {
227 * It is still a leaf node, even if it is also the root node. 213 * It is still a leaf node, even if it is also the root node.
228 */ 214 */
229#define rcu_for_each_leaf_node(rsp, rnp) \ 215#define rcu_for_each_leaf_node(rsp, rnp) \
230 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ 216 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
231 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 217 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
232 218
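The iteration macros above work because the whole hierarchy lives in a single node[] array laid out level by level, with level[i] pointing at the first node of depth i, so the leaves are simply the tail of the array. A small sketch with a two-level tree and invented demo_* names:

#include <stdio.h>

/* Illustrative only: one flat array holds the whole tree, breadth
 * first, and level[] indexes the start of each depth.  Two levels:
 * one root plus four leaves, hypothetical demo_* names.              */
struct demo_node { int grplo, grphi; };

#define DEMO_LVLS  2
#define DEMO_NODES 5

static struct demo_node node[DEMO_NODES] = {
	{  0, 63 },                                        /* root covers all CPUs */
	{  0, 15 }, { 16, 31 }, { 32, 47 }, { 48, 63 },    /* leaves */
};
static struct demo_node *level[DEMO_LVLS] = { &node[0], &node[1] };

#define demo_for_each_node(rnp) \
	for ((rnp) = &node[0]; (rnp) < &node[DEMO_NODES]; (rnp)++)

#define demo_for_each_leaf_node(rnp) \
	for ((rnp) = level[DEMO_LVLS - 1]; (rnp) < &node[DEMO_NODES]; (rnp)++)

int main(void)
{
	struct demo_node *rnp;

	demo_for_each_node(rnp)            /* breadth-first: root first */
		printf("node covers CPUs %d-%d\n", rnp->grplo, rnp->grphi);
	demo_for_each_leaf_node(rnp)       /* leaves only */
		printf("leaf covers CPUs %d-%d\n", rnp->grplo, rnp->grphi);
	return 0;
}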
233/* Index values for nxttail array in struct rcu_data. */ 219/* Index values for nxttail array in struct rcu_data. */
234#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 220#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -244,18 +230,14 @@ struct rcu_data {
244 /* in order to detect GP end. */ 230 /* in order to detect GP end. */
245 unsigned long gpnum; /* Highest gp number that this CPU */ 231 unsigned long gpnum; /* Highest gp number that this CPU */
246 /* is aware of having started. */ 232 /* is aware of having started. */
247 bool passed_quiesce; /* User-mode/idle loop etc. */ 233 unsigned long passed_quiesc_completed;
234 /* Value of completed at time of qs. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */
248 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
249 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
250 bool preemptible; /* Preemptible RCU? */ 238 bool preemptible; /* Preemptible RCU? */
251 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
252 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
253#ifdef CONFIG_RCU_CPU_STALL_INFO
254 unsigned long ticks_this_gp; /* The number of scheduling-clock */
255 /* ticks this CPU has handled */
256 /* during and after the last grace */
257 /* period it is aware of. */
258#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
259 241
260 /* 2) batch handling */ 242 /* 2) batch handling */
261 /* 243 /*
@@ -282,25 +264,28 @@ struct rcu_data {
282 */ 264 */
283 struct rcu_head *nxtlist; 265 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 266 struct rcu_head **nxttail[RCU_NEXT_SIZE];
285 long qlen_lazy; /* # of lazy queued callbacks */ 267 long qlen; /* # of queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 268 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 269 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 270 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
290 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
291 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 271 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
292 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 272 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
293 unsigned long n_force_qs_snap; 273 unsigned long n_force_qs_snap;
294 /* did other CPU force QS recently? */ 274 /* did other CPU force QS recently? */
295 long blimit; /* Upper limit on a processed batch */ 275 long blimit; /* Upper limit on a processed batch */
296 276
277#ifdef CONFIG_NO_HZ
297 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
298 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
299 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
300 282
301 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
302 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
303 unsigned long offline_fqs; /* Kicked due to being offline. */ 287 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */
304 289
305 /* 5) __rcu_pending() statistics. */ 290 /* 5) __rcu_pending() statistics. */
306 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 291 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -310,36 +295,22 @@ struct rcu_data {
310 unsigned long n_rp_cpu_needs_gp; 295 unsigned long n_rp_cpu_needs_gp;
311 unsigned long n_rp_gp_completed; 296 unsigned long n_rp_gp_completed;
312 unsigned long n_rp_gp_started; 297 unsigned long n_rp_gp_started;
298 unsigned long n_rp_need_fqs;
313 unsigned long n_rp_need_nothing; 299 unsigned long n_rp_need_nothing;
314 300
315 /* 6) _rcu_barrier() and OOM callbacks. */
316 struct rcu_head barrier_head;
317#ifdef CONFIG_RCU_FAST_NO_HZ
318 struct rcu_head oom_head;
319#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
320
321 /* 7) Callback offloading. */
322#ifdef CONFIG_RCU_NOCB_CPU
323 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
324 struct rcu_head **nocb_tail;
325 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
326 atomic_long_t nocb_q_count_lazy; /* (approximate). */
327 int nocb_p_count; /* # CBs being invoked by kthread */
328 int nocb_p_count_lazy; /* (approximate). */
329 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
330 struct task_struct *nocb_kthread;
331#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
332
333 int cpu; 301 int cpu;
334 struct rcu_state *rsp;
335}; 302};
336 303
337/* Values for fqs_state field in struct rcu_state. */ 304/* Values for signaled field in struct rcu_state. */
338#define RCU_GP_IDLE 0 /* No grace period in progress. */ 305#define RCU_GP_IDLE 0 /* No grace period in progress. */
339#define RCU_GP_INIT 1 /* Grace period being initialized. */ 306#define RCU_GP_INIT 1 /* Grace period being initialized. */
340#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 307#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
341#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 308#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
309#ifdef CONFIG_NO_HZ
342#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 310#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
311#else /* #ifdef CONFIG_NO_HZ */
312#define RCU_SIGNAL_INIT RCU_FORCE_QS
313#endif /* #else #ifdef CONFIG_NO_HZ */
343 314
344#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
345 316
@@ -348,6 +319,12 @@ struct rcu_data {
348#else 319#else
349#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
350#endif 321#endif
322
323#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
324 RCU_STALL_DELAY_DELTA)
325 /* for rsp->jiffies_stall */
326#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
327 /* for rsp->jiffies_stall */
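For a rough sense of scale (purely illustrative values: CONFIG_RCU_CPU_STALL_TIMEOUT=60, HZ=1000, RCU_STALL_DELAY_DELTA=0), RCU_SECONDS_TILL_STALL_CHECK evaluates to 60 * 1000 + 0 = 60000 jiffies, i.e. about one minute before the first stall warning, and RCU_SECONDS_TILL_STALL_RECHECK to 3 * 60000 + 30 = 180030 jiffies, roughly three minutes between repeated warnings for the same stall.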
351#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 328#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
352 /* to take at least one */ 329 /* to take at least one */
353 /* scheduling clock irq */ 330 /* scheduling clock irq */
@@ -376,65 +353,32 @@ do { \
376 */ 353 */
377struct rcu_state { 354struct rcu_state {
378 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 355 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
379 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 356 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
380 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 357 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
381 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 358 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
382 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 359 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
383 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
384 void (*func)(struct rcu_head *head));
385#ifdef CONFIG_RCU_NOCB_CPU
386 void (*call_remote)(struct rcu_head *head,
387 void (*func)(struct rcu_head *head));
388 /* call_rcu() flavor, but for */
389 /* placing on remote CPU. */
390#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
391 360
392 /* The following fields are guarded by the root rcu_node's lock. */ 361 /* The following fields are guarded by the root rcu_node's lock. */
393 362
394 u8 fqs_state ____cacheline_internodealigned_in_smp; 363 u8 signaled ____cacheline_internodealigned_in_smp;
395 /* Force QS state. */ 364 /* Force QS state. */
365 u8 fqs_active; /* force_quiescent_state() */
366 /* is running. */
367 u8 fqs_need_gp; /* A CPU was prevented from */
368 /* starting a new grace */
369 /* period because */
370 /* force_quiescent_state() */
371 /* was running. */
396 u8 boost; /* Subject to priority boost. */ 372 u8 boost; /* Subject to priority boost. */
397 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
398 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
399 struct task_struct *gp_kthread; /* Task for grace periods. */
400 wait_queue_head_t gp_wq; /* Where GP task waits. */
401 int gp_flags; /* Commands for GP task. */
402 375
403 /* End of fields guarded by root rcu_node's lock. */ 376 /* End of fields guarded by root rcu_node's lock. */
404 377
405 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; 378 raw_spinlock_t onofflock; /* exclude on/offline and */
406 /* Protect following fields. */ 379 /* starting new GP. */
407 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 380 raw_spinlock_t fqslock; /* Only one task forcing */
408 /* need a grace period. */ 381 /* quiescent states. */
409 struct rcu_head **orphan_nxttail; /* Tail of above. */
410 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
411 /* are ready to invoke. */
412 struct rcu_head **orphan_donetail; /* Tail of above. */
413 long qlen_lazy; /* Number of lazy callbacks. */
414 long qlen; /* Total number of callbacks. */
415 /* End of fields guarded by orphan_lock. */
416
417 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
418
419 struct mutex barrier_mutex; /* Guards barrier fields. */
420 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
421 struct completion barrier_completion; /* Wake at barrier end. */
422 unsigned long n_barrier_done; /* ++ at start and end of */
423 /* _rcu_barrier(). */
424 /* End of fields guarded by barrier_mutex. */
425
426 atomic_long_t expedited_start; /* Starting ticket. */
427 atomic_long_t expedited_done; /* Done ticket. */
428 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
429 atomic_long_t expedited_tryfail; /* # acquisition failures. */
430 atomic_long_t expedited_workdone1; /* # done by others #1. */
431 atomic_long_t expedited_workdone2; /* # done by others #2. */
432 atomic_long_t expedited_normal; /* # fallbacks to normal. */
433 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
434 atomic_long_t expedited_done_tries; /* # tries to update _done. */
435 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
436 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
437
438 unsigned long jiffies_force_qs; /* Time at which to invoke */ 382 unsigned long jiffies_force_qs; /* Time at which to invoke */
439 /* force_quiescent_state(). */ 383 /* force_quiescent_state(). */
440 unsigned long n_force_qs; /* Number of calls to */ 384 unsigned long n_force_qs; /* Number of calls to */
@@ -450,19 +394,8 @@ struct rcu_state {
450 unsigned long gp_max; /* Maximum GP duration in */ 394 unsigned long gp_max; /* Maximum GP duration in */
451 /* jiffies. */ 395 /* jiffies. */
452 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
453 struct list_head flavors; /* List of RCU flavors. */
454}; 397};
455 398
456/* Values for rcu_state structure's gp_flags field. */
457#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
458#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
459
460extern struct list_head rcu_struct_flavors;
461
462/* Sequence through rcu_state structures for each RCU flavor. */
463#define for_each_rcu_flavor(rsp) \
464 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
465
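As a usage note, the for_each_rcu_flavor() helper above is nothing more than a list_for_each_entry() walk over rcu_struct_flavors, so per-flavor code reads as a plain loop (a minimal sketch; do_something_with() is a hypothetical per-flavor operation):

	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp)
		do_something_with(rsp);	/* e.g. dump statistics for this flavor */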
466/* Return values for rcu_preempt_offline_tasks(). */ 399/* Return values for rcu_preempt_offline_tasks(). */
467 400
468#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 401#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -484,13 +417,6 @@ extern struct rcu_state rcu_preempt_state;
484DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
485#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
486 419
487#ifdef CONFIG_RCU_BOOST
488DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
489DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
490DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
491DECLARE_PER_CPU(char, rcu_cpu_has_work);
492#endif /* #ifdef CONFIG_RCU_BOOST */
493
494#ifndef RCU_TREE_NONCORE 420#ifndef RCU_TREE_NONCORE
495 421
496/* Forward declarations for rcutree_plugin.h */ 422/* Forward declarations for rcutree_plugin.h */
@@ -501,67 +427,44 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
501#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
502static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
503 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
504#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
505static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
506static int rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void);
507static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
508#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
509static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
510 struct rcu_node *rnp, 438 struct rcu_node *rnp,
511 struct rcu_data *rdp); 439 struct rcu_data *rdp);
440static void rcu_preempt_offline_cpu(int cpu);
512#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 441#endif /* #ifdef CONFIG_HOTPLUG_CPU */
513static void rcu_preempt_check_callbacks(int cpu); 442static void rcu_preempt_check_callbacks(int cpu);
443static void rcu_preempt_process_callbacks(void);
514void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 444void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
515#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 445#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
516static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 446static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
517 bool wake);
518#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 447#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
448static int rcu_preempt_pending(int cpu);
449static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void);
519static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
520static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
521static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
522static void invoke_rcu_callbacks_kthread(void); 456static void invoke_rcu_callbacks_kthread(void);
523static bool rcu_is_callbacks_kthread(void);
524#ifdef CONFIG_RCU_BOOST 457#ifdef CONFIG_RCU_BOOST
525static void rcu_preempt_do_callbacks(void); 458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
526static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
527 struct rcu_node *rnp); 462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
528#endif /* #ifdef CONFIG_RCU_BOOST */ 466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
529static void __cpuinit rcu_prepare_kthreads(int cpu); 468static void __cpuinit rcu_prepare_kthreads(int cpu);
530static void rcu_prepare_for_idle_init(int cpu);
531static void rcu_cleanup_after_idle(int cpu);
532static void rcu_prepare_for_idle(int cpu);
533static void rcu_idle_count_callbacks_posted(void);
534static void print_cpu_stall_info_begin(void);
535static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
536static void print_cpu_stall_info_end(void);
537static void zero_cpu_stall_ticks(struct rcu_data *rdp);
538static void increment_cpu_stall_ticks(void);
539static bool is_nocb_cpu(int cpu);
540static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
541 bool lazy);
542static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
543 struct rcu_data *rdp);
544static bool nocb_cpu_expendable(int cpu);
545static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
546static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
547static void init_nocb_callback_list(struct rcu_data *rdp);
548static void __init rcu_init_nocb(void);
549 469
550#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
551
552#ifdef CONFIG_RCU_TRACE
553#ifdef CONFIG_RCU_NOCB_CPU
554/* Sum up queue lengths for tracing. */
555static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
556{
557 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
558 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
559}
560#else /* #ifdef CONFIG_RCU_NOCB_CPU */
561static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
562{
563 *ql = 0;
564 *qll = 0;
565}
566#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
567#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f6e5ec2932b..8aafbb80b8b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,25 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h> 28#include <linux/stop_machine.h>
29#include <linux/oom.h>
30#include <linux/smpboot.h>
31
32#define RCU_KTHREAD_PRIO 1
33
34#ifdef CONFIG_RCU_BOOST
35#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
36#else
37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
38#endif
39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
43static bool rcu_nocb_poll; /* Offload kthread are to poll. */
44module_param(rcu_nocb_poll, bool, 0444);
45static char __initdata nocb_buf[NR_CPUS * 5];
46#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
47 29
48/* 30/*
49 * Check the RCU kernel configuration parameters and print informative 31 * Check the RCU kernel configuration parameters and print informative
@@ -73,39 +55,20 @@ static void __init rcu_bootup_announce_oddness(void)
73 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
74#endif 56#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
77#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
80#endif 59#endif
81#if NUM_RCU_LVL_4 != 0 60#if NUM_RCU_LVL_4 != 0
82 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); 61 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
83#endif 62#endif
84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS)
87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89 if (have_rcu_nocb_mask) {
90 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
91 cpumask_clear_cpu(0, rcu_nocb_mask);
92 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
93 }
94 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
95 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
96 if (rcu_nocb_poll)
97 pr_info("\tExperimental polled no-CBs CPUs.\n");
98 }
99#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
100} 63}
101 64
102#ifdef CONFIG_TREE_PREEMPT_RCU 65#ifdef CONFIG_TREE_PREEMPT_RCU
103 66
104struct rcu_state rcu_preempt_state = 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
105 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
106DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
107static struct rcu_state *rcu_state = &rcu_preempt_state; 69static struct rcu_state *rcu_state = &rcu_preempt_state;
108 70
71static void rcu_read_unlock_special(struct task_struct *t);
109static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
110 73
111/* 74/*
@@ -141,7 +104,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
141 */ 104 */
142void rcu_force_quiescent_state(void) 105void rcu_force_quiescent_state(void)
143{ 106{
144 force_quiescent_state(&rcu_preempt_state); 107 force_quiescent_state(&rcu_preempt_state, 0);
145} 108}
146EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
147 110
@@ -159,9 +122,9 @@ static void rcu_preempt_qs(int cpu)
159{ 122{
160 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
161 124
162 if (rdp->passed_quiesce == 0) 125 rdp->passed_quiesc_completed = rdp->gpnum - 1;
163 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 126 barrier();
164 rdp->passed_quiesce = 1; 127 rdp->passed_quiesc = 1;
165 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
166} 129}
167 130
@@ -227,11 +190,6 @@ static void rcu_preempt_note_context_switch(int cpu)
227 if (rnp->qsmask & rdp->grpmask) 190 if (rnp->qsmask & rdp->grpmask)
228 rnp->gp_tasks = &t->rcu_node_entry; 191 rnp->gp_tasks = &t->rcu_node_entry;
229 } 192 }
230 trace_rcu_preempt_task(rdp->rsp->name,
231 t->pid,
232 (rnp->qsmask & rdp->grpmask)
233 ? rnp->gpnum
234 : rnp->gpnum + 1);
235 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
236 } else if (t->rcu_read_lock_nesting < 0 && 194 } else if (t->rcu_read_lock_nesting < 0 &&
237 t->rcu_read_unlock_special) { 195 t->rcu_read_unlock_special) {
@@ -258,6 +216,18 @@ static void rcu_preempt_note_context_switch(int cpu)
258} 216}
259 217
260/* 218/*
219 * Tree-preemptible RCU implementation for rcu_read_lock().
220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
221 * if we block.
222 */
223void __rcu_read_lock(void)
224{
225 current->rcu_read_lock_nesting++;
226 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
227}
228EXPORT_SYMBOL_GPL(__rcu_read_lock);
229
230/*
261 * Check for preempted RCU readers blocking the current grace period 231 * Check for preempted RCU readers blocking the current grace period
262 * for the specified rcu_node structure. If the caller needs a reliable 232 * for the specified rcu_node structure. If the caller needs a reliable
263 * answer, it must hold the rcu_node's ->lock. 233 * answer, it must hold the rcu_node's ->lock.
@@ -323,16 +293,12 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
323 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
324 * read-side critical section. 294 * read-side critical section.
325 */ 295 */
326void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
327{ 297{
328 int empty; 298 int empty;
329 int empty_exp; 299 int empty_exp;
330 int empty_exp_now;
331 unsigned long flags; 300 unsigned long flags;
332 struct list_head *np; 301 struct list_head *np;
333#ifdef CONFIG_RCU_BOOST
334 struct rt_mutex *rbmp = NULL;
335#endif /* #ifdef CONFIG_RCU_BOOST */
336 struct rcu_node *rnp; 302 struct rcu_node *rnp;
337 int special; 303 int special;
338 304
@@ -378,9 +344,6 @@ void rcu_read_unlock_special(struct task_struct *t)
378 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
379 np = rcu_next_node_entry(t, rnp); 345 np = rcu_next_node_entry(t, rnp);
380 list_del_init(&t->rcu_node_entry); 346 list_del_init(&t->rcu_node_entry);
381 t->rcu_blocked_node = NULL;
382 trace_rcu_unlock_preempted_task("rcu_preempt",
383 rnp->gpnum, t->pid);
384 if (&t->rcu_node_entry == rnp->gp_tasks) 347 if (&t->rcu_node_entry == rnp->gp_tasks)
385 rnp->gp_tasks = np; 348 rnp->gp_tasks = np;
386 if (&t->rcu_node_entry == rnp->exp_tasks) 349 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -388,50 +351,75 @@ void rcu_read_unlock_special(struct task_struct *t)
388#ifdef CONFIG_RCU_BOOST 351#ifdef CONFIG_RCU_BOOST
389 if (&t->rcu_node_entry == rnp->boost_tasks) 352 if (&t->rcu_node_entry == rnp->boost_tasks)
390 rnp->boost_tasks = np; 353 rnp->boost_tasks = np;
391 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ 354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
392 if (t->rcu_boost_mutex) { 355 if (t->rcu_boosted) {
393 rbmp = t->rcu_boost_mutex; 356 special |= RCU_READ_UNLOCK_BOOSTED;
394 t->rcu_boost_mutex = NULL; 357 t->rcu_boosted = 0;
395 } 358 }
396#endif /* #ifdef CONFIG_RCU_BOOST */ 359#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
397 361
398 /* 362 /*
399 * If this was the last task on the current list, and if 363 * If this was the last task on the current list, and if
400 * we aren't waiting on any CPUs, report the quiescent state. 364 * we aren't waiting on any CPUs, report the quiescent state.
401 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
402 * so we must take a snapshot of the expedited state.
403 */ 366 */
404 empty_exp_now = !rcu_preempted_readers_exp(rnp); 367 if (empty)
405 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
406 trace_rcu_quiescent_state_report("preempt_rcu",
407 rnp->gpnum,
408 0, rnp->qsmask,
409 rnp->level,
410 rnp->grplo,
411 rnp->grphi,
412 !!rnp->gp_tasks);
413 rcu_report_unblock_qs_rnp(rnp, flags);
414 } else {
415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 368 raw_spin_unlock_irqrestore(&rnp->lock, flags);
416 } 369 else
370 rcu_report_unblock_qs_rnp(rnp, flags);
417 371
418#ifdef CONFIG_RCU_BOOST 372#ifdef CONFIG_RCU_BOOST
419 /* Unboost if we were boosted. */ 373 /* Unboost if we were boosted. */
420 if (rbmp) 374 if (special & RCU_READ_UNLOCK_BOOSTED) {
421 rt_mutex_unlock(rbmp); 375 rt_mutex_unlock(t->rcu_boost_mutex);
376 t->rcu_boost_mutex = NULL;
377 }
422#endif /* #ifdef CONFIG_RCU_BOOST */ 378#endif /* #ifdef CONFIG_RCU_BOOST */
423 379
424 /* 380 /*
425 * If this was the last task on the expedited lists, 381 * If this was the last task on the expedited lists,
426 * then we need to report up the rcu_node hierarchy. 382 * then we need to report up the rcu_node hierarchy.
427 */ 383 */
428 if (!empty_exp && empty_exp_now) 384 if (!empty_exp && !rcu_preempted_readers_exp(rnp))
429 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); 385 rcu_report_exp_rnp(&rcu_preempt_state, rnp);
430 } else { 386 } else {
431 local_irq_restore(flags); 387 local_irq_restore(flags);
432 } 388 }
433} 389}
434 390
391/*
392 * Tree-preemptible RCU implementation for rcu_read_unlock().
393 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
395 * invoke rcu_read_unlock_special() to clean up after a context switch
396 * in an RCU read-side critical section and other special cases.
397 */
398void __rcu_read_unlock(void)
399{
400 struct task_struct *t = current;
401
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting;
405 else {
406 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
413#ifdef CONFIG_PROVE_LOCKING
414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
419#endif /* #ifdef CONFIG_PROVE_LOCKING */
420}
421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
422
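For reference, these two functions are what rcu_read_lock() and rcu_read_unlock() map to under CONFIG_TREE_PREEMPT_RCU, so a typical reader looks roughly like this (a minimal sketch; struct foo, foo_head, and foo_lookup() are made-up names):

#include <linux/rcupdate.h>

struct foo {
	int key;
	int data;
	struct foo __rcu *next;
};

static struct foo __rcu *foo_head;

static int foo_lookup(int key)
{
	struct foo *p;
	int ret = -1;

	rcu_read_lock();		/* bumps ->rcu_read_lock_nesting */
	for (p = rcu_dereference(foo_head); p != NULL;
	     p = rcu_dereference(p->next))
		if (p->key == key) {
			ret = p->data;
			break;
		}
	rcu_read_unlock();		/* outermost unlock may take the special path above */
	return ret;
}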
435#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
436 424
437/* 425/*
@@ -443,11 +431,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
443 unsigned long flags; 431 unsigned long flags;
444 struct task_struct *t; 432 struct task_struct *t;
445 433
446 raw_spin_lock_irqsave(&rnp->lock, flags); 434 if (!rcu_preempt_blocked_readers_cgp(rnp))
447 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
448 raw_spin_unlock_irqrestore(&rnp->lock, flags);
449 return; 435 return;
450 } 436 raw_spin_lock_irqsave(&rnp->lock, flags);
451 t = list_entry(rnp->gp_tasks, 437 t = list_entry(rnp->gp_tasks,
452 struct task_struct, rcu_node_entry); 438 struct task_struct, rcu_node_entry);
453 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 439 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -476,51 +462,30 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
476 462
477#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 463#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
478 464
479#ifdef CONFIG_RCU_CPU_STALL_INFO
480
481static void rcu_print_task_stall_begin(struct rcu_node *rnp)
482{
483 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
484 rnp->level, rnp->grplo, rnp->grphi);
485}
486
487static void rcu_print_task_stall_end(void)
488{
489 printk(KERN_CONT "\n");
490}
491
492#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
493
494static void rcu_print_task_stall_begin(struct rcu_node *rnp)
495{
496}
497
498static void rcu_print_task_stall_end(void)
499{
500}
501
502#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
503
504/* 465/*
505 * Scan the current list of tasks blocked within RCU read-side critical 466 * Scan the current list of tasks blocked within RCU read-side critical
506 * sections, printing out the tid of each. 467 * sections, printing out the tid of each.
507 */ 468 */
508static int rcu_print_task_stall(struct rcu_node *rnp) 469static void rcu_print_task_stall(struct rcu_node *rnp)
509{ 470{
510 struct task_struct *t; 471 struct task_struct *t;
511 int ndetected = 0;
512 472
513 if (!rcu_preempt_blocked_readers_cgp(rnp)) 473 if (!rcu_preempt_blocked_readers_cgp(rnp))
514 return 0; 474 return;
515 rcu_print_task_stall_begin(rnp);
516 t = list_entry(rnp->gp_tasks, 475 t = list_entry(rnp->gp_tasks,
517 struct task_struct, rcu_node_entry); 476 struct task_struct, rcu_node_entry);
518 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
519 printk(KERN_CONT " P%d", t->pid); 478 printk(" P%d", t->pid);
520 ndetected++; 479}
521 } 480
522 rcu_print_task_stall_end(); 481/*
523 return ndetected; 482 * Suppress preemptible RCU's CPU stall warnings by pushing the
483 * time of the next stall-warning message comfortably far into the
484 * future.
485 */
486static void rcu_preempt_stall_reset(void)
487{
488 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
524} 489}
525 490
526/* 491/*
@@ -584,7 +549,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
584 * absolutely necessary, but this is a good performance/complexity 549 * absolutely necessary, but this is a good performance/complexity
585 * tradeoff. 550 * tradeoff.
586 */ 551 */
587 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) 552 if (rcu_preempt_blocked_readers_cgp(rnp))
588 retval |= RCU_OFL_TASKS_NORM_GP; 553 retval |= RCU_OFL_TASKS_NORM_GP;
589 if (rcu_preempted_readers_exp(rnp)) 554 if (rcu_preempted_readers_exp(rnp))
590 retval |= RCU_OFL_TASKS_EXP_GP; 555 retval |= RCU_OFL_TASKS_EXP_GP;
@@ -607,26 +572,28 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
607 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 572 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
608 } 573 }
609 574
610 rnp->gp_tasks = NULL;
611 rnp->exp_tasks = NULL;
612#ifdef CONFIG_RCU_BOOST 575#ifdef CONFIG_RCU_BOOST
613 rnp->boost_tasks = NULL; 576 /* In case root is being boosted and leaf is not. */
614 /*
615 * In case root is being boosted and leaf was not. Make sure
616 * that we boost the tasks blocking the current grace period
617 * in this case.
618 */
619 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 577 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
620 if (rnp_root->boost_tasks != NULL && 578 if (rnp_root->boost_tasks != NULL &&
621 rnp_root->boost_tasks != rnp_root->gp_tasks && 579 rnp_root->boost_tasks != rnp_root->gp_tasks)
622 rnp_root->boost_tasks != rnp_root->exp_tasks)
623 rnp_root->boost_tasks = rnp_root->gp_tasks; 580 rnp_root->boost_tasks = rnp_root->gp_tasks;
624 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 581 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
625#endif /* #ifdef CONFIG_RCU_BOOST */ 582#endif /* #ifdef CONFIG_RCU_BOOST */
626 583
584 rnp->gp_tasks = NULL;
585 rnp->exp_tasks = NULL;
627 return retval; 586 return retval;
628} 587}
629 588
589/*
590 * Do CPU-offline processing for preemptible RCU.
591 */
592static void rcu_preempt_offline_cpu(int cpu)
593{
594 __rcu_offline_cpu(cpu, &rcu_preempt_state);
595}
596
630#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 597#endif /* #ifdef CONFIG_HOTPLUG_CPU */
631 598
632/* 599/*
@@ -649,6 +616,15 @@ static void rcu_preempt_check_callbacks(int cpu)
649 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
650} 617}
651 618
619/*
620 * Process callbacks for preemptible RCU.
621 */
622static void rcu_preempt_process_callbacks(void)
623{
624 __rcu_process_callbacks(&rcu_preempt_state,
625 &__get_cpu_var(rcu_preempt_data));
626}
627
652#ifdef CONFIG_RCU_BOOST 628#ifdef CONFIG_RCU_BOOST
653 629
654static void rcu_preempt_do_callbacks(void) 630static void rcu_preempt_do_callbacks(void)
@@ -663,24 +639,10 @@ static void rcu_preempt_do_callbacks(void)
663 */ 639 */
664void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
665{ 641{
666 __call_rcu(head, func, &rcu_preempt_state, -1, 0); 642 __call_rcu(head, func, &rcu_preempt_state);
667} 643}
668EXPORT_SYMBOL_GPL(call_rcu); 644EXPORT_SYMBOL_GPL(call_rcu);
669 645
670/*
671 * Queue an RCU callback for lazy invocation after a grace period.
672 * This will likely be later named something like "call_rcu_lazy()",
673 * but this change will require some way of tagging the lazy RCU
674 * callbacks in the list of pending callbacks. Until then, this
675 * function may only be called from __kfree_rcu().
676 */
677void kfree_call_rcu(struct rcu_head *head,
678 void (*func)(struct rcu_head *rcu))
679{
680 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
681}
682EXPORT_SYMBOL_GPL(kfree_call_rcu);
683
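kfree_call_rcu() above is reached through the kfree_rcu() convenience macro (via __kfree_rcu()), and a typical caller pairs it with rcu_assign_pointer() like this (a minimal sketch; struct foo, foo_ptr, foo_lock, and foo_replace() are hypothetical):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int data;
	struct rcu_head rcu;		/* required by kfree_rcu() */
};

static struct foo __rcu *foo_ptr;
static DEFINE_SPINLOCK(foo_lock);

static void foo_replace(struct foo *newp)
{
	struct foo *oldp;

	spin_lock(&foo_lock);
	oldp = rcu_dereference_protected(foo_ptr, lockdep_is_held(&foo_lock));
	rcu_assign_pointer(foo_ptr, newp);
	spin_unlock(&foo_lock);
	if (oldp)
		kfree_rcu(oldp, rcu);	/* lazily freed after a grace period */
}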
684/** 646/**
685 * synchronize_rcu - wait until a grace period has elapsed. 647 * synchronize_rcu - wait until a grace period has elapsed.
686 * 648 *
@@ -691,27 +653,26 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
691 * concurrently with new RCU read-side critical sections that began while 653 * concurrently with new RCU read-side critical sections that began while
692 * synchronize_rcu() was waiting. RCU read-side critical sections are 654 * synchronize_rcu() was waiting. RCU read-side critical sections are
693 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 655 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
694 *
695 * See the description of synchronize_sched() for more detailed information
696 * on memory ordering guarantees.
697 */ 656 */
698void synchronize_rcu(void) 657void synchronize_rcu(void)
699{ 658{
700 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 659 struct rcu_synchronize rcu;
701 !lock_is_held(&rcu_lock_map) && 660
702 !lock_is_held(&rcu_sched_lock_map),
703 "Illegal synchronize_rcu() in RCU read-side critical section");
704 if (!rcu_scheduler_active) 661 if (!rcu_scheduler_active)
705 return; 662 return;
706 if (rcu_expedited) 663
707 synchronize_rcu_expedited(); 664 init_rcu_head_on_stack(&rcu.head);
708 else 665 init_completion(&rcu.completion);
709 wait_rcu_gp(call_rcu); 666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
710} 671}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 672EXPORT_SYMBOL_GPL(synchronize_rcu);
712 673
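The classic updater-side use of synchronize_rcu() is unlink, wait, then free (a minimal sketch; the item list, its lock, and item_del() are hypothetical):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	int key;
	struct list_head list;
};

static LIST_HEAD(item_list);
static DEFINE_SPINLOCK(item_lock);

static void item_del(struct item *p)
{
	spin_lock(&item_lock);
	list_del_rcu(&p->list);		/* unlink; concurrent readers may still see p */
	spin_unlock(&item_lock);
	synchronize_rcu();		/* wait for all pre-existing readers to finish */
	kfree(p);			/* no reader can now hold a reference to p */
}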
713static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 674static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
714static unsigned long sync_rcu_preempt_exp_count; 675static long sync_rcu_preempt_exp_count;
715static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 676static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
716 677
717/* 678/*
@@ -748,13 +709,9 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
748 * recursively up the tree. (Calm down, calm down, we do the recursion 709 * recursively up the tree. (Calm down, calm down, we do the recursion
749 * iteratively!) 710 * iteratively!)
750 * 711 *
751 * Most callers will set the "wake" flag, but the task initiating the
752 * expedited grace period need not wake itself.
753 *
754 * Caller must hold sync_rcu_preempt_exp_mutex. 712 * Caller must hold sync_rcu_preempt_exp_mutex.
755 */ 713 */
756static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 714static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
757 bool wake)
758{ 715{
759 unsigned long flags; 716 unsigned long flags;
760 unsigned long mask; 717 unsigned long mask;
@@ -767,8 +724,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
767 } 724 }
768 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
769 raw_spin_unlock_irqrestore(&rnp->lock, flags); 726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
770 if (wake) 727 wake_up(&sync_rcu_preempt_exp_wq);
771 wake_up(&sync_rcu_preempt_exp_wq);
772 break; 728 break;
773 } 729 }
774 mask = rnp->grpmask; 730 mask = rnp->grpmask;
@@ -784,8 +740,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
784 * grace period for the specified rcu_node structure. If there are no such 740 * grace period for the specified rcu_node structure. If there are no such
785 * tasks, report it up the rcu_node hierarchy. 741 * tasks, report it up the rcu_node hierarchy.
786 * 742 *
787 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude 743 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
788 * CPU hotplug operations.
789 */ 744 */
790static void 745static void
791sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -794,40 +749,28 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
794 int must_wait = 0; 749 int must_wait = 0;
795 750
796 raw_spin_lock_irqsave(&rnp->lock, flags); 751 raw_spin_lock_irqsave(&rnp->lock, flags);
797 if (list_empty(&rnp->blkd_tasks)) { 752 if (list_empty(&rnp->blkd_tasks))
798 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
799 } else { 754 else {
800 rnp->exp_tasks = rnp->blkd_tasks.next; 755 rnp->exp_tasks = rnp->blkd_tasks.next;
801 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 756 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
802 must_wait = 1; 757 must_wait = 1;
803 } 758 }
804 if (!must_wait) 759 if (!must_wait)
805 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 760 rcu_report_exp_rnp(rsp, rnp);
806} 761}
807 762
808/** 763/*
809 * synchronize_rcu_expedited - Brute-force RCU grace period 764 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
810 * 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
811 * Wait for an RCU-preempt grace period, but expedite it. The basic 766 * the ->blkd_tasks lists and wait for this list to drain.
812 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
813 * the ->blkd_tasks lists and wait for this list to drain. This consumes
814 * significant time on all CPUs and is unfriendly to real-time workloads,
815 * so is thus not recommended for any sort of common-case code.
816 * In fact, if you are using synchronize_rcu_expedited() in a loop,
817 * please restructure your code to batch your updates, and then Use a
818 * single synchronize_rcu() instead.
819 *
820 * Note that it is illegal to call this function while holding any lock
821 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
822 * to call this function from a CPU-hotplug notifier. Failing to observe
823 * these restrictions will result in deadlock.
824 */ 767 */
825void synchronize_rcu_expedited(void) 768void synchronize_rcu_expedited(void)
826{ 769{
827 unsigned long flags; 770 unsigned long flags;
828 struct rcu_node *rnp; 771 struct rcu_node *rnp;
829 struct rcu_state *rsp = &rcu_preempt_state; 772 struct rcu_state *rsp = &rcu_preempt_state;
830 unsigned long snap; 773 long snap;
831 int trycount = 0; 774 int trycount = 0;
832 775
833 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 776 smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -835,47 +778,33 @@ void synchronize_rcu_expedited(void)
835 smp_mb(); /* Above access cannot bleed into critical section. */ 778 smp_mb(); /* Above access cannot bleed into critical section. */
836 779
837 /* 780 /*
838 * Block CPU-hotplug operations. This means that any CPU-hotplug
839 * operation that finds an rcu_node structure with tasks in the
840 * process of being boosted will know that all tasks blocking
841 * this expedited grace period will already be in the process of
842 * being boosted. This simplifies the process of moving tasks
843 * from leaf to root rcu_node structures.
844 */
845 get_online_cpus();
846
847 /*
848 * Acquire lock, falling back to synchronize_rcu() if too many 781 * Acquire lock, falling back to synchronize_rcu() if too many
849 * lock-acquisition failures. Of course, if someone does the 782 * lock-acquisition failures. Of course, if someone does the
850 * expedited grace period for us, just leave. 783 * expedited grace period for us, just leave.
851 */ 784 */
852 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 785 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
853 if (ULONG_CMP_LT(snap, 786 if (trycount++ < 10)
854 ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
855 put_online_cpus();
856 goto mb_ret; /* Others did our work for us. */
857 }
858 if (trycount++ < 10) {
859 udelay(trycount * num_online_cpus()); 787 udelay(trycount * num_online_cpus());
860 } else { 788 else {
861 put_online_cpus(); 789 synchronize_rcu();
862 wait_rcu_gp(call_rcu);
863 return; 790 return;
864 } 791 }
792 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
793 goto mb_ret; /* Others did our work for us. */
865 } 794 }
866 if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { 795 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
867 put_online_cpus();
868 goto unlock_mb_ret; /* Others did our work for us. */ 796 goto unlock_mb_ret; /* Others did our work for us. */
869 }
870 797
871 /* force all RCU readers onto ->blkd_tasks lists. */ 798 /* force all RCU readers onto ->blkd_tasks lists. */
872 synchronize_sched_expedited(); 799 synchronize_sched_expedited();
873 800
801 raw_spin_lock_irqsave(&rsp->onofflock, flags);
802
874 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 803 /* Initialize ->expmask for all non-leaf rcu_node structures. */
875 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 804 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
876 raw_spin_lock_irqsave(&rnp->lock, flags); 805 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
877 rnp->expmask = rnp->qsmaskinit; 806 rnp->expmask = rnp->qsmaskinit;
878 raw_spin_unlock_irqrestore(&rnp->lock, flags); 807 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
879 } 808 }
880 809
881 /* Snapshot current state of ->blkd_tasks lists. */ 810 /* Snapshot current state of ->blkd_tasks lists. */
@@ -884,7 +813,7 @@ void synchronize_rcu_expedited(void)
884 if (NUM_RCU_NODES > 1) 813 if (NUM_RCU_NODES > 1)
885 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 814 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
886 815
887 put_online_cpus(); 816 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
888 817
889 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 818 /* Wait for snapshotted ->blkd_tasks lists to drain. */
890 rnp = rcu_get_root(rsp); 819 rnp = rcu_get_root(rsp);
@@ -901,21 +830,50 @@ mb_ret:
901} 830}
902EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
903 832
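The mutex_trylock() loop above, with a counter snapshot to detect that somebody else already did the work, is a reusable pattern; stripped of the RCU details it looks roughly like this (a minimal sketch with made-up names; slow_path() and do_work() are hypothetical callbacks):

#include <linux/compiler.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>	/* for ULONG_CMP_LT() */

static DEFINE_MUTEX(expedite_mutex);
static unsigned long expedite_count;	/* incremented after each completed pass */

static void expedited_work(void (*slow_path)(void), void (*do_work)(void))
{
	/* Only a pass that starts after this snapshot is guaranteed to cover us. */
	unsigned long snap = ACCESS_ONCE(expedite_count) + 1;
	int trycount = 0;

	while (!mutex_trylock(&expedite_mutex)) {
		if (ULONG_CMP_LT(snap, ACCESS_ONCE(expedite_count)))
			return;			/* someone else did our work */
		if (trycount++ < 10)
			udelay(trycount);	/* brief back-off, then retry */
		else {
			slow_path();		/* give up on expediting */
			return;
		}
	}
	if (ULONG_CMP_LT(snap, ACCESS_ONCE(expedite_count))) {
		mutex_unlock(&expedite_mutex);
		return;				/* beaten to it while acquiring */
	}
	do_work();
	expedite_count++;
	mutex_unlock(&expedite_mutex);
}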
833/*
834 * Check to see if there is any immediate preemptible-RCU-related work
835 * to be done.
836 */
837static int rcu_preempt_pending(int cpu)
838{
839 return __rcu_pending(&rcu_preempt_state,
840 &per_cpu(rcu_preempt_data, cpu));
841}
842
843/*
844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
845 */
846static int rcu_preempt_needs_cpu(int cpu)
847{
848 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
849}
850
904/** 851/**
905 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 852 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
906 *
907 * Note that this primitive does not necessarily wait for an RCU grace period
908 * to complete. For example, if there are no RCU callbacks queued anywhere
909 * in the system, then rcu_barrier() is within its rights to return
910 * immediately, without waiting for anything, much less an RCU grace period.
911 */ 853 */
912void rcu_barrier(void) 854void rcu_barrier(void)
913{ 855{
914 _rcu_barrier(&rcu_preempt_state); 856 _rcu_barrier(&rcu_preempt_state, call_rcu);
915} 857}
916EXPORT_SYMBOL_GPL(rcu_barrier); 858EXPORT_SYMBOL_GPL(rcu_barrier);
917 859
918/* 860/*
861 * Initialize preemptible RCU's per-CPU data.
862 */
863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
864{
865 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
866}
867
868/*
869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
870 */
871static void rcu_preempt_send_cbs_to_online(void)
872{
873 rcu_send_cbs_to_online(&rcu_preempt_state);
874}
875
876/*
919 * Initialize preemptible RCU's state structures. 877 * Initialize preemptible RCU's state structures.
920 */ 878 */
921static void __init __rcu_init_preempt(void) 879static void __init __rcu_init_preempt(void)
@@ -923,6 +881,22 @@ static void __init __rcu_init_preempt(void)
923 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 881 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
924} 882}
925 883
884/*
885 * Check for a task exiting while in a preemptible-RCU read-side
886 * critical section, clean up if so. No need to issue warnings,
887 * as debug_check_no_locks_held() already does this if lockdep
888 * is enabled.
889 */
890void exit_rcu(void)
891{
892 struct task_struct *t = current;
893
894 if (t->rcu_read_lock_nesting == 0)
895 return;
896 t->rcu_read_lock_nesting = 1;
897 __rcu_read_unlock();
898}
899
926#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
927 901
928static struct rcu_state *rcu_state = &rcu_sched_state; 902static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -994,9 +968,16 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
994 * Because preemptible RCU does not exist, we never have to check for 968 * Because preemptible RCU does not exist, we never have to check for
995 * tasks blocked within RCU read-side critical sections. 969 * tasks blocked within RCU read-side critical sections.
996 */ 970 */
997static int rcu_print_task_stall(struct rcu_node *rnp) 971static void rcu_print_task_stall(struct rcu_node *rnp)
972{
973}
974
975/*
976 * Because preemptible RCU does not exist, there is no need to suppress
977 * its CPU stall warnings.
978 */
979static void rcu_preempt_stall_reset(void)
998{ 980{
999 return 0;
1000} 981}
1001 982
1002/* 983/*
@@ -1024,6 +1005,14 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1024 return 0; 1005 return 0;
1025} 1006}
1026 1007
1008/*
1009 * Because preemptible RCU does not exist, it never needs CPU-offline
1010 * processing.
1011 */
1012static void rcu_preempt_offline_cpu(int cpu)
1013{
1014}
1015
1027#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1028 1017
1029/* 1018/*
@@ -1035,20 +1024,12 @@ static void rcu_preempt_check_callbacks(int cpu)
1035} 1024}
1036 1025
1037/* 1026/*
1038 * Queue an RCU callback for lazy invocation after a grace period. 1027 * Because preemptible RCU does not exist, it never has any callbacks
1039 * This will likely be later named something like "call_rcu_lazy()", 1028 * to process.
1040 * but this change will require some way of tagging the lazy RCU
1041 * callbacks in the list of pending callbacks. Until then, this
1042 * function may only be called from __kfree_rcu().
1043 *
1044 * Because there is no preemptible RCU, we use RCU-sched instead.
1045 */ 1029 */
1046void kfree_call_rcu(struct rcu_head *head, 1030static void rcu_preempt_process_callbacks(void)
1047 void (*func)(struct rcu_head *rcu))
1048{ 1031{
1049 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1050} 1032}
1051EXPORT_SYMBOL_GPL(kfree_call_rcu);
1052 1033
1053/* 1034/*
1054 * Wait for an rcu-preempt grace period, but make it happen quickly. 1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
@@ -1067,14 +1048,30 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1067 * report on tasks preempted in RCU read-side critical sections during 1048 * report on tasks preempted in RCU read-side critical sections during
1068 * expedited RCU grace periods. 1049 * expedited RCU grace periods.
1069 */ 1050 */
1070static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 1051static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
1071 bool wake)
1072{ 1052{
1053 return;
1073} 1054}
1074 1055
1075#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1076 1057
1077/* 1058/*
1059 * Because preemptible RCU does not exist, it never has any work to do.
1060 */
1061static int rcu_preempt_pending(int cpu)
1062{
1063 return 0;
1064}
1065
1066/*
1067 * Because preemptible RCU does not exist, it never needs any CPU.
1068 */
1069static int rcu_preempt_needs_cpu(int cpu)
1070{
1071 return 0;
1072}
1073
1074/*
1078 * Because preemptible RCU does not exist, rcu_barrier() is just 1075 * Because preemptible RCU does not exist, rcu_barrier() is just
1079 * another name for rcu_barrier_sched(). 1076 * another name for rcu_barrier_sched().
1080 */ 1077 */
@@ -1085,6 +1082,21 @@ void rcu_barrier(void)
1085EXPORT_SYMBOL_GPL(rcu_barrier); 1082EXPORT_SYMBOL_GPL(rcu_barrier);
1086 1083
1087/* 1084/*
1085 * Because preemptible RCU does not exist, there is no per-CPU
1086 * data to initialize.
1087 */
1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1089{
1090}
1091
1092/*
1093 * Because there is no preemptible RCU, there are no callbacks to move.
1094 */
1095static void rcu_preempt_send_cbs_to_online(void)
1096{
1097}
1098
1099/*
1088 * Because preemptible RCU does not exist, it need not be initialized. 1100 * Because preemptible RCU does not exist, it need not be initialized.
1089 */ 1101 */
1090static void __init __rcu_init_preempt(void) 1102static void __init __rcu_init_preempt(void)
@@ -1124,16 +1136,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1124 1136
1125#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1126 1138
1127static void rcu_wake_cond(struct task_struct *t, int status)
1128{
1129 /*
1130 * If the thread is yielding, only wake it when this
1131 * is invoked from idle
1132 */
1133 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
1134 wake_up_process(t);
1135}
1136
1137/* 1139/*
1138 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1139 * or ->boost_tasks, advancing the pointer to the next task in the 1141 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1197,12 +1199,23 @@ static int rcu_boost(struct rcu_node *rnp)
1197 t = container_of(tb, struct task_struct, rcu_node_entry); 1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1198 rt_mutex_init_proxy_locked(&mtx, t); 1200 rt_mutex_init_proxy_locked(&mtx, t);
1199 t->rcu_boost_mutex = &mtx; 1201 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1200 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1201 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1202 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1203 1206
1204 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1205 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1208}
1209
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost. We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218 invoke_rcu_node_kthread((struct rcu_node *)arg);
1206} 1219}
1207 1220
1208/* 1221/*
@@ -1215,12 +1228,9 @@ static int rcu_boost_kthread(void *arg)
1215 int spincnt = 0; 1228 int spincnt = 0;
1216 int more2boost; 1229 int more2boost;
1217 1230
1218 trace_rcu_utilization("Start boost kthread@init");
1219 for (;;) { 1231 for (;;) {
1220 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1221 trace_rcu_utilization("End boost kthread@rcu_wait");
1222 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1223 trace_rcu_utilization("Start boost kthread@rcu_wait");
1224 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1225 more2boost = rcu_boost(rnp); 1235 more2boost = rcu_boost(rnp);
1226 if (more2boost) 1236 if (more2boost)
@@ -1228,15 +1238,11 @@ static int rcu_boost_kthread(void *arg)
1228 else 1238 else
1229 spincnt = 0; 1239 spincnt = 0;
1230 if (spincnt > 10) { 1240 if (spincnt > 10) {
1231 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1232 trace_rcu_utilization("End boost kthread@rcu_yield");
1233 schedule_timeout_interruptible(2);
1234 trace_rcu_utilization("Start boost kthread@rcu_yield");
1235 spincnt = 0; 1242 spincnt = 0;
1236 } 1243 }
1237 } 1244 }
1238 /* NOTREACHED */ 1245 /* NOTREACHED */
1239 trace_rcu_utilization("End boost kthread@notreached");
1240 return 0; 1246 return 0;
1241} 1247}
1242 1248
@@ -1246,9 +1252,9 @@ static int rcu_boost_kthread(void *arg)
1246 * kthread to start boosting them. If there is an expedited grace 1252 * kthread to start boosting them. If there is an expedited grace
1247 * period in progress, it is always time to boost. 1253 * period in progress, it is always time to boost.
1248 * 1254 *
1249 * The caller must hold rnp->lock, which this function releases. 1255 * The caller must hold rnp->lock, which this function releases,
1250 * The ->boost_kthread_task is immortal, so we don't need to worry 1256 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1251 * about it going away. 1257 * so we don't need to worry about it going away.
1252 */ 1258 */
1253static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1254{ 1260{
@@ -1268,8 +1274,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1268 rnp->boost_tasks = rnp->gp_tasks; 1274 rnp->boost_tasks = rnp->gp_tasks;
1269 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1270 t = rnp->boost_kthread_task; 1276 t = rnp->boost_kthread_task;
1271 if (t) 1277 if (t != NULL)
1272 rcu_wake_cond(t, rnp->boost_kthread_status); 1278 wake_up_process(t);
1273 } else { 1279 } else {
1274 rcu_initiate_boost_trace(rnp); 1280 rcu_initiate_boost_trace(rnp);
1275 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1281 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1285,21 +1291,27 @@ static void invoke_rcu_callbacks_kthread(void)
1285 1291
1286 local_irq_save(flags); 1292 local_irq_save(flags);
1287 __this_cpu_write(rcu_cpu_has_work, 1); 1293 __this_cpu_write(rcu_cpu_has_work, 1);
1288 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1289 current != __this_cpu_read(rcu_cpu_kthread_task)) { 1295 local_irq_restore(flags);
1290 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), 1296 return;
1291 __this_cpu_read(rcu_cpu_kthread_status));
1292 } 1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1293 local_irq_restore(flags); 1299 local_irq_restore(flags);
1294} 1300}
1295 1301
1296/* 1302/*
1297 * Is the current CPU running the RCU-callbacks kthread? 1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1298 * Caller must have preemption disabled. 1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1299 */ 1306 */
1300static bool rcu_is_callbacks_kthread(void) 1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308 cpumask_var_t cm)
1301{ 1309{
1302 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1310 struct task_struct *t;
1311
1312 t = rnp->boost_kthread_task;
1313 if (t != NULL)
1314 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1303} 1315}
1304 1316
1305#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1318,35 +1330,50 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1318 * Returns zero if all is well, a negated errno otherwise. 1330 * Returns zero if all is well, a negated errno otherwise.
1319 */ 1331 */
1320static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1321 struct rcu_node *rnp) 1333 struct rcu_node *rnp,
1334 int rnp_index)
1322{ 1335{
1323 int rnp_index = rnp - &rsp->node[0];
1324 unsigned long flags; 1336 unsigned long flags;
1325 struct sched_param sp; 1337 struct sched_param sp;
1326 struct task_struct *t; 1338 struct task_struct *t;
1327 1339
1328 if (&rcu_preempt_state != rsp) 1340 if (&rcu_preempt_state != rsp)
1329 return 0; 1341 return 0;
1330
1331 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
1332 return 0;
1333
1334 rsp->boost = 1; 1342 rsp->boost = 1;
1335 if (rnp->boost_kthread_task != NULL) 1343 if (rnp->boost_kthread_task != NULL)
1336 return 0; 1344 return 0;
1337 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1345 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1338 "rcub/%d", rnp_index); 1346 "rcub%d", rnp_index);
1339 if (IS_ERR(t)) 1347 if (IS_ERR(t))
1340 return PTR_ERR(t); 1348 return PTR_ERR(t);
1341 raw_spin_lock_irqsave(&rnp->lock, flags); 1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1342 rnp->boost_kthread_task = t; 1350 rnp->boost_kthread_task = t;
1343 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1351 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1344 sp.sched_priority = RCU_BOOST_PRIO; 1352 sp.sched_priority = RCU_KTHREAD_PRIO;
1345 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1346 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1347 return 0; 1355 return 0;
1348} 1356}
1349 1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
1361 * Stop the RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365 struct task_struct *t;
1366
1367 /* Stop the CPU's kthread. */
1368 t = per_cpu(rcu_cpu_kthread_task, cpu);
1369 if (t != NULL) {
1370 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371 kthread_stop(t);
1372 }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1350static void rcu_kthread_do_work(void) 1377static void rcu_kthread_do_work(void)
1351{ 1378{
1352 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1379 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1354,58 +1381,239 @@ static void rcu_kthread_do_work(void)
1354 rcu_preempt_do_callbacks(); 1381 rcu_preempt_do_callbacks();
1355} 1382}
1356 1383
1357static void rcu_cpu_kthread_setup(unsigned int cpu) 1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391 struct task_struct *t;
1392
1393 t = rnp->node_kthread_task;
1394 if (t != NULL)
1395 wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument. The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1358{ 1404{
1405 int policy;
1359 struct sched_param sp; 1406 struct sched_param sp;
1407 struct task_struct *t;
1360 1408
1361 sp.sched_priority = RCU_KTHREAD_PRIO; 1409 t = per_cpu(rcu_cpu_kthread_task, cpu);
1362 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1410 if (t == NULL)
1411 return;
1412 if (to_rt) {
1413 policy = SCHED_FIFO;
1414 sp.sched_priority = RCU_KTHREAD_PRIO;
1415 } else {
1416 policy = SCHED_NORMAL;
1417 sp.sched_priority = 0;
1418 }
1419 sched_setscheduler_nocheck(t, policy, &sp);
1363} 1420}
1364 1421
1365static void rcu_cpu_kthread_park(unsigned int cpu) 1422/*
1423 * Timer handler to initiate the waking up of per-CPU kthreads that
1424 * have yielded the CPU due to excess numbers of RCU callbacks.
1425 * We wake up the per-rcu_node kthread, which in turn will wake up
1426 * the booster kthread.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1366{ 1429{
1367 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; 1430 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431 struct rcu_node *rnp = rdp->mynode;
1432
1433 atomic_or(rdp->grpmask, &rnp->wakemask);
1434 invoke_rcu_node_kthread(rnp);
1368} 1435}
1369 1436
1370static int rcu_cpu_kthread_should_run(unsigned int cpu) 1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted. Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1371{ 1444{
1372 return __get_cpu_var(rcu_cpu_has_work); 1445 struct sched_param sp;
1446 struct timer_list yield_timer;
1447
1448 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2);
1450 sp.sched_priority = 0;
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19);
1453 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer);
1457}
1458
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline. We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh. This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473 while (cpu_is_offline(cpu) ||
1474 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475 smp_processor_id() != cpu) {
1476 if (kthread_should_stop())
1477 return 1;
1478 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480 local_bh_enable();
1481 schedule_timeout_uninterruptible(1);
1482 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484 local_bh_disable();
1485 }
1486 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487 return 0;
1373} 1488}
1374 1489
1375/* 1490/*
1376 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1377 * RCU softirq used in flavors and configurations of RCU that do not 1492 * earlier RCU softirq.
1378 * support RCU priority boosting.
1379 */ 1493 */
1380static void rcu_cpu_kthread(unsigned int cpu) 1494static int rcu_cpu_kthread(void *arg)
1381{ 1495{
1382 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1496 int cpu = (int)(long)arg;
1383 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1497 unsigned long flags;
1384 int spincnt; 1498 int spincnt = 0;
1499 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1385 1502
1386 for (spincnt = 0; spincnt < 10; spincnt++) { 1503 for (;;) {
1387 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1504 *statusp = RCU_KTHREAD_WAITING;
1505 rcu_wait(*workp != 0 || kthread_should_stop());
1388 local_bh_disable(); 1506 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable();
1509 break;
1510 }
1389 *statusp = RCU_KTHREAD_RUNNING; 1511 *statusp = RCU_KTHREAD_RUNNING;
1390 this_cpu_inc(rcu_cpu_kthread_loops); 1512 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1391 local_irq_disable(); 1513 local_irq_save(flags);
1392 work = *workp; 1514 work = *workp;
1393 *workp = 0; 1515 *workp = 0;
1394 local_irq_enable(); 1516 local_irq_restore(flags);
1395 if (work) 1517 if (work)
1396 rcu_kthread_do_work(); 1518 rcu_kthread_do_work();
1397 local_bh_enable(); 1519 local_bh_enable();
1398 if (*workp == 0) { 1520 if (*workp != 0)
1399 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1521 spincnt++;
1400 *statusp = RCU_KTHREAD_WAITING; 1522 else
1401 return; 1523 spincnt = 0;
1524 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING;
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527 spincnt = 0;
1402 } 1528 }
1403 } 1529 }
1404 *statusp = RCU_KTHREAD_YIELDING; 1530 *statusp = RCU_KTHREAD_STOPPED;
1405 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1531 return 0;
1406 schedule_timeout_interruptible(2); 1532}
1407 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1533
1408 *statusp = RCU_KTHREAD_WAITING; 1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online. We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online. If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557 struct sched_param sp;
1558 struct task_struct *t;
1559
1560 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564 if (IS_ERR(t))
1565 return PTR_ERR(t);
1566 if (cpu_online(cpu))
1567 kthread_bind(t, cpu);
1568 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574 return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed. We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585 int cpu;
1586 unsigned long flags;
1587 unsigned long mask;
1588 struct rcu_node *rnp = (struct rcu_node *)arg;
1589 struct sched_param sp;
1590 struct task_struct *t;
1591
1592 for (;;) {
1593 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596 raw_spin_lock_irqsave(&rnp->lock, flags);
1597 mask = atomic_xchg(&rnp->wakemask, 0);
1598 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600 if ((mask & 0x1) == 0)
1601 continue;
1602 preempt_disable();
1603 t = per_cpu(rcu_cpu_kthread_task, cpu);
1604 if (!cpu_online(cpu) || t == NULL) {
1605 preempt_enable();
1606 continue;
1607 }
1608 per_cpu(rcu_cpu_has_work, cpu) = 1;
1609 sp.sched_priority = RCU_KTHREAD_PRIO;
1610 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611 preempt_enable();
1612 }
1613 }
1614 /* NOTREACHED */
1615 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616 return 0;
1409} 1617}
1410 1618
1411/* 1619/*
@@ -1417,17 +1625,17 @@ static void rcu_cpu_kthread(unsigned int cpu)
1417 * no outgoing CPU. If there are no CPUs left in the affinity set, 1625 * no outgoing CPU. If there are no CPUs left in the affinity set,
1418 * this function allows the kthread to execute on any CPU. 1626 * this function allows the kthread to execute on any CPU.
1419 */ 1627 */
1420static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1421{ 1629{
1422 struct task_struct *t = rnp->boost_kthread_task;
1423 unsigned long mask = rnp->qsmaskinit;
1424 cpumask_var_t cm; 1630 cpumask_var_t cm;
1425 int cpu; 1631 int cpu;
1632 unsigned long mask = rnp->qsmaskinit;
1426 1633
1427 if (!t) 1634 if (rnp->node_kthread_task == NULL)
1428 return; 1635 return;
1429 if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) 1636 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1430 return; 1637 return;
1638 cpumask_clear(cm);
1431 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1639 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1432 if ((mask & 0x1) && cpu != outgoingcpu) 1640 if ((mask & 0x1) && cpu != outgoingcpu)
1433 cpumask_set_cpu(cpu, cm); 1641 cpumask_set_cpu(cpu, cm);
@@ -1437,36 +1645,62 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1437 cpumask_clear_cpu(cpu, cm); 1645 cpumask_clear_cpu(cpu, cm);
1438 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1646 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1439 } 1647 }
1440 set_cpus_allowed_ptr(t, cm); 1648 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649 rcu_boost_kthread_setaffinity(rnp, cm);
1441 free_cpumask_var(cm); 1650 free_cpumask_var(cm);
1442} 1651}
1443 1652
1444static struct smp_hotplug_thread rcu_cpu_thread_spec = { 1653/*
1445 .store = &rcu_cpu_kthread_task, 1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
1446 .thread_should_run = rcu_cpu_kthread_should_run, 1655 * Called during boot before online/offline can happen, or, if
1447 .thread_fn = rcu_cpu_kthread, 1656 * during runtime, with the main CPU-hotplug locks held. So only
1448 .thread_comm = "rcuc/%u", 1657 * one of these can be executing at a time.
1449 .setup = rcu_cpu_kthread_setup, 1658 */
1450 .park = rcu_cpu_kthread_park, 1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1451}; 1660 struct rcu_node *rnp)
1661{
1662 unsigned long flags;
1663 int rnp_index = rnp - &rsp->node[0];
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_scheduler_fully_active ||
1668 rnp->qsmaskinit == 0)
1669 return 0;
1670 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index);
1673 if (IS_ERR(t))
1674 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags);
1676 rnp->node_kthread_task = t;
1677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678 sp.sched_priority = 99;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681 }
1682 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1452 1684
1453/* 1685/*
1454 * Spawn all kthreads -- called as soon as the scheduler is running. 1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1455 */ 1687 */
1456static int __init rcu_spawn_kthreads(void) 1688static int __init rcu_spawn_kthreads(void)
1457{ 1689{
1458 struct rcu_node *rnp;
1459 int cpu; 1690 int cpu;
1691 struct rcu_node *rnp;
1460 1692
1461 rcu_scheduler_fully_active = 1; 1693 rcu_scheduler_fully_active = 1;
1462 for_each_possible_cpu(cpu) 1694 for_each_possible_cpu(cpu) {
1463 per_cpu(rcu_cpu_has_work, cpu) = 0; 1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1464 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1696 if (cpu_online(cpu))
1697 (void)rcu_spawn_one_cpu_kthread(cpu);
1698 }
1465 rnp = rcu_get_root(rcu_state); 1699 rnp = rcu_get_root(rcu_state);
1466 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1700 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1467 if (NUM_RCU_NODES > 1) { 1701 if (NUM_RCU_NODES > 1) {
1468 rcu_for_each_leaf_node(rcu_state, rnp) 1702 rcu_for_each_leaf_node(rcu_state, rnp)
1469 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1703 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1470 } 1704 }
1471 return 0; 1705 return 0;
1472} 1706}
@@ -1478,8 +1712,11 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1478 struct rcu_node *rnp = rdp->mynode; 1712 struct rcu_node *rnp = rdp->mynode;
1479 1713
1480 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1481 if (rcu_scheduler_fully_active) 1715 if (rcu_scheduler_fully_active) {
1482 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1717 if (rnp->node_kthread_task == NULL)
1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719 }
1483} 1720}
1484 1721
1485#else /* #ifdef CONFIG_RCU_BOOST */ 1722#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1494,16 +1731,23 @@ static void invoke_rcu_callbacks_kthread(void)
1494 WARN_ON_ONCE(1); 1731 WARN_ON_ONCE(1);
1495} 1732}
1496 1733
1497static bool rcu_is_callbacks_kthread(void) 1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1498{ 1735{
1499 return false;
1500} 1736}
1501 1737
1502static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1503{ 1747{
1504} 1748}
1505 1749
1506static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1507{ 1751{
1508} 1752}
1509 1753
@@ -1520,978 +1764,247 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1520 1764
1521#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1522 1766
1523#if !defined(CONFIG_RCU_FAST_NO_HZ) 1767#ifndef CONFIG_SMP
1524
1525/*
1526 * Check to see if any future RCU-related work will need to be done
1527 * by the current CPU, even if none need be done immediately, returning
1528 * 1 if so. This function is part of the RCU implementation; it is -not-
1529 * an exported member of the RCU API.
1530 *
1531 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1532 * any flavor of RCU.
1533 */
1534int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1535{
1536 *delta_jiffies = ULONG_MAX;
1537 return rcu_cpu_has_callbacks(cpu);
1538}
1539 1768
1540/* 1769void synchronize_sched_expedited(void)
1541 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1542 */
1543static void rcu_prepare_for_idle_init(int cpu)
1544{ 1770{
1771 cond_resched();
1545} 1772}
1773EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1546 1774
1547/* 1775#else /* #ifndef CONFIG_SMP */
1548 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1549 * after it.
1550 */
1551static void rcu_cleanup_after_idle(int cpu)
1552{
1553}
1554 1776
1555/* 1777static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1556 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1778static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1557 * is nothing.
1558 */
1559static void rcu_prepare_for_idle(int cpu)
1560{
1561}
1562 1779
1563/* 1780static int synchronize_sched_expedited_cpu_stop(void *data)
1564 * Don't bother keeping a running count of the number of RCU callbacks
1565 * posted because CONFIG_RCU_FAST_NO_HZ=n.
1566 */
1567static void rcu_idle_count_callbacks_posted(void)
1568{ 1781{
1782 /*
1783 * There must be a full memory barrier on each affected CPU
1784 * between the time that try_stop_cpus() is called and the
1785 * time that it returns.
1786 *
1787 * In the current initial implementation of cpu_stop, the
1788 * above condition is already met when the control reaches
1789 * this point and the following smp_mb() is not strictly
1790 * necessary. Do smp_mb() anyway for documentation and
1791 * robustness against future implementation changes.
1792 */
1793 smp_mb(); /* See above comment block. */
1794 return 0;
1569} 1795}
1570 1796
1571#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1572
1573/* 1797/*
1574 * This code is invoked when a CPU goes idle, at which point we want 1798 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1575 * to have the CPU do everything required for RCU so that it can enter 1799 * approach to force grace period to end quickly. This consumes
1576 * the energy-efficient dyntick-idle mode. This is handled by a 1800 * significant time on all CPUs, and is thus not recommended for
1577 * state machine implemented by rcu_prepare_for_idle() below. 1801 * any sort of common-case code.
1578 * 1802 *
1579 * The following four preprocessor symbols control this state machine: 1803 * Note that it is illegal to call this function while holding any
1804 * lock that is acquired by a CPU-hotplug notifier. Failing to
1805 * observe this restriction will result in deadlock.
1580 * 1806 *
1581 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt 1807 * This implementation can be thought of as an application of ticket
1582 * to satisfy RCU. Beyond this point, it is better to incur a periodic 1808 * locking to RCU, with sync_sched_expedited_started and
1583 * scheduling-clock interrupt than to loop through the state machine 1809 * sync_sched_expedited_done taking on the roles of the halves
1584 * at full power. 1810 * of the ticket-lock word. Each task atomically increments
1585 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are 1811 * sync_sched_expedited_started upon entry, snapshotting the old value,
1586 * optional if RCU does not need anything immediately from this 1812 * then attempts to stop all the CPUs. If this succeeds, then each
1587 * CPU, even if this CPU still has RCU callbacks queued. The first 1813 * CPU will have executed a context switch, resulting in an RCU-sched
1588 * times through the state machine are mandatory: we need to give 1814 * grace period. We are then done, so we use atomic_cmpxchg() to
1589 * the state machine a chance to communicate a quiescent state 1815 * update sync_sched_expedited_done to match our snapshot -- but
1590 * to the RCU core. 1816 * only if someone else has not already advanced past our snapshot.
1591 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1592 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1593 * is sized to be roughly one RCU grace period. Those energy-efficiency
1594 * benchmarkers who might otherwise be tempted to set this to a large
1595 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1596 * system. And if you are -that- concerned about energy efficiency,
1597 * just power the system down and be done with it!
1598 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1599 * permitted to sleep in dyntick-idle mode with only lazy RCU
1600 * callbacks pending. Setting this too high can OOM your system.
1601 * 1817 *
1602 * The values below work well in practice. If future workloads require 1818 * On the other hand, if try_stop_cpus() fails, we check the value
1603 * adjustment, they can be converted into kernel config parameters, though 1819 * of sync_sched_expedited_done. If it has advanced past our
1604 * making the state machine smarter might be a better option. 1820 * initial snapshot, then someone else must have forced a grace period
1605 */ 1821 * some time after we took our snapshot. In this case, our work is
1606#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1822 * done for us, and we can simply return. Otherwise, we try again,
1607#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1823 * but keep our initial snapshot for purposes of checking for someone
1608#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1824 * doing our work for us.
1609#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1825 *
1610 1826 * If we fail too many times in a row, we fall back to synchronize_sched().
1611extern int tick_nohz_enabled;
1612
1613/*
1614 * Does the specified flavor of RCU have non-lazy callbacks pending on
1615 * the specified CPU? Both RCU flavor and CPU are specified by the
1616 * rcu_data structure.
1617 */ 1827 */
1618static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) 1828void synchronize_sched_expedited(void)
1619{ 1829{
1620 return rdp->qlen != rdp->qlen_lazy; 1830 int firstsnap, s, snap, trycount = 0;
1621}
1622 1831
1623#ifdef CONFIG_TREE_PREEMPT_RCU 1832 /* Note that atomic_inc_return() implies full memory barrier. */
1833 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1834 get_online_cpus();
1624 1835
1625/* 1836 /*
1626 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1837 * Each pass through the following loop attempts to force a
1627 * is no RCU-preempt in the kernel.) 1838 * context switch on each CPU.
1628 */ 1839 */
1629static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1840 while (try_stop_cpus(cpu_online_mask,
1630{ 1841 synchronize_sched_expedited_cpu_stop,
1631 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1842 NULL) == -EAGAIN) {
1843 put_online_cpus();
1632 1844
1633 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1845 /* No joy, try again later. Or just synchronize_sched(). */
1634} 1846 if (trycount++ < 10)
1847 udelay(trycount * num_online_cpus());
1848 else {
1849 synchronize_sched();
1850 return;
1851 }
1635 1852
1636#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1853 /* Check to see if someone else did our work for us. */
1854 s = atomic_read(&sync_sched_expedited_done);
1855 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1856 smp_mb(); /* ensure test happens before caller kfree */
1857 return;
1858 }
1637 1859
1638static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1860 /*
1639{ 1861 * Refetching sync_sched_expedited_started allows later
1640 return 0; 1862 * callers to piggyback on our grace period. We subtract
1641} 1863 * 1 to get the same token that the last incrementer got.
1864 * We retry after they started, so our grace period works
1865 * for them, and they started after our first try, so their
1866 * grace period works for us.
1867 */
1868 get_online_cpus();
1869 snap = atomic_read(&sync_sched_expedited_started) - 1;
1870 smp_mb(); /* ensure read is before try_stop_cpus(). */
1871 }
1642 1872
1643#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1873 /*
1874 * Everyone up to our most recent fetch is covered by our grace
1875 * period. Update the counter, but only if our work is still
1876 * relevant -- which it won't be if someone who started later
1877 * than we did beat us to the punch.
1878 */
1879 do {
1880 s = atomic_read(&sync_sched_expedited_done);
1881 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1882 smp_mb(); /* ensure test happens before caller kfree */
1883 break;
1884 }
1885 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1644 1886
1645/* 1887 put_online_cpus();
1646 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
1647 */
1648static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
1649{
1650 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1651 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1652 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1653} 1888}
1889EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1654 1890
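The ticket-snapshot scheme described in the block comment above can be illustrated with a standalone sketch. This is only a sketch under stated assumptions, not the kernel code: demo_stop_all_cpus() stands in for try_stop_cpus() over all online CPUs, the demo_ names are invented for illustration, and memory barriers and counter wrap are ignored for clarity.

    #include <linux/atomic.h>

    /*
     * Sketch only: "started" hands out tickets, "done" records the newest
     * ticket whose grace period is known to have completed.
     */
    static atomic_t demo_started = ATOMIC_INIT(0);
    static atomic_t demo_done = ATOMIC_INIT(0);

    static int demo_stop_all_cpus(void);	/* assumed: forces a context switch on every CPU */

    static void demo_expedited(void)
    {
    	int s, snap;

    	snap = atomic_inc_return(&demo_started);	/* take a ticket */
    	while (!demo_stop_all_cpus()) {
    		s = atomic_read(&demo_done);
    		if (s >= snap)
    			return;		/* a later grace period already covered us */
    		snap = atomic_read(&demo_started) - 1;	/* piggyback on later starters */
    	}
    	/* Our forced context switches ended a grace period: publish it. */
    	do {
    		s = atomic_read(&demo_done);
    		if (s >= snap)
    			break;		/* someone advanced past our snapshot */
    	} while (atomic_cmpxchg(&demo_done, s, snap) != s);
    }

The sketch mirrors the structure above: a failed stop attempt first checks whether someone else's grace period already covers this caller before retrying, and the final cmpxchg advances the "done" ticket only if no later caller has already moved it further.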
1655/* 1891#endif /* #else #ifndef CONFIG_SMP */
1656 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1657 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1658 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1659 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1660 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1661 * it is better to incur scheduling-clock interrupts than to spin
1662 * continuously for the same time duration!
1663 *
1664 * The delta_jiffies argument is used to store the time when RCU is
1665 * going to need the CPU again if it still has callbacks. The reason
1666 * for this is that rcu_prepare_for_idle() might need to post a timer,
1667 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1668 * the wakeup time for this CPU. This means that RCU's timer can be
1669 * delayed until the wakeup time, which defeats the purpose of posting
1670 * a timer.
1671 */
1672int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1673{
1674 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1675
1676 /* Flag a new idle sojourn to the idle-entry state machine. */
1677 rdtp->idle_first_pass = 1;
1678 /* If no callbacks, RCU doesn't need the CPU. */
1679 if (!rcu_cpu_has_callbacks(cpu)) {
1680 *delta_jiffies = ULONG_MAX;
1681 return 0;
1682 }
1683 if (rdtp->dyntick_holdoff == jiffies) {
1684 /* RCU recently tried and failed, so don't try again. */
1685 *delta_jiffies = 1;
1686 return 1;
1687 }
1688 /* Set up for the possibility that RCU will post a timer. */
1689 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1690 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
1691 RCU_IDLE_GP_DELAY) - jiffies;
1692 } else {
1693 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1694 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1695 }
1696 return 0;
1697}
1698 1892
1699/* 1893#if !defined(CONFIG_RCU_FAST_NO_HZ)
1700 * Handler for smp_call_function_single(). The only point of this
1701 * handler is to wake the CPU up, so the handler does only tracing.
1702 */
1703void rcu_idle_demigrate(void *unused)
1704{
1705 trace_rcu_prep_idle("Demigrate");
1706}
1707 1894
1708/* 1895/*
1709 * Timer handler used to force CPU to start pushing its remaining RCU 1896 * Check to see if any future RCU-related work will need to be done
1710 * callbacks in the case where it entered dyntick-idle mode with callbacks 1897 * by the current CPU, even if none need be done immediately, returning
1711 * pending. The handler doesn't really need to do anything because the 1898 * 1 if so. This function is part of the RCU implementation; it is -not-
1712 * real work is done upon re-entry to idle, or by the next scheduling-clock 1899 * an exported member of the RCU API.
1713 * interrupt should idle not be re-entered.
1714 * 1900 *
1715 * One special case: the timer gets migrated without awakening the CPU 1901 * Because we have preemptible RCU, just check whether this CPU needs
1716 * on which the timer was scheduled. In this case, we must wake up 1902 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
1717 * that CPU. We do so with smp_call_function_single(). 1903 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1718 */ 1904 */
1719static void rcu_idle_gp_timer_func(unsigned long cpu_in) 1905int rcu_needs_cpu(int cpu)
1720{ 1906{
1721 int cpu = (int)cpu_in; 1907 return rcu_needs_cpu_quick_check(cpu);
1722
1723 trace_rcu_prep_idle("Timer");
1724 if (cpu != smp_processor_id())
1725 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1726 else
1727 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1728} 1908}
1729 1909
1730/* 1910/*
1731 * Initialize the timer used to pull CPUs out of dyntick-idle mode. 1911 * Check to see if we need to continue a callback-flush operation to
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
1913 * entry is not configured, so we never do need to.
1732 */ 1914 */
1733static void rcu_prepare_for_idle_init(int cpu) 1915static void rcu_needs_cpu_flush(void)
1734{ 1916{
1735 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1736
1737 rdtp->dyntick_holdoff = jiffies - 1;
1738 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1739 rdtp->idle_gp_timer_expires = jiffies - 1;
1740 rdtp->idle_first_pass = 1;
1741} 1917}
1742 1918
1743/* 1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1744 * Clean up for exit from idle. Because we are exiting from idle, there
1745 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1746 * do nothing if this timer is not active, so just cancel it unconditionally.
1747 */
1748static void rcu_cleanup_after_idle(int cpu)
1749{
1750 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1751 1920
1752 del_timer(&rdtp->idle_gp_timer); 1921#define RCU_NEEDS_CPU_FLUSHES 5
1753 trace_rcu_prep_idle("Cleanup after idle"); 1922static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1754 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); 1923static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1755}
1756 1924
1757/* 1925/*
1758 * Check to see if any RCU-related work can be done by the current CPU, 1926 * Check to see if any future RCU-related work will need to be done
1759 * and if so, schedule a softirq to get it done. This function is part 1927 * by the current CPU, even if none need be done immediately, returning
1760 * of the RCU implementation; it is -not- an exported member of the RCU API. 1928 * 1 if so. This function is part of the RCU implementation; it is -not-
1929 * an exported member of the RCU API.
1761 * 1930 *
1762 * The idea is for the current CPU to clear out all work required by the 1931 * Because we are not supporting preemptible RCU, attempt to accelerate
1763 * RCU core for the current grace period, so that this CPU can be permitted 1932 * any current grace periods so that RCU no longer needs this CPU, but
1764 * to enter dyntick-idle mode. In some cases, it will need to be awakened 1933 * only if all other CPUs are already in dynticks-idle mode. This will
1765 * at the end of the grace period by whatever CPU ends the grace period. 1934 * allow the CPU cores to be powered down immediately, as opposed to after
1766 * This allows CPUs to go dyntick-idle more quickly, and to reduce the 1935 * waiting many milliseconds for grace periods to elapse.
1767 * number of wakeups by a modest integer factor.
1768 * 1936 *
1769 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1770 * disabled, we do one pass of force_quiescent_state(), then do an 1938 * disabled, we do one pass of force_quiescent_state(), then do an
1771 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1772 * later. The ->dyntick_drain field controls the sequencing. 1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1773 *
1774 * The caller must have disabled interrupts.
1775 */ 1941 */
1776static void rcu_prepare_for_idle(int cpu) 1942int rcu_needs_cpu(int cpu)
1777{ 1943{
1778 struct timer_list *tp; 1944 int c = 0;
1779 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1945 int snap;
1780 int tne; 1946 int thatcpu;
1781 1947
1782 /* Handle nohz enablement switches conservatively. */ 1948 /* Check for being in the holdoff period. */
1783 tne = ACCESS_ONCE(tick_nohz_enabled); 1949 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1784 if (tne != rdtp->tick_nohz_enabled_snap) { 1950 return rcu_needs_cpu_quick_check(cpu);
1785 if (rcu_cpu_has_callbacks(cpu))
1786 invoke_rcu_core(); /* force nohz to see update. */
1787 rdtp->tick_nohz_enabled_snap = tne;
1788 return;
1789 }
1790 if (!tne)
1791 return;
1792 1951
1793 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1952 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1794 if (!is_idle_task(current)) { 1953 for_each_online_cpu(thatcpu) {
1795 rdtp->dyntick_holdoff = jiffies - 1; 1954 if (thatcpu == cpu)
1796 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1955 continue;
1797 trace_rcu_prep_idle("User dyntick with callbacks"); 1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1798 rdtp->idle_gp_timer_expires = 1957 thatcpu).dynticks);
1799 round_up(jiffies + RCU_IDLE_GP_DELAY, 1958 smp_mb(); /* Order sampling of snap with end of grace period. */
1800 RCU_IDLE_GP_DELAY); 1959 if ((snap & 0x1) != 0) {
1801 } else if (rcu_cpu_has_callbacks(cpu)) { 1960 per_cpu(rcu_dyntick_drain, cpu) = 0;
1802 rdtp->idle_gp_timer_expires = 1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1803 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); 1962 return rcu_needs_cpu_quick_check(cpu);
1804 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1805 } else {
1806 return;
1807 }
1808 tp = &rdtp->idle_gp_timer;
1809 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1810 return;
1811 }
1812
1813 /*
1814 * If this is an idle re-entry, for example, due to use of
1815 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
1816 * loop, then don't take any state-machine actions, unless the
1817 * momentary exit from idle queued additional non-lazy callbacks.
1818 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1819 * pending.
1820 */
1821 if (!rdtp->idle_first_pass &&
1822 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
1823 if (rcu_cpu_has_callbacks(cpu)) {
1824 tp = &rdtp->idle_gp_timer;
1825 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1826 } 1963 }
1827 return;
1828 } 1964 }
1829 rdtp->idle_first_pass = 0;
1830 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1831 1965
1832 /* 1966 /* Check and update the rcu_dyntick_drain sequencing. */
1833 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1967 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1834 * Also reset state to avoid prejudicing later attempts.
1835 */
1836 if (!rcu_cpu_has_callbacks(cpu)) {
1837 rdtp->dyntick_holdoff = jiffies - 1;
1838 rdtp->dyntick_drain = 0;
1839 trace_rcu_prep_idle("No callbacks");
1840 return;
1841 }
1842
1843 /*
1844 * If in holdoff mode, just return. We will presumably have
1845 * refrained from disabling the scheduling-clock tick.
1846 */
1847 if (rdtp->dyntick_holdoff == jiffies) {
1848 trace_rcu_prep_idle("In holdoff");
1849 return;
1850 }
1851
1852 /* Check and update the ->dyntick_drain sequencing. */
1853 if (rdtp->dyntick_drain <= 0) {
1854 /* First time through, initialize the counter. */ 1968 /* First time through, initialize the counter. */
1855 rdtp->dyntick_drain = RCU_IDLE_FLUSHES; 1969 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1856 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && 1970 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1857 !rcu_pending(cpu) &&
1858 !local_softirq_pending()) {
1859 /* Can we go dyntick-idle despite still having callbacks? */
1860 rdtp->dyntick_drain = 0;
1861 rdtp->dyntick_holdoff = jiffies;
1862 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1863 trace_rcu_prep_idle("Dyntick with callbacks");
1864 rdtp->idle_gp_timer_expires =
1865 round_up(jiffies + RCU_IDLE_GP_DELAY,
1866 RCU_IDLE_GP_DELAY);
1867 } else {
1868 rdtp->idle_gp_timer_expires =
1869 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1870 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1871 }
1872 tp = &rdtp->idle_gp_timer;
1873 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1874 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1875 return; /* Nothing more to do immediately. */
1876 } else if (--(rdtp->dyntick_drain) <= 0) {
1877 /* We have hit the limit, so time to give up. */ 1971 /* We have hit the limit, so time to give up. */
1878 rdtp->dyntick_holdoff = jiffies; 1972 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1879 trace_rcu_prep_idle("Begin holdoff"); 1973 return rcu_needs_cpu_quick_check(cpu);
1880 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1881 return;
1882 } 1974 }
1883 1975
1884 /* 1976 /* Do one step pushing remaining RCU callbacks through. */
1885 * Do one step of pushing the remaining RCU callbacks through
1886 * the RCU core state machine.
1887 */
1888#ifdef CONFIG_TREE_PREEMPT_RCU
1889 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1890 rcu_preempt_qs(cpu);
1891 force_quiescent_state(&rcu_preempt_state);
1892 }
1893#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1894 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 1977 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1895 rcu_sched_qs(cpu); 1978 rcu_sched_qs(cpu);
1896 force_quiescent_state(&rcu_sched_state); 1979 force_quiescent_state(&rcu_sched_state, 0);
1980 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1897 } 1981 }
1898 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 1982 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1899 rcu_bh_qs(cpu); 1983 rcu_bh_qs(cpu);
1900 force_quiescent_state(&rcu_bh_state); 1984 force_quiescent_state(&rcu_bh_state, 0);
1985 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1901 } 1986 }
1902 1987
1903 /* 1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1904 * If RCU callbacks are still pending, RCU still needs this CPU. 1989 if (c)
1905 * So try forcing the callbacks through the grace period.
1906 */
1907 if (rcu_cpu_has_callbacks(cpu)) {
1908 trace_rcu_prep_idle("More callbacks");
1909 invoke_rcu_core(); 1990 invoke_rcu_core();
1910 } else { 1991 return c;
1911 trace_rcu_prep_idle("Callbacks drained");
1912 }
1913}
1914
1915/*
1916 * Keep a running count of the number of non-lazy callbacks posted
1917 * on this CPU. This running counter (which is never decremented) allows
1918 * rcu_prepare_for_idle() to detect when something out of the idle loop
1919 * posts a callback, even if an equal number of callbacks are invoked.
1920 * Of course, callbacks should only be posted from within a trace event
1921 * designed to be called from idle or from within RCU_NONIDLE().
1922 */
1923static void rcu_idle_count_callbacks_posted(void)
1924{
1925 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1926}
1927
1928/*
1929 * Data for flushing lazy RCU callbacks at OOM time.
1930 */
1931static atomic_t oom_callback_count;
1932static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1933
1934/*
1935 * RCU OOM callback -- decrement the outstanding count and deliver the
1936 * wake-up if we are the last one.
1937 */
1938static void rcu_oom_callback(struct rcu_head *rhp)
1939{
1940 if (atomic_dec_and_test(&oom_callback_count))
1941 wake_up(&oom_callback_wq);
1942}
1943
1944/*
1945 * Post an rcu_oom_notify callback on the current CPU if it has at
1946 * least one lazy callback. This will unnecessarily post callbacks
1947 * to CPUs that already have a non-lazy callback at the end of their
1948 * callback list, but this is an infrequent operation, so accept some
1949 * extra overhead to keep things simple.
1950 */
1951static void rcu_oom_notify_cpu(void *unused)
1952{
1953 struct rcu_state *rsp;
1954 struct rcu_data *rdp;
1955
1956 for_each_rcu_flavor(rsp) {
1957 rdp = __this_cpu_ptr(rsp->rda);
1958 if (rdp->qlen_lazy != 0) {
1959 atomic_inc(&oom_callback_count);
1960 rsp->call(&rdp->oom_head, rcu_oom_callback);
1961 }
1962 }
1963}
1964
1965/*
1966 * If low on memory, ensure that each CPU has a non-lazy callback.
1967 * This will wake up CPUs that have only lazy callbacks, in turn
1968 * ensuring that they free up the corresponding memory in a timely manner.
1969 * Because an uncertain amount of memory will be freed in some uncertain
1970 * timeframe, we do not claim to have freed anything.
1971 */
1972static int rcu_oom_notify(struct notifier_block *self,
1973 unsigned long notused, void *nfreed)
1974{
1975 int cpu;
1976
1977 /* Wait for callbacks from earlier instance to complete. */
1978 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1979
1980 /*
1981 * Prevent premature wakeup: ensure that all increments happen
1982 * before there is a chance of the counter reaching zero.
1983 */
1984 atomic_set(&oom_callback_count, 1);
1985
1986 get_online_cpus();
1987 for_each_online_cpu(cpu) {
1988 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1989 cond_resched();
1990 }
1991 put_online_cpus();
1992
1993 /* Unconditionally decrement: no need to wake ourselves up. */
1994 atomic_dec(&oom_callback_count);
1995
1996 return NOTIFY_OK;
1997}
1998
1999static struct notifier_block rcu_oom_nb = {
2000 .notifier_call = rcu_oom_notify
2001};
2002
2003static int __init rcu_register_oom_notifier(void)
2004{
2005 register_oom_notifier(&rcu_oom_nb);
2006 return 0;
2007}
2008early_initcall(rcu_register_oom_notifier);
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2011
2012#ifdef CONFIG_RCU_CPU_STALL_INFO
2013
2014#ifdef CONFIG_RCU_FAST_NO_HZ
2015
2016static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2017{
2018 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2019 struct timer_list *tltp = &rdtp->idle_gp_timer;
2020 char c;
2021
2022 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
2023 if (timer_pending(tltp))
2024 sprintf(cp, "drain=%d %c timer=%lu",
2025 rdtp->dyntick_drain, c, tltp->expires - jiffies);
2026 else
2027 sprintf(cp, "drain=%d %c timer not pending",
2028 rdtp->dyntick_drain, c);
2029}
2030
2031#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2032
2033static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2034{
2035 *cp = '\0';
2036}
2037
2038#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2039
2040/* Initiate the stall-info list. */
2041static void print_cpu_stall_info_begin(void)
2042{
2043 printk(KERN_CONT "\n");
2044}
2045
2046/*
2047 * Print out diagnostic information for the specified stalled CPU.
2048 *
2049 * If the specified CPU is aware of the current RCU grace period
2050 * (flavor specified by rsp), then print the number of scheduling
2051 * clock interrupts the CPU has taken during the time that it has
2052 * been aware. Otherwise, print the number of RCU grace periods
2053 * that this CPU is ignorant of, for example, "1" if the CPU was
2054 * aware of the previous grace period.
2055 *
2056 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2057 */
2058static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2059{
2060 char fast_no_hz[72];
2061 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2062 struct rcu_dynticks *rdtp = rdp->dynticks;
2063 char *ticks_title;
2064 unsigned long ticks_value;
2065
2066 if (rsp->gpnum == rdp->gpnum) {
2067 ticks_title = "ticks this GP";
2068 ticks_value = rdp->ticks_this_gp;
2069 } else {
2070 ticks_title = "GPs behind";
2071 ticks_value = rsp->gpnum - rdp->gpnum;
2072 }
2073 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2074 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2075 cpu, ticks_value, ticks_title,
2076 atomic_read(&rdtp->dynticks) & 0xfff,
2077 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2078 fast_no_hz);
2079}
2080
2081/* Terminate the stall-info list. */
2082static void print_cpu_stall_info_end(void)
2083{
2084 printk(KERN_ERR "\t");
2085}
2086
2087/* Zero ->ticks_this_gp for all flavors of RCU. */
2088static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2089{
2090 rdp->ticks_this_gp = 0;
2091}
2092
2093/* Increment ->ticks_this_gp for all flavors of RCU. */
2094static void increment_cpu_stall_ticks(void)
2095{
2096 struct rcu_state *rsp;
2097
2098 for_each_rcu_flavor(rsp)
2099 __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
2100}
2101
2102#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2103
2104static void print_cpu_stall_info_begin(void)
2105{
2106 printk(KERN_CONT " {");
2107}
2108
2109static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2110{
2111 printk(KERN_CONT " %d", cpu);
2112}
2113
2114static void print_cpu_stall_info_end(void)
2115{
2116 printk(KERN_CONT "} ");
2117}
2118
2119static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2120{
2121}
2122
2123static void increment_cpu_stall_ticks(void)
2124{
2125}
2126
2127#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2128
2129#ifdef CONFIG_RCU_NOCB_CPU
2130
2131/*
2132 * Offload callback processing from the boot-time-specified set of CPUs
2133 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2134 * kthread created that pulls the callbacks from the corresponding CPU,
2135 * waits for a grace period to elapse, and invokes the callbacks.
2136 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2137 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2138 * has been specified, in which case each kthread actively polls its
2139 * CPU. (Which isn't so great for energy efficiency, but which does
2140 * reduce RCU's overhead on that CPU.)
2141 *
2142 * This is intended to be used in conjunction with Frederic Weisbecker's
2143 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2144 * running CPU-bound user-mode computations.
2145 *
2146 * Offloading of callback processing could also in theory be used as
2147 * an energy-efficiency measure because CPUs with no RCU callbacks
2148 * queued are more aggressive about entering dyntick-idle mode.
2149 */
2150
2151
2152/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2153static int __init rcu_nocb_setup(char *str)
2154{
2155 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2156 have_rcu_nocb_mask = true;
2157 cpulist_parse(str, rcu_nocb_mask);
2158 return 1;
2159}
2160__setup("rcu_nocbs=", rcu_nocb_setup);
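As a usage illustration (the CPU list is assumed, not taken from this patch): booting with rcu_nocbs=1-3 on the kernel command line makes rcu_nocb_setup() above parse that list into rcu_nocb_mask, so CPUs 1-3 become no-CBs CPUs; their callbacks are then invoked by the per-CPU "rcuo%d" kthreads created in rcu_spawn_nocb_kthreads() below instead of being processed locally on those CPUs.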
2161
2162/* Is the specified CPU a no-CBs CPU? */
2163static bool is_nocb_cpu(int cpu)
2164{
2165 if (have_rcu_nocb_mask)
2166 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2167 return false;
2168}
2169
2170/*
2171 * Enqueue the specified string of rcu_head structures onto the specified
2172 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2173 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2174 * counts are supplied by rhcount and rhcount_lazy.
2175 *
2176 * If warranted, also wake up the kthread servicing this CPU's queues.
2177 */
2178static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2179 struct rcu_head *rhp,
2180 struct rcu_head **rhtp,
2181 int rhcount, int rhcount_lazy)
2182{
2183 int len;
2184 struct rcu_head **old_rhpp;
2185 struct task_struct *t;
2186
2187 /* Enqueue the callback on the nocb list and update counts. */
2188 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2189 ACCESS_ONCE(*old_rhpp) = rhp;
2190 atomic_long_add(rhcount, &rdp->nocb_q_count);
2191 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2192
2193 /* If we are not being polled and there is a kthread, awaken it ... */
2194 t = ACCESS_ONCE(rdp->nocb_kthread);
2195 if (rcu_nocb_poll || !t)
2196 return;
2197 len = atomic_long_read(&rdp->nocb_q_count);
2198 if (old_rhpp == &rdp->nocb_head) {
2199 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2200 rdp->qlen_last_fqs_check = 0;
2201 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2202 wake_up_process(t); /* ... or if many callbacks queued. */
2203 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2204 }
2205 return;
2206} 1992}
2207 1993
2208/* 1994/*
2209 * This is a helper for __call_rcu(), which invokes this when the normal 1995 * Check to see if we need to continue a callback-flush operation to
2210 * callback queue is inoperable. If this is not a no-CBs CPU, this 1996 * allow the last CPU to enter dyntick-idle mode.
2211 * function returns failure back to __call_rcu(), which can complain
2212 * appropriately.
2213 *
2214 * Otherwise, this function queues the callback where the corresponding
2215 * "rcuo" kthread can find it.
2216 */ 1997 */
2217static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 1998static void rcu_needs_cpu_flush(void)
2218 bool lazy)
2219{ 1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2220 2002
2221 if (!is_nocb_cpu(rdp->cpu)) 2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2222 return 0;
2223 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2224 return 1;
2225}
2226
2227/*
2228 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2229 * not a no-CBs CPU.
2230 */
2231static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2232 struct rcu_data *rdp)
2233{
2234 long ql = rsp->qlen;
2235 long qll = rsp->qlen_lazy;
2236
2237 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2238 if (!is_nocb_cpu(smp_processor_id()))
2239 return 0;
2240 rsp->qlen = 0;
2241 rsp->qlen_lazy = 0;
2242
2243 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2244 if (rsp->orphan_donelist != NULL) {
2245 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2246 rsp->orphan_donetail, ql, qll);
2247 ql = qll = 0;
2248 rsp->orphan_donelist = NULL;
2249 rsp->orphan_donetail = &rsp->orphan_donelist;
2250 }
2251 if (rsp->orphan_nxtlist != NULL) {
2252 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2253 rsp->orphan_nxttail, ql, qll);
2254 ql = qll = 0;
2255 rsp->orphan_nxtlist = NULL;
2256 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2257 }
2258 return 1;
2259}
2260
2261/*
2262 * There must be at least one non-no-CBs CPU in operation at any given
2263 * time, because no-CBs CPUs are not capable of initiating grace periods
2264 * independently. This function therefore complains if the specified
2265 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2266 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2267 * but you have to have a base case!)
2268 */
2269static bool nocb_cpu_expendable(int cpu)
2270{
2271 cpumask_var_t non_nocb_cpus;
2272 int ret;
2273
2274 /*
2275 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2276 * then offlining this CPU is harmless. Let it happen.
2277 */
2278 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2279 return 1;
2280
2281 /* If no memory, play it safe and keep the CPU around. */
2282 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2283 return 0;
2284 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2285 cpumask_clear_cpu(cpu, non_nocb_cpus);
2286 ret = !cpumask_empty(non_nocb_cpus);
2287 free_cpumask_var(non_nocb_cpus);
2288 return ret;
2289}
2290
2291/*
2292 * Helper structure for remote registry of RCU callbacks.
2293 * This is needed for when a no-CBs CPU needs to start a grace period.
2294 * If it just invokes call_rcu(), the resulting callback will be queued,
2295 * which can result in deadlock.
2296 */
2297struct rcu_head_remote {
2298 struct rcu_head *rhp;
2299 call_rcu_func_t *crf;
2300 void (*func)(struct rcu_head *rhp);
2301};
2302
2303/*
2304 * Register a callback as specified by the rcu_head_remote struct.
2305 * This function is intended to be invoked via smp_call_function_single().
2306 */
2307static void call_rcu_local(void *arg)
2308{
2309 struct rcu_head_remote *rhrp =
2310 container_of(arg, struct rcu_head_remote, rhp);
2311
2312 rhrp->crf(rhrp->rhp, rhrp->func);
2313}
2314
2315/*
2316 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2317 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2318 * smp_call_function_single().
2319 */
2320static void invoke_crf_remote(struct rcu_head *rhp,
2321 void (*func)(struct rcu_head *rhp),
2322 call_rcu_func_t crf)
2323{
2324 struct rcu_head_remote rhr;
2325
2326 rhr.rhp = rhp;
2327 rhr.crf = crf;
2328 rhr.func = func;
2329 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2330}
2331
2332/*
2333 * Helper functions to be passed to wait_rcu_gp(), each of which
2334 * invokes invoke_crf_remote() to register a callback appropriately.
2335 */
2336static void __maybe_unused
2337call_rcu_preempt_remote(struct rcu_head *rhp,
2338 void (*func)(struct rcu_head *rhp))
2339{
2340 invoke_crf_remote(rhp, func, call_rcu);
2341}
2342static void call_rcu_bh_remote(struct rcu_head *rhp,
2343 void (*func)(struct rcu_head *rhp))
2344{
2345 invoke_crf_remote(rhp, func, call_rcu_bh);
2346}
2347static void call_rcu_sched_remote(struct rcu_head *rhp,
2348 void (*func)(struct rcu_head *rhp))
2349{
2350 invoke_crf_remote(rhp, func, call_rcu_sched);
2351}
2352
2353/*
2354 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2355 * callbacks queued by the corresponding no-CBs CPU.
2356 */
2357static int rcu_nocb_kthread(void *arg)
2358{
2359 int c, cl;
2360 struct rcu_head *list;
2361 struct rcu_head *next;
2362 struct rcu_head **tail;
2363 struct rcu_data *rdp = arg;
2364
2365 /* Each pass through this loop invokes one batch of callbacks */
2366 for (;;) {
2367 /* If not polling, wait for next batch of callbacks. */
2368 if (!rcu_nocb_poll)
2369 wait_event(rdp->nocb_wq, rdp->nocb_head);
2370 list = ACCESS_ONCE(rdp->nocb_head);
2371 if (!list) {
2372 schedule_timeout_interruptible(1);
2373 continue;
2374 }
2375
2376 /*
2377 * Extract queued callbacks, update counts, and wait
2378 * for a grace period to elapse.
2379 */
2380 ACCESS_ONCE(rdp->nocb_head) = NULL;
2381 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2382 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2383 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2384 ACCESS_ONCE(rdp->nocb_p_count) += c;
2385 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2386 wait_rcu_gp(rdp->rsp->call_remote);
2387
2388 /* Each pass through the following loop invokes a callback. */
2389 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2390 c = cl = 0;
2391 while (list) {
2392 next = list->next;
2393 /* Wait for enqueuing to complete, if needed. */
2394 while (next == NULL && &list->next != tail) {
2395 schedule_timeout_interruptible(1);
2396 next = list->next;
2397 }
2398 debug_rcu_head_unqueue(list);
2399 local_bh_disable();
2400 if (__rcu_reclaim(rdp->rsp->name, list))
2401 cl++;
2402 c++;
2403 local_bh_enable();
2404 list = next;
2405 }
2406 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2407 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2408 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2409 rdp->n_nocbs_invoked += c;
2410 }
2411 return 0;
2412}
2413
2414/* Initialize per-rcu_data variables for no-CBs CPUs. */
2415static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2416{
2417 rdp->nocb_tail = &rdp->nocb_head;
2418 init_waitqueue_head(&rdp->nocb_wq);
2419}
2420
2421/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2422static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2423{
2424 int cpu;
2425 struct rcu_data *rdp;
2426 struct task_struct *t;
2427
2428 if (rcu_nocb_mask == NULL)
2429 return;
2430 for_each_cpu(cpu, rcu_nocb_mask) {
2431 rdp = per_cpu_ptr(rsp->rda, cpu);
2432 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2433 BUG_ON(IS_ERR(t));
2434 ACCESS_ONCE(rdp->nocb_kthread) = t;
2435 }
2436}
2437
2438/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2439static void init_nocb_callback_list(struct rcu_data *rdp)
2440{
2441 if (rcu_nocb_mask == NULL ||
2442 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2443		return;
2444	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2445}
2446
2447/* Initialize the ->call_remote fields in the rcu_state structures. */
2448static void __init rcu_init_nocb(void)
2449{
2450#ifdef CONFIG_PREEMPT_RCU
2451 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2452#endif /* #ifdef CONFIG_PREEMPT_RCU */
2453 rcu_bh_state.call_remote = call_rcu_bh_remote;
2454 rcu_sched_state.call_remote = call_rcu_sched_remote;
2455}
2456
2457#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2458
2459static bool is_nocb_cpu(int cpu)
2460{
2461 return false;
2462}
2463
2464static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2465 bool lazy)
2466{
2467 return 0;
2468}
2469
2470static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2471 struct rcu_data *rdp)
2472{
2473 return 0;
2474}
2475
2476static bool nocb_cpu_expendable(int cpu)
2477{
2478 return 1;
2479}
2480
2481static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2482{
2483}
2484
2485static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2486{
2487}
2488
2489static void init_nocb_callback_list(struct rcu_data *rdp)
2490{
2491}
2492
2493static void __init rcu_init_nocb(void)
2494{
2495}
2496
2497#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
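
[Editor's aside] rcu_nocb_kthread() above drains its CPU's callback list by atomically snatching the whole pending list in one operation and then walking it with interrupts and softirqs managed per callback. The following is a minimal userspace sketch of that "grab everything at once" hand-off, using C11 atomics rather than the kernel's ACCESS_ONCE()/xchg() head-and-tail discipline; struct cb, cb_push() and cb_drain() are invented names, and the push side here is LIFO rather than the kernel's FIFO queue, so treat it as an illustration of the idea only.

/*
 * Userspace sketch of the batched callback hand-off (NOT kernel code).
 * Producers push callbacks; the consumer takes the entire list at once
 * and invokes each entry, mirroring the batching in rcu_nocb_kthread().
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *);
};

static _Atomic(struct cb *) cb_head;	/* pending callbacks */

/* Producer side: push one callback (roughly analogous to queuing a callback). */
static void cb_push(struct cb *c)
{
	struct cb *old = atomic_load(&cb_head);

	do {
		c->next = old;
	} while (!atomic_compare_exchange_weak(&cb_head, &old, c));
}

/* Consumer side: atomically take *all* queued callbacks, then run them. */
static int cb_drain(void)
{
	struct cb *list = atomic_exchange(&cb_head, NULL);
	int n = 0;

	while (list) {
		struct cb *next = list->next;

		list->func(list);
		list = next;
		n++;
	}
	return n;
}

static void print_cb(struct cb *c)
{
	printf("invoked callback %p\n", (void *)c);
	free(c);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct cb *c = malloc(sizeof(*c));

		c->func = print_cb;
		cb_push(c);
	}
	printf("drained %d callbacks\n", cb_drain());
	return 0;
}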
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa67..3b0c0986afc 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,62 +46,13 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
53{
54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
70
71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
72{
73 (*pos)++;
74 return r_start(m, pos);
75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
87 return 0;
88}
89
90static int rcubarrier_open(struct inode *inode, struct file *file)
91{
92 return single_open(file, show_rcubarrier, inode->i_private);
93}
94
95static const struct file_operations rcubarrier_fops = {
96 .owner = THIS_MODULE,
97 .open = rcubarrier_open,
98 .read = seq_read,
99 .llseek = no_llseek,
100 .release = seq_release,
101};
102
103#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
104 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
105static char convert_kthread_status(unsigned int kthread_status) 56static char convert_kthread_status(unsigned int kthread_status)
106{ 57{
107 if (kthread_status > RCU_KTHREAD_MAX) 58 if (kthread_status > RCU_KTHREAD_MAX)
@@ -113,26 +64,24 @@ static char convert_kthread_status(unsigned int kthread_status)
113 64
114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
115{ 66{
116 long ql, qll;
117
118 if (!rdp->beenonline) 67 if (!rdp->beenonline)
119 return; 68 return;
120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", 69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
121 rdp->cpu, 70 rdp->cpu,
122 cpu_is_offline(rdp->cpu) ? '!' : ' ', 71 cpu_is_offline(rdp->cpu) ? '!' : ' ',
123 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 72 rdp->completed, rdp->gpnum,
124 rdp->passed_quiesce, rdp->qs_pending); 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
125 seq_printf(m, " dt=%d/%llx/%d df=%lu", 74 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ
76 seq_printf(m, " dt=%d/%d/%d df=%lu",
126 atomic_read(&rdp->dynticks->dynticks), 77 atomic_read(&rdp->dynticks->dynticks),
127 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
128 rdp->dynticks->dynticks_nmi_nesting, 79 rdp->dynticks->dynticks_nmi_nesting,
129 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
130 seq_printf(m, " of=%lu", rdp->offline_fqs); 81#endif /* #ifdef CONFIG_NO_HZ */
131 rcu_nocb_q_lengths(rdp, &ql, &qll); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
132 qll += rdp->qlen_lazy; 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
133 ql += rdp->qlen; 84 rdp->qlen,
134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
135 qll, ql,
136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
137 rdp->nxttail[RCU_NEXT_TAIL]], 86 rdp->nxttail[RCU_NEXT_TAIL]],
138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -141,81 +90,130 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
141 rdp->nxttail[RCU_WAIT_TAIL]], 90 rdp->nxttail[RCU_WAIT_TAIL]],
142 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); 91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
143#ifdef CONFIG_RCU_BOOST 92#ifdef CONFIG_RCU_BOOST
144 seq_printf(m, " kt=%d/%c ktl=%x", 93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
145 per_cpu(rcu_cpu_has_work, rdp->cpu), 94 per_cpu(rcu_cpu_has_work, rdp->cpu),
146 convert_kthread_status(per_cpu(rcu_cpu_kthread_status, 95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
147 rdp->cpu)), 96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
149#endif /* #ifdef CONFIG_RCU_BOOST */ 99#endif /* #ifdef CONFIG_RCU_BOOST */
150 seq_printf(m, " b=%ld", rdp->blimit); 100 seq_printf(m, " b=%ld", rdp->blimit);
151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", 101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked, 102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
154} 103}
155 104
156static int show_rcudata(struct seq_file *m, void *v) 105#define PRINT_RCU_DATA(name, func, m) \
106 do { \
107 int _p_r_d_i; \
108 \
109 for_each_possible_cpu(_p_r_d_i) \
110 func(m, &per_cpu(name, _p_r_d_i)); \
111 } while (0)
112
113static int show_rcudata(struct seq_file *m, void *unused)
157{ 114{
158 print_one_rcu_data(m, (struct rcu_data *)v); 115#ifdef CONFIG_TREE_PREEMPT_RCU
116 seq_puts(m, "rcu_preempt:\n");
117 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
118#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
119 seq_puts(m, "rcu_sched:\n");
120 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
121 seq_puts(m, "rcu_bh:\n");
122 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
159 return 0; 123 return 0;
160} 124}
161 125
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
169static int rcudata_open(struct inode *inode, struct file *file) 126static int rcudata_open(struct inode *inode, struct file *file)
170{ 127{
171 return r_open(inode, file, &rcudate_op); 128 return single_open(file, show_rcudata, NULL);
172} 129}
173 130
174static const struct file_operations rcudata_fops = { 131static const struct file_operations rcudata_fops = {
175 .owner = THIS_MODULE, 132 .owner = THIS_MODULE,
176 .open = rcudata_open, 133 .open = rcudata_open,
177 .read = seq_read, 134 .read = seq_read,
178 .llseek = no_llseek, 135 .llseek = seq_lseek,
179 .release = seq_release, 136 .release = single_release,
180}; 137};
181 138
182static int show_rcuexp(struct seq_file *m, void *v) 139static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
183{ 140{
184 struct rcu_state *rsp = (struct rcu_state *)m->private; 141 if (!rdp->beenonline)
185 142 return;
186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", 143 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
187 atomic_long_read(&rsp->expedited_start), 144 rdp->cpu,
188 atomic_long_read(&rsp->expedited_done), 145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
189 atomic_long_read(&rsp->expedited_wrap), 146 rdp->completed, rdp->gpnum,
190 atomic_long_read(&rsp->expedited_tryfail), 147 rdp->passed_quiesc, rdp->passed_quiesc_completed,
191 atomic_long_read(&rsp->expedited_workdone1), 148 rdp->qs_pending);
192 atomic_long_read(&rsp->expedited_workdone2), 149#ifdef CONFIG_NO_HZ
193 atomic_long_read(&rsp->expedited_normal), 150 seq_printf(m, ",%d,%d,%d,%lu",
194 atomic_long_read(&rsp->expedited_stoppedcpus), 151 atomic_read(&rdp->dynticks->dynticks),
195 atomic_long_read(&rsp->expedited_done_tries), 152 rdp->dynticks->dynticks_nesting,
196 atomic_long_read(&rsp->expedited_done_lost), 153 rdp->dynticks->dynticks_nmi_nesting,
197 atomic_long_read(&rsp->expedited_done_exit)); 154 rdp->dynticks_fqs);
155#endif /* #ifdef CONFIG_NO_HZ */
156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
172 seq_printf(m, ",%lu,%lu,%lu\n",
173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
174}
175
176static int show_rcudata_csv(struct seq_file *m, void *unused)
177{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
179#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */
182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
187#ifdef CONFIG_TREE_PREEMPT_RCU
188 seq_puts(m, "\"rcu_preempt:\"\n");
189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
190#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
191 seq_puts(m, "\"rcu_sched:\"\n");
192 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
193 seq_puts(m, "\"rcu_bh:\"\n");
194 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
198 return 0; 195 return 0;
199} 196}
200 197
201static int rcuexp_open(struct inode *inode, struct file *file) 198static int rcudata_csv_open(struct inode *inode, struct file *file)
202{ 199{
203 return single_open(file, show_rcuexp, inode->i_private); 200 return single_open(file, show_rcudata_csv, NULL);
204} 201}
205 202
206static const struct file_operations rcuexp_fops = { 203static const struct file_operations rcudata_csv_fops = {
207 .owner = THIS_MODULE, 204 .owner = THIS_MODULE,
208 .open = rcuexp_open, 205 .open = rcudata_csv_open,
209 .read = seq_read, 206 .read = seq_read,
210 .llseek = no_llseek, 207 .llseek = seq_lseek,
211 .release = seq_release, 208 .release = single_release,
212}; 209};
213 210
214#ifdef CONFIG_RCU_BOOST 211#ifdef CONFIG_RCU_BOOST
215 212
216static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) 213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
217{ 214{
218 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", 215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
219 rnp->grplo, rnp->grphi, 217 rnp->grplo, rnp->grphi,
220 "T."[list_empty(&rnp->blkd_tasks)], 218 "T."[list_empty(&rnp->blkd_tasks)],
221 "N."[!rnp->gp_tasks], 219 "N."[!rnp->gp_tasks],
@@ -223,11 +221,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
223 "B."[!rnp->boost_tasks], 221 "B."[!rnp->boost_tasks],
224 convert_kthread_status(rnp->boost_kthread_status), 222 convert_kthread_status(rnp->boost_kthread_status),
225 rnp->n_tasks_boosted, rnp->n_exp_boosts, 223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
226 rnp->n_normal_boosts); 224 rnp->n_normal_boosts,
227 seq_printf(m, "j=%04x bt=%04x\n",
228 (int)(jiffies & 0xffff), 225 (int)(jiffies & 0xffff),
229 (int)(rnp->boost_time & 0xffff)); 226 (int)(rnp->boost_time & 0xffff));
230 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", 227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
231 rnp->n_balk_blkd_tasks, 229 rnp->n_balk_blkd_tasks,
232 rnp->n_balk_exp_gp_tasks, 230 rnp->n_balk_exp_gp_tasks,
233 rnp->n_balk_boost_tasks, 231 rnp->n_balk_boost_tasks,
@@ -254,11 +252,27 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 252 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 253 .open = rcu_node_boost_open,
256 .read = seq_read, 254 .read = seq_read,
257 .llseek = no_llseek, 255 .llseek = seq_lseek,
258 .release = single_release, 256 .release = single_release,
259}; 257};
260 258
261#endif /* #ifdef CONFIG_RCU_BOOST */ 259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
262 276
263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
264{ 278{
@@ -267,16 +281,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
267 struct rcu_node *rnp; 281 struct rcu_node *rnp;
268 282
269 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", 284 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
271 ulong2long(rsp->completed), ulong2long(gpnum), 285 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
272 rsp->fqs_state, 286 rsp->completed, gpnum, rsp->signaled,
273 (long)(rsp->jiffies_force_qs - jiffies), 287 (long)(rsp->jiffies_force_qs - jiffies),
274 (int)(jiffies & 0xffff)); 288 (int)(jiffies & 0xffff),
275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
276 rsp->n_force_qs, rsp->n_force_qs_ngp, 289 rsp->n_force_qs, rsp->n_force_qs_ngp,
277 rsp->n_force_qs - rsp->n_force_qs_ngp, 290 rsp->n_force_qs - rsp->n_force_qs_ngp,
278 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 291 rsp->n_force_qs_lh);
279 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 292 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
280 if (rnp->level != level) { 293 if (rnp->level != level) {
281 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
282 level = rnp->level; 295 level = rnp->level;
@@ -291,24 +304,30 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
291 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
292} 305}
293 306
294static int show_rcuhier(struct seq_file *m, void *v) 307static int show_rcuhier(struct seq_file *m, void *unused)
295{ 308{
296 struct rcu_state *rsp = (struct rcu_state *)m->private; 309#ifdef CONFIG_TREE_PREEMPT_RCU
297 print_one_rcu_state(m, rsp); 310 seq_puts(m, "rcu_preempt:\n");
311 print_one_rcu_state(m, &rcu_preempt_state);
312#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
313 seq_puts(m, "rcu_sched:\n");
314 print_one_rcu_state(m, &rcu_sched_state);
315 seq_puts(m, "rcu_bh:\n");
316 print_one_rcu_state(m, &rcu_bh_state);
298 return 0; 317 return 0;
299} 318}
300 319
301static int rcuhier_open(struct inode *inode, struct file *file) 320static int rcuhier_open(struct inode *inode, struct file *file)
302{ 321{
303 return single_open(file, show_rcuhier, inode->i_private); 322 return single_open(file, show_rcuhier, NULL);
304} 323}
305 324
306static const struct file_operations rcuhier_fops = { 325static const struct file_operations rcuhier_fops = {
307 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
308 .open = rcuhier_open, 327 .open = rcuhier_open,
309 .read = seq_read, 328 .read = seq_read,
310 .llseek = no_llseek, 329 .llseek = seq_lseek,
311 .release = seq_release, 330 .release = single_release,
312}; 331};
313 332
314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -321,81 +340,95 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
321 struct rcu_node *rnp = &rsp->node[0]; 340 struct rcu_node *rnp = &rsp->node[0];
322 341
323 raw_spin_lock_irqsave(&rnp->lock, flags); 342 raw_spin_lock_irqsave(&rnp->lock, flags);
324 completed = ACCESS_ONCE(rsp->completed); 343 completed = rsp->completed;
325 gpnum = ACCESS_ONCE(rsp->gpnum); 344 gpnum = rsp->gpnum;
326 if (completed == gpnum) 345 if (rsp->completed == rsp->gpnum)
327 gpage = 0; 346 gpage = 0;
328 else 347 else
329 gpage = jiffies - rsp->gp_start; 348 gpage = jiffies - rsp->gp_start;
330 gpmax = rsp->gp_max; 349 gpmax = rsp->gp_max;
331 raw_spin_unlock_irqrestore(&rnp->lock, flags); 350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", 351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax); 352 rsp->name, completed, gpnum, gpage, gpmax);
334} 353}
335 354
336static int show_rcugp(struct seq_file *m, void *v) 355static int show_rcugp(struct seq_file *m, void *unused)
337{ 356{
338 struct rcu_state *rsp = (struct rcu_state *)m->private; 357#ifdef CONFIG_TREE_PREEMPT_RCU
339 show_one_rcugp(m, rsp); 358 show_one_rcugp(m, &rcu_preempt_state);
359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
360 show_one_rcugp(m, &rcu_sched_state);
361 show_one_rcugp(m, &rcu_bh_state);
340 return 0; 362 return 0;
341} 363}
342 364
343static int rcugp_open(struct inode *inode, struct file *file) 365static int rcugp_open(struct inode *inode, struct file *file)
344{ 366{
345 return single_open(file, show_rcugp, inode->i_private); 367 return single_open(file, show_rcugp, NULL);
346} 368}
347 369
348static const struct file_operations rcugp_fops = { 370static const struct file_operations rcugp_fops = {
349 .owner = THIS_MODULE, 371 .owner = THIS_MODULE,
350 .open = rcugp_open, 372 .open = rcugp_open,
351 .read = seq_read, 373 .read = seq_read,
352 .llseek = no_llseek, 374 .llseek = seq_lseek,
353 .release = seq_release, 375 .release = single_release,
354}; 376};
355 377
356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
357{ 379{
358 if (!rdp->beenonline) 380 seq_printf(m, "%3d%cnp=%ld "
359 return; 381 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
360 seq_printf(m, "%3d%cnp=%ld ", 382 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
361 rdp->cpu, 383 rdp->cpu,
362 cpu_is_offline(rdp->cpu) ? '!' : ' ', 384 cpu_is_offline(rdp->cpu) ? '!' : ' ',
363 rdp->n_rcu_pending); 385 rdp->n_rcu_pending,
364 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
365 rdp->n_rp_qs_pending, 386 rdp->n_rp_qs_pending,
366 rdp->n_rp_report_qs, 387 rdp->n_rp_report_qs,
367 rdp->n_rp_cb_ready, 388 rdp->n_rp_cb_ready,
368 rdp->n_rp_cpu_needs_gp); 389 rdp->n_rp_cpu_needs_gp,
369 seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
370 rdp->n_rp_gp_completed, 390 rdp->n_rp_gp_completed,
371 rdp->n_rp_gp_started, 391 rdp->n_rp_gp_started,
392 rdp->n_rp_need_fqs,
372 rdp->n_rp_need_nothing); 393 rdp->n_rp_need_nothing);
373} 394}
374 395
375static int show_rcu_pending(struct seq_file *m, void *v) 396static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
376{ 397{
377 print_one_rcu_pending(m, (struct rcu_data *)v); 398 int cpu;
378 return 0; 399 struct rcu_data *rdp;
400
401 for_each_possible_cpu(cpu) {
402 rdp = per_cpu_ptr(rsp->rda, cpu);
403 if (rdp->beenonline)
404 print_one_rcu_pending(m, rdp);
405 }
379} 406}
380 407
381static const struct seq_operations rcu_pending_op = { 408static int show_rcu_pending(struct seq_file *m, void *unused)
382 .start = r_start, 409{
383 .next = r_next, 410#ifdef CONFIG_TREE_PREEMPT_RCU
384 .stop = r_stop, 411 seq_puts(m, "rcu_preempt:\n");
385 .show = show_rcu_pending, 412 print_rcu_pendings(m, &rcu_preempt_state);
386}; 413#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
414 seq_puts(m, "rcu_sched:\n");
415 print_rcu_pendings(m, &rcu_sched_state);
416 seq_puts(m, "rcu_bh:\n");
417 print_rcu_pendings(m, &rcu_bh_state);
418 return 0;
419}
387 420
388static int rcu_pending_open(struct inode *inode, struct file *file) 421static int rcu_pending_open(struct inode *inode, struct file *file)
389{ 422{
390 return r_open(inode, file, &rcu_pending_op); 423 return single_open(file, show_rcu_pending, NULL);
391} 424}
392 425
393static const struct file_operations rcu_pending_fops = { 426static const struct file_operations rcu_pending_fops = {
394 .owner = THIS_MODULE, 427 .owner = THIS_MODULE,
395 .open = rcu_pending_open, 428 .open = rcu_pending_open,
396 .read = seq_read, 429 .read = seq_read,
397 .llseek = no_llseek, 430 .llseek = seq_lseek,
398 .release = seq_release, 431 .release = single_release,
399}; 432};
400 433
401static int show_rcutorture(struct seq_file *m, void *unused) 434static int show_rcutorture(struct seq_file *m, void *unused)
@@ -425,58 +458,38 @@ static struct dentry *rcudir;
425 458
426static int __init rcutree_trace_init(void) 459static int __init rcutree_trace_init(void)
427{ 460{
428 struct rcu_state *rsp;
429 struct dentry *retval; 461 struct dentry *retval;
430 struct dentry *rspdir;
431 462
432 rcudir = debugfs_create_dir("rcu", NULL); 463 rcudir = debugfs_create_dir("rcu", NULL);
433 if (!rcudir) 464 if (!rcudir)
434 goto free_out; 465 goto free_out;
435 466
436 for_each_rcu_flavor(rsp) { 467 retval = debugfs_create_file("rcudata", 0444, rcudir,
437 rspdir = debugfs_create_dir(rsp->name, rcudir); 468 NULL, &rcudata_fops);
438 if (!rspdir) 469 if (!retval)
439 goto free_out; 470 goto free_out;
440
441 retval = debugfs_create_file("rcudata", 0444,
442 rspdir, rsp, &rcudata_fops);
443 if (!retval)
444 goto free_out;
445
446 retval = debugfs_create_file("rcuexp", 0444,
447 rspdir, rsp, &rcuexp_fops);
448 if (!retval)
449 goto free_out;
450 471
451 retval = debugfs_create_file("rcu_pending", 0444, 472 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
452 rspdir, rsp, &rcu_pending_fops); 473 NULL, &rcudata_csv_fops);
453 if (!retval) 474 if (!retval)
454 goto free_out; 475 goto free_out;
455 476
456 retval = debugfs_create_file("rcubarrier", 0444, 477 if (rcu_boost_trace_create_file(rcudir))
457 rspdir, rsp, &rcubarrier_fops); 478 goto free_out;
458 if (!retval)
459 goto free_out;
460 479
461#ifdef CONFIG_RCU_BOOST 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
462 if (rsp == &rcu_preempt_state) { 481 if (!retval)
463 retval = debugfs_create_file("rcuboost", 0444, 482 goto free_out;
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
469 483
470 retval = debugfs_create_file("rcugp", 0444, 484 retval = debugfs_create_file("rcuhier", 0444, rcudir,
471 rspdir, rsp, &rcugp_fops); 485 NULL, &rcuhier_fops);
472 if (!retval) 486 if (!retval)
473 goto free_out; 487 goto free_out;
474 488
475 retval = debugfs_create_file("rcuhier", 0444, 489 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
476 rspdir, rsp, &rcuhier_fops); 490 NULL, &rcu_pending_fops);
477 if (!retval) 491 if (!retval)
478 goto free_out; 492 goto free_out;
479 }
480 493
481 retval = debugfs_create_file("rcutorture", 0444, rcudir, 494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
482 NULL, &rcutorture_fops); 495 NULL, &rcutorture_fops);
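
[Editor's aside] Every trace file touched in the rcutree_trace.c hunks above follows the same debugfs + seq_file recipe: a show() routine, an open() that calls single_open(), and a file_operations wired to seq_read(), seq_lseek() and single_release(). Below is a small, self-contained sketch of that recipe as a stand-alone module; it is not part of the patch, and the names ("example", example_show(), the "status" file) are invented for illustration.

/*
 * Minimal single_open()/seq_file debugfs example (sketch, not part of
 * this patch).  Creates /sys/kernel/debug/example/status, which prints
 * one line each time it is read.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;

static int example_show(struct seq_file *m, void *unused)
{
	/* Whatever was passed to single_open() ends up in m->private. */
	const char *label = m->private ? (const char *)m->private
				       : "no private data";

	seq_printf(m, "example debugfs file: %s\n", label);
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, inode->i_private);
}

static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		return -ENOMEM;
	if (!debugfs_create_file("status", 0444, example_dir, NULL,
				 &example_fops)) {
		debugfs_remove_recursive(example_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit example_exit(void)
{
	debugfs_remove_recursive(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");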
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abb..859ea5a9605 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/export.h> 18#include <linux/module.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/relay.h> 20#include <linux/relay.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -164,14 +164,10 @@ depopulate:
164 */ 164 */
165static struct rchan_buf *relay_create_buf(struct rchan *chan) 165static struct rchan_buf *relay_create_buf(struct rchan *chan)
166{ 166{
167 struct rchan_buf *buf; 167 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
168
169 if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
170 return NULL;
171
172 buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
173 if (!buf) 168 if (!buf)
174 return NULL; 169 return NULL;
170
175 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); 171 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
176 if (!buf->padding) 172 if (!buf->padding)
177 goto free_buf; 173 goto free_buf;
@@ -306,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
306 */ 302 */
307static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
308 struct dentry *parent, 304 struct dentry *parent,
309 umode_t mode, 305 int mode,
310 struct rchan_buf *buf, 306 struct rchan_buf *buf,
311 int *is_global) 307 int *is_global)
312{ 308{
@@ -578,8 +574,6 @@ struct rchan *relay_open(const char *base_filename,
578 574
579 if (!(subbuf_size && n_subbufs)) 575 if (!(subbuf_size && n_subbufs))
580 return NULL; 576 return NULL;
581 if (subbuf_size > UINT_MAX / n_subbufs)
582 return NULL;
583 577
584 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); 578 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
585 if (!chan) 579 if (!chan)
@@ -1235,7 +1229,6 @@ static ssize_t subbuf_splice_actor(struct file *in,
1235 struct splice_pipe_desc spd = { 1229 struct splice_pipe_desc spd = {
1236 .pages = pages, 1230 .pages = pages,
1237 .nr_pages = 0, 1231 .nr_pages = 0,
1238 .nr_pages_max = PIPE_DEF_BUFFERS,
1239 .partial = partial, 1232 .partial = partial,
1240 .flags = flags, 1233 .flags = flags,
1241 .ops = &relay_pipe_buf_ops, 1234 .ops = &relay_pipe_buf_ops,
@@ -1303,8 +1296,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1303 ret += padding; 1296 ret += padding;
1304 1297
1305out: 1298out:
1306 splice_shrink_spd(&spd); 1299 splice_shrink_spd(pipe, &spd);
1307 return ret; 1300 return ret;
1308} 1301}
1309 1302
1310static ssize_t relay_file_splice_read(struct file *in, 1303static ssize_t relay_file_splice_read(struct file *in,
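
[Editor's aside] The left-hand side of the relay.c hunks above carries overflow guards of the form "n > UINT_MAX / size" in front of the kmalloc(n * size) calls in relay_create_buf() and relay_open(); those guards disappear on the right-hand side. The idiom itself is worth spelling out. A hedged userspace sketch follows; alloc_array() is an invented name, not a relay or kernel API.

/*
 * Sketch of the multiplication-overflow guard removed above: refuse an
 * allocation whose element count times element size would wrap around.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

static void *alloc_array(size_t nmemb, size_t size)
{
	/* If nmemb * size would overflow size_t, fail instead of wrapping. */
	if (size != 0 && nmemb > SIZE_MAX / size)
		return NULL;
	return malloc(nmemb * size);
}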
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e704..34683efa2cc 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,104 +22,72 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
25int res_counter_charge_locked(struct res_counter *counter, unsigned long val, 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
26 bool force)
27{ 26{
28 int ret = 0;
29
30 if (counter->usage + val > counter->limit) { 27 if (counter->usage + val > counter->limit) {
31 counter->failcnt++; 28 counter->failcnt++;
32 ret = -ENOMEM; 29 return -ENOMEM;
33 if (!force)
34 return ret;
35 } 30 }
36 31
37 counter->usage += val; 32 counter->usage += val;
38 if (counter->usage > counter->max_usage) 33 if (counter->usage > counter->max_usage)
39 counter->max_usage = counter->usage; 34 counter->max_usage = counter->usage;
40 return ret; 35 return 0;
41} 36}
42 37
43static int __res_counter_charge(struct res_counter *counter, unsigned long val, 38int res_counter_charge(struct res_counter *counter, unsigned long val,
44 struct res_counter **limit_fail_at, bool force) 39 struct res_counter **limit_fail_at)
45{ 40{
46 int ret, r; 41 int ret;
47 unsigned long flags; 42 unsigned long flags;
48 struct res_counter *c, *u; 43 struct res_counter *c, *u;
49 44
50 r = ret = 0;
51 *limit_fail_at = NULL; 45 *limit_fail_at = NULL;
52 local_irq_save(flags); 46 local_irq_save(flags);
53 for (c = counter; c != NULL; c = c->parent) { 47 for (c = counter; c != NULL; c = c->parent) {
54 spin_lock(&c->lock); 48 spin_lock(&c->lock);
55 r = res_counter_charge_locked(c, val, force); 49 ret = res_counter_charge_locked(c, val);
56 spin_unlock(&c->lock); 50 spin_unlock(&c->lock);
57 if (r < 0 && !ret) { 51 if (ret < 0) {
58 ret = r;
59 *limit_fail_at = c; 52 *limit_fail_at = c;
60 if (!force) 53 goto undo;
61 break;
62 } 54 }
63 } 55 }
64 56 ret = 0;
65 if (ret < 0 && !force) { 57 goto done;
66 for (u = counter; u != c; u = u->parent) { 58undo:
67 spin_lock(&u->lock); 59 for (u = counter; u != c; u = u->parent) {
68 res_counter_uncharge_locked(u, val); 60 spin_lock(&u->lock);
69 spin_unlock(&u->lock); 61 res_counter_uncharge_locked(u, val);
70 } 62 spin_unlock(&u->lock);
71 } 63 }
64done:
72 local_irq_restore(flags); 65 local_irq_restore(flags);
73
74 return ret; 66 return ret;
75} 67}
76 68
77int res_counter_charge(struct res_counter *counter, unsigned long val, 69void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
78 struct res_counter **limit_fail_at)
79{
80 return __res_counter_charge(counter, val, limit_fail_at, false);
81}
82
83int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
84 struct res_counter **limit_fail_at)
85{
86 return __res_counter_charge(counter, val, limit_fail_at, true);
87}
88
89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 70{
91 if (WARN_ON(counter->usage < val)) 71 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 72 val = counter->usage;
93 73
94 counter->usage -= val; 74 counter->usage -= val;
95 return counter->usage;
96} 75}
97 76
98u64 res_counter_uncharge_until(struct res_counter *counter, 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
99 struct res_counter *top,
100 unsigned long val)
101{ 78{
102 unsigned long flags; 79 unsigned long flags;
103 struct res_counter *c; 80 struct res_counter *c;
104 u64 ret = 0;
105 81
106 local_irq_save(flags); 82 local_irq_save(flags);
107 for (c = counter; c != top; c = c->parent) { 83 for (c = counter; c != NULL; c = c->parent) {
108 u64 r;
109 spin_lock(&c->lock); 84 spin_lock(&c->lock);
110 r = res_counter_uncharge_locked(c, val); 85 res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
113 spin_unlock(&c->lock); 86 spin_unlock(&c->lock);
114 } 87 }
115 local_irq_restore(flags); 88 local_irq_restore(flags);
116 return ret;
117} 89}
118 90
119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
120{
121 return res_counter_uncharge_until(counter, NULL, val);
122}
123 91
124static inline unsigned long long * 92static inline unsigned long long *
125res_counter_member(struct res_counter *counter, int member) 93res_counter_member(struct res_counter *counter, int member)
@@ -191,10 +159,33 @@ int res_counter_memparse_write_strategy(const char *buf,
191 return 0; 159 return 0;
192 } 160 }
193 161
194 *res = memparse(buf, &end); 162 /* FIXME - make memparse() take const char* args */
163 *res = memparse((char *)buf, &end);
195 if (*end != '\0') 164 if (*end != '\0')
196 return -EINVAL; 165 return -EINVAL;
197 166
198 *res = PAGE_ALIGN(*res); 167 *res = PAGE_ALIGN(*res);
199 return 0; 168 return 0;
200} 169}
170
171int res_counter_write(struct res_counter *counter, int member,
172 const char *buf, write_strategy_fn write_strategy)
173{
174 char *end;
175 unsigned long flags;
176 unsigned long long tmp, *val;
177
178 if (write_strategy) {
179 if (write_strategy(buf, &tmp))
180 return -EINVAL;
181 } else {
182 tmp = simple_strtoull(buf, &end, 10);
183 if (*end != '\0')
184 return -EINVAL;
185 }
186 spin_lock_irqsave(&counter->lock, flags);
187 val = res_counter_member(counter, member);
188 *val = tmp;
189 spin_unlock_irqrestore(&counter->lock, flags);
190 return 0;
191}
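
[Editor's aside] Both versions of res_counter_charge() above charge every counter from the child up to the root and, if any ancestor is over its limit, walk back down uncharging whatever was already taken. The sketch below strips that charge-with-rollback pattern down to userspace C, without the spinlocks, the irq handling, or the force/nofail variant; struct counter and the function names are invented for illustration.

/*
 * Hierarchical charge with rollback (userspace sketch, single-threaded).
 */
#include <stdbool.h>
#include <stdio.h>

struct counter {
	unsigned long usage;
	unsigned long limit;
	struct counter *parent;
};

static bool charge_one(struct counter *c, unsigned long val)
{
	if (c->usage + val > c->limit)
		return false;
	c->usage += val;
	return true;
}

/* Charge every level from c up to the root, undoing everything on failure. */
static int charge_hierarchy(struct counter *c, unsigned long val)
{
	struct counter *pos, *undo;

	for (pos = c; pos; pos = pos->parent) {
		if (!charge_one(pos, val))
			goto rollback;
	}
	return 0;

rollback:
	for (undo = c; undo != pos; undo = undo->parent)
		undo->usage -= val;
	return -1;
}

int main(void)
{
	struct counter root  = { .limit = 100 };
	struct counter child = { .limit = 1000, .parent = &root };

	printf("charge 80: %d\n", charge_hierarchy(&child, 80));  /* succeeds */
	printf("charge 50: %d\n", charge_hierarchy(&child, 50));  /* root over limit */
	printf("child usage after rollback: %lu\n", child.usage); /* still 80 */
	return 0;
}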
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b..c8dc249da5c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,9 +7,7 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#include <linux/module.h>
11
12#include <linux/export.h>
13#include <linux/errno.h> 11#include <linux/errno.h>
14#include <linux/ioport.h> 12#include <linux/ioport.h>
15#include <linux/init.h> 13#include <linux/init.h>
@@ -517,8 +515,8 @@ out:
517 * @root: root resource descriptor 515 * @root: root resource descriptor
518 * @new: resource descriptor desired by caller 516 * @new: resource descriptor desired by caller
519 * @size: requested resource region size 517 * @size: requested resource region size
520 * @min: minimum boundary to allocate 518 * @min: minimum size to allocate
521 * @max: maximum boundary to allocate 519 * @max: maximum size to allocate
522 * @align: alignment requested, in bytes 520 * @align: alignment requested, in bytes
523 * @alignf: alignment function, optional, called if not NULL 521 * @alignf: alignment function, optional, called if not NULL
524 * @alignf_data: arbitrary data to pass to the @alignf function 522 * @alignf_data: arbitrary data to pass to the @alignf function
@@ -724,12 +722,14 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
724 722
725 write_lock(&resource_lock); 723 write_lock(&resource_lock);
726 724
727 if (!parent)
728 goto skip;
729
730 if ((start < parent->start) || (end > parent->end)) 725 if ((start < parent->start) || (end > parent->end))
731 goto out; 726 goto out;
732 727
728 for (tmp = res->child; tmp; tmp = tmp->sibling) {
729 if ((tmp->start < start) || (tmp->end > end))
730 goto out;
731 }
732
733 if (res->sibling && (res->sibling->start <= end)) 733 if (res->sibling && (res->sibling->start <= end))
734 goto out; 734 goto out;
735 735
@@ -741,11 +741,6 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
741 goto out; 741 goto out;
742 } 742 }
743 743
744skip:
745 for (tmp = res->child; tmp; tmp = tmp->sibling)
746 if ((tmp->start < start) || (tmp->end > end))
747 goto out;
748
749 res->start = start; 744 res->start = start;
750 res->end = end; 745 res->end = end;
751 result = 0; 746 result = 0;
@@ -754,7 +749,6 @@ skip:
754 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
755 return result; 750 return result;
756} 751}
757EXPORT_SYMBOL(adjust_resource);
758 752
759static void __init __reserve_region_with_split(struct resource *root, 753static void __init __reserve_region_with_split(struct resource *root,
760 resource_size_t start, resource_size_t end, 754 resource_size_t start, resource_size_t end,
@@ -763,7 +757,6 @@ static void __init __reserve_region_with_split(struct resource *root,
763 struct resource *parent = root; 757 struct resource *parent = root;
764 struct resource *conflict; 758 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 759 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
766 struct resource *next_res = NULL;
767 760
768 if (!res) 761 if (!res)
769 return; 762 return;
@@ -773,77 +766,34 @@ static void __init __reserve_region_with_split(struct resource *root,
773 res->end = end; 766 res->end = end;
774 res->flags = IORESOURCE_BUSY; 767 res->flags = IORESOURCE_BUSY;
775 768
776 while (1) { 769 conflict = __request_resource(parent, res);
777 770 if (!conflict)
778 conflict = __request_resource(parent, res); 771 return;
779 if (!conflict) {
780 if (!next_res)
781 break;
782 res = next_res;
783 next_res = NULL;
784 continue;
785 }
786 772
787 /* conflict covered whole area */ 773 /* failed, split and try again */
788 if (conflict->start <= res->start && 774 kfree(res);
789 conflict->end >= res->end) {
790 kfree(res);
791 WARN_ON(next_res);
792 break;
793 }
794 775
795 /* failed, split and try again */ 776 /* conflict covered whole area */
796 if (conflict->start > res->start) { 777 if (conflict->start <= start && conflict->end >= end)
797 end = res->end; 778 return;
798 res->end = conflict->start - 1;
799 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res),
801 GFP_ATOMIC);
802 if (!next_res) {
803 kfree(res);
804 break;
805 }
806 next_res->name = name;
807 next_res->start = conflict->end + 1;
808 next_res->end = end;
809 next_res->flags = IORESOURCE_BUSY;
810 }
811 } else {
812 res->start = conflict->end + 1;
813 }
814 }
815 779
780 if (conflict->start > start)
781 __reserve_region_with_split(root, start, conflict->start-1, name);
782 if (conflict->end < end)
783 __reserve_region_with_split(root, conflict->end+1, end, name);
816} 784}
817 785
818void __init reserve_region_with_split(struct resource *root, 786void __init reserve_region_with_split(struct resource *root,
819 resource_size_t start, resource_size_t end, 787 resource_size_t start, resource_size_t end,
820 const char *name) 788 const char *name)
821{ 789{
822 int abort = 0;
823
824 write_lock(&resource_lock); 790 write_lock(&resource_lock);
825 if (root->start > start || root->end < end) { 791 __reserve_region_with_split(root, start, end, name);
826 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
827 (unsigned long long)start, (unsigned long long)end,
828 root);
829 if (start > root->end || end < root->start)
830 abort = 1;
831 else {
832 if (end > root->end)
833 end = root->end;
834 if (start < root->start)
835 start = root->start;
836 pr_err("fixing request to [0x%llx-0x%llx]\n",
837 (unsigned long long)start,
838 (unsigned long long)end);
839 }
840 dump_stack();
841 }
842 if (!abort)
843 __reserve_region_with_split(root, start, end, name);
844 write_unlock(&resource_lock); 792 write_unlock(&resource_lock);
845} 793}
846 794
795EXPORT_SYMBOL(adjust_resource);
796
847/** 797/**
848 * resource_alignment - calculate resource's alignment 798 * resource_alignment - calculate resource's alignment
849 * @res: resource pointer 799 * @res: resource pointer
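
[Editor's aside] The right-hand version of __reserve_region_with_split() above handles a conflicting reservation by recursing into the gaps on either side of the conflict, whereas the left-hand version flattens the same idea into an iterative loop. A small userspace sketch of the split-and-recurse approach over a plain interval list follows; find_conflict() and reserve_with_split() are invented names, and the real code inserts struct resource nodes into the resource tree instead of printing.

/*
 * Recursive "split around the conflict" reservation (userspace sketch).
 */
#include <stdio.h>

struct region {
	unsigned long start, end;	/* inclusive bounds */
	struct region *next;
};

/* Return the first already-reserved region overlapping [start, end]. */
static struct region *find_conflict(struct region *reserved,
				    unsigned long start, unsigned long end)
{
	for (; reserved; reserved = reserved->next)
		if (reserved->start <= end && reserved->end >= start)
			return reserved;
	return NULL;
}

static void reserve_with_split(struct region *reserved,
			       unsigned long start, unsigned long end)
{
	struct region *conflict = find_conflict(reserved, start, end);

	if (!conflict) {
		printf("reserve [%lx-%lx]\n", start, end);
		return;
	}
	/* Conflict swallows the whole request: nothing left to reserve. */
	if (conflict->start <= start && conflict->end >= end)
		return;
	/* Otherwise recurse into the pieces on either side of the conflict. */
	if (conflict->start > start)
		reserve_with_split(reserved, start, conflict->start - 1);
	if (conflict->end < end)
		reserve_with_split(reserved, conflict->end + 1, end);
}

int main(void)
{
	struct region busy = { .start = 0x100, .end = 0x1ff };

	reserve_with_split(&busy, 0x000, 0x2ff);	/* prints two pieces */
	return 0;
}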
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c..3c7cbc2c33b 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/export.h> 21#include <linux/module.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -29,6 +29,61 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
32static void printk_task(struct task_struct *p) 87static void printk_task(struct task_struct *p)
33{ 88{
34 if (p) 89 if (p)
@@ -56,8 +111,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
56 111
57void rt_mutex_debug_task_free(struct task_struct *task) 112void rt_mutex_debug_task_free(struct task_struct *task)
58{ 113{
59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 114 WARN_ON(!plist_head_empty(&task->pi_waiters));
60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 115 WARN_ON(task->pi_blocked_on);
61} 116}
62 117
63/* 118/*
@@ -70,7 +125,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
70{ 125{
71 struct task_struct *task; 126 struct task_struct *task;
72 127
73 if (!debug_locks || detect || !act_waiter) 128 if (!rt_trace_on || detect || !act_waiter)
74 return; 129 return;
75 130
76 task = rt_mutex_owner(act_waiter->lock); 131 task = rt_mutex_owner(act_waiter->lock);
@@ -84,7 +139,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
84{ 139{
85 struct task_struct *task; 140 struct task_struct *task;
86 141
87 if (!waiter->deadlock_lock || !debug_locks) 142 if (!waiter->deadlock_lock || !rt_trace_on)
88 return; 143 return;
89 144
90 rcu_read_lock(); 145 rcu_read_lock();
@@ -94,14 +149,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
94 return; 149 return;
95 } 150 }
96 151
97 if (!debug_locks_off()) { 152 TRACE_OFF_NOLOCK();
98 rcu_read_unlock();
99 return;
100 }
101 153
102 printk("\n============================================\n"); 154 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 155 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
105 printk( "--------------------------------------------\n"); 156 printk( "--------------------------------------------\n");
106 printk("%s/%d is deadlocking current task %s/%d\n\n", 157 printk("%s/%d is deadlocking current task %s/%d\n\n",
107 task->comm, task_pid_nr(task), 158 task->comm, task_pid_nr(task),
@@ -129,6 +180,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
129 180
130 printk("[ turning off deadlock detection." 181 printk("[ turning off deadlock detection."
131 "Please report this trace. ]\n\n"); 182 "Please report this trace. ]\n\n");
183 local_irq_disable();
132} 184}
133 185
134void debug_rt_mutex_lock(struct rt_mutex *lock) 186void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -137,7 +189,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
137 189
138void debug_rt_mutex_unlock(struct rt_mutex *lock) 190void debug_rt_mutex_unlock(struct rt_mutex *lock)
139{ 191{
140 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); 192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
141} 193}
142 194
143void 195void
@@ -147,7 +199,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
147 199
148void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
149{ 201{
150 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); 202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
151} 203}
152 204
153void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -161,8 +213,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
161void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
162{ 214{
163 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
166 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
167} 219}
168 220
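
[Editor's aside] Both columns of the rtmutex-debug.c hunks above implement the same one-shot latch: report the first deadlock, then suppress all further reporting so the debug code does not recurse into itself (debug_locks_off() on one side, the rt_trace_on flag plus TRACE_OFF() on the other). The userspace sketch below shows that idiom in isolation; debug_off_once() and report_deadlock() are invented names.

/*
 * "Report once, then go quiet" latch (userspace sketch).
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int debug_on = 1;

/* Returns nonzero only for the caller that wins the race to disable debugging. */
static int debug_off_once(void)
{
	return atomic_exchange(&debug_on, 0);
}

static void report_deadlock(const char *what)
{
	if (!debug_off_once())
		return;		/* someone already reported; stay quiet */
	fprintf(stderr, "BUG: %s (further reports suppressed)\n", what);
}

int main(void)
{
	report_deadlock("circular locking deadlock");	/* prints */
	report_deadlock("circular locking deadlock");	/* silent */
	return 0;
}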
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec4947546..5c9ccd38096 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
10#include <linux/kthread.h> 9#include <linux/kthread.h>
11#include <linux/export.h> 10#include <linux/module.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct device dev; 30 struct sys_device sysdev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, 274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *a
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, dev); 282 td = container_of(dev, struct test_thread_data, sysdev);
283 tid = td->dev.id; 283 tid = td->sysdev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *a
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, 337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, dev); 345 td = container_of(dev, struct test_thread_data, sysdev);
346 tsk = threads[td->dev.id]; 346 tsk = threads[td->sysdev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,29 +360,28 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->dev.id].owner); 363 mutexes[td->sysdev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); 368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); 369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct bus_type rttest_subsys = { 371static struct sysdev_class rttest_sysclass = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
374}; 373};
375 374
376static int init_test_thread(int id) 375static int init_test_thread(int id)
377{ 376{
378 thread_data[id].dev.bus = &rttest_subsys; 377 thread_data[id].sysdev.cls = &rttest_sysclass;
379 thread_data[id].dev.id = id; 378 thread_data[id].sysdev.id = id;
380 379
381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
382 if (IS_ERR(threads[id])) 381 if (IS_ERR(threads[id]))
383 return PTR_ERR(threads[id]); 382 return PTR_ERR(threads[id]);
384 383
385 return device_register(&thread_data[id].dev); 384 return sysdev_register(&thread_data[id].sysdev);
386} 385}
387 386
388static int init_rttest(void) 387static int init_rttest(void)
@@ -394,7 +393,7 @@ static int init_rttest(void)
394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
395 rt_mutex_init(&mutexes[i]); 394 rt_mutex_init(&mutexes[i]);
396 395
397 ret = subsys_system_register(&rttest_subsys, NULL); 396 ret = sysdev_class_register(&rttest_sysclass);
398 if (ret) 397 if (ret)
399 return ret; 398 return ret;
400 399
@@ -402,10 +401,10 @@ static int init_rttest(void)
402 ret = init_test_thread(i); 401 ret = init_test_thread(i);
403 if (ret) 402 if (ret)
404 break; 403 break;
405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status); 404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
406 if (ret) 405 if (ret)
407 break; 406 break;
408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command); 407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
409 if (ret) 408 if (ret)
410 break; 409 break;
411 } 410 }
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c99..255e1662acd 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17 17
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd84..9f48f3d82e9 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,9 +7,10 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/export.h> 10#include <linux/module.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
13#include <linux/atomic.h> 14#include <linux/atomic.h>
14 15
15/* 16/*
@@ -116,16 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
116 117
117EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
118 119
119void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
120{
121 might_sleep();
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
125}
126
127EXPORT_SYMBOL(_down_write_nest_lock);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
deleted file mode 100644
index f06d249e103..00000000000
--- a/kernel/sched/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
deleted file mode 100644
index 0984a21076a..00000000000
--- a/kernel/sched/auto_group.c
+++ /dev/null
@@ -1,258 +0,0 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h"
4
5#include <linux/proc_fs.h>
6#include <linux/seq_file.h>
7#include <linux/kallsyms.h>
8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
11
12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
13static struct autogroup autogroup_default;
14static atomic_t autogroup_seq_nr;
15
16void __init autogroup_init(struct task_struct *init_task)
17{
18 autogroup_default.tg = &root_task_group;
19 kref_init(&autogroup_default.kref);
20 init_rwsem(&autogroup_default.lock);
21 init_task->signal->autogroup = &autogroup_default;
22}
23
24void autogroup_free(struct task_group *tg)
25{
26 kfree(tg->autogroup);
27}
28
29static inline void autogroup_destroy(struct kref *kref)
30{
31 struct autogroup *ag = container_of(kref, struct autogroup, kref);
32
33#ifdef CONFIG_RT_GROUP_SCHED
34 /* We've redirected RT tasks to the root task group... */
35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL;
37#endif
38 sched_destroy_group(ag->tg);
39}
40
41static inline void autogroup_kref_put(struct autogroup *ag)
42{
43 kref_put(&ag->kref, autogroup_destroy);
44}
45
46static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
47{
48 kref_get(&ag->kref);
49 return ag;
50}
51
52static inline struct autogroup *autogroup_task_get(struct task_struct *p)
53{
54 struct autogroup *ag;
55 unsigned long flags;
56
57 if (!lock_task_sighand(p, &flags))
58 return autogroup_kref_get(&autogroup_default);
59
60 ag = autogroup_kref_get(p->signal->autogroup);
61 unlock_task_sighand(p, &flags);
62
63 return ag;
64}
65
66static inline struct autogroup *autogroup_create(void)
67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
69 struct task_group *tg;
70
71 if (!ag)
72 goto out_fail;
73
74 tg = sched_create_group(&root_task_group);
75
76 if (IS_ERR(tg))
77 goto out_free;
78
79 kref_init(&ag->kref);
80 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr);
82 ag->tg = tg;
83#ifdef CONFIG_RT_GROUP_SCHED
84 /*
85 * Autogroup RT tasks are redirected to the root task group
86 * so we don't have to move tasks around upon policy change,
87 * or flail around trying to allocate bandwidth on the fly.
88 * A bandwidth exception in __sched_setscheduler() allows
89 * the policy change to proceed. Thereafter, task_group()
90 * returns &root_task_group, so zero bandwidth is required.
91 */
92 free_rt_sched_group(tg);
93 tg->rt_se = root_task_group.rt_se;
94 tg->rt_rq = root_task_group.rt_rq;
95#endif
96 tg->autogroup = ag;
97
98 return ag;
99
100out_free:
101 kfree(ag);
102out_fail:
103 if (printk_ratelimit()) {
104 printk(KERN_WARNING "autogroup_create: %s failure.\n",
105 ag ? "sched_create_group()" : "kmalloc()");
106 }
107
108 return autogroup_kref_get(&autogroup_default);
109}
110
111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112{
113 if (tg != &root_task_group)
114 return false;
115
116 if (p->sched_class != &fair_sched_class)
117 return false;
118
119 /*
120 * We can only assume the task group can't go away on us if
121 * autogroup_move_group() can see us on ->thread_group list.
122 */
123 if (p->flags & PF_EXITING)
124 return false;
125
126 return true;
127}
128
129static void
130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
131{
132 struct autogroup *prev;
133 struct task_struct *t;
134 unsigned long flags;
135
136 BUG_ON(!lock_task_sighand(p, &flags));
137
138 prev = p->signal->autogroup;
139 if (prev == ag) {
140 unlock_task_sighand(p, &flags);
141 return;
142 }
143
144 p->signal->autogroup = autogroup_kref_get(ag);
145
146 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
147 goto out;
148
149 t = p;
150 do {
151 sched_move_task(t);
152 } while_each_thread(p, t);
153
154out:
155 unlock_task_sighand(p, &flags);
156 autogroup_kref_put(prev);
157}
158
159/* Allocates GFP_KERNEL, cannot be called under any spinlock */
160void sched_autogroup_create_attach(struct task_struct *p)
161{
162 struct autogroup *ag = autogroup_create();
163
164 autogroup_move_group(p, ag);
165 /* drop extra reference added by autogroup_create() */
166 autogroup_kref_put(ag);
167}
168EXPORT_SYMBOL(sched_autogroup_create_attach);
169
170/* Cannot be called under siglock. Currently has no users */
171void sched_autogroup_detach(struct task_struct *p)
172{
173 autogroup_move_group(p, &autogroup_default);
174}
175EXPORT_SYMBOL(sched_autogroup_detach);
176
177void sched_autogroup_fork(struct signal_struct *sig)
178{
179 sig->autogroup = autogroup_task_get(current);
180}
181
182void sched_autogroup_exit(struct signal_struct *sig)
183{
184 autogroup_kref_put(sig->autogroup);
185}
186
187static int __init setup_autogroup(char *str)
188{
189 sysctl_sched_autogroup_enabled = 0;
190
191 return 1;
192}
193
194__setup("noautogroup", setup_autogroup);
195
196#ifdef CONFIG_PROC_FS
197
198int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
199{
200 static unsigned long next = INITIAL_JIFFIES;
201 struct autogroup *ag;
202 int err;
203
204 if (nice < -20 || nice > 19)
205 return -EINVAL;
206
207 err = security_task_setnice(current, nice);
208 if (err)
209 return err;
210
211 if (nice < 0 && !can_nice(current, nice))
212 return -EPERM;
213
214 /* this is a heavy operation taking global locks.. */
215 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
216 return -EAGAIN;
217
218 next = HZ / 10 + jiffies;
219 ag = autogroup_task_get(p);
220
221 down_write(&ag->lock);
222 err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
223 if (!err)
224 ag->nice = nice;
225 up_write(&ag->lock);
226
227 autogroup_kref_put(ag);
228
229 return err;
230}
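
A rough illustration of the weight lookup above, assuming the scheduler's standard prio_to_weight[] table (the exact values are not part of this patch):

/*
 * Example: prio_to_weight[nice + 20] means nice 0 selects index 20
 * (weight 1024), and each nice step changes the weight by roughly a
 * factor of 1.25, so setting the autogroup nice level scales its CFS
 * share the same way renicing a single task would.
 */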
231
232void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
233{
234 struct autogroup *ag = autogroup_task_get(p);
235
236 if (!task_group_is_autogroup(ag->tg))
237 goto out;
238
239 down_read(&ag->lock);
240 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
241 up_read(&ag->lock);
242
243out:
244 autogroup_kref_put(ag);
245}
246#endif /* CONFIG_PROC_FS */
247
248#ifdef CONFIG_SCHED_DEBUG
249int autogroup_path(struct task_group *tg, char *buf, int buflen)
250{
251 if (!task_group_is_autogroup(tg))
252 return 0;
253
254 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
255}
256#endif /* CONFIG_SCHED_DEBUG */
257
258#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
deleted file mode 100644
index 8bd04714281..00000000000
--- a/kernel/sched/auto_group.h
+++ /dev/null
@@ -1,64 +0,0 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
6struct autogroup {
7 /*
8	 * The reference count doesn't track how many threads are attached to
9	 * this autogroup right now; it only stands for the number of tasks
10	 * that could use this autogroup.
11 */
12 struct kref kref;
13 struct task_group *tg;
14 struct rw_semaphore lock;
15 unsigned long id;
16 int nice;
17};
18
19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
41
42#else /* !CONFIG_SCHED_AUTOGROUP */
43
44static inline void autogroup_init(struct task_struct *init_task) { }
45static inline void autogroup_free(struct task_group *tg) { }
46static inline bool task_group_is_autogroup(struct task_group *tg)
47{
48 return 0;
49}
50
51static inline struct task_group *
52autogroup_task_group(struct task_struct *p, struct task_group *tg)
53{
54 return tg;
55}
56
57#ifdef CONFIG_SCHED_DEBUG
58static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
59{
60 return 0;
61}
62#endif
63
64#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
deleted file mode 100644
index c685e31492d..00000000000
--- a/kernel/sched/clock.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Updates and enhancements:
7 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
8 *
9 * Based on code by:
10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com>
12 *
13 *
14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42  * - GTOD (clock monotonic)
43 * - sched_clock()
44 * - explicit idle events
45 *
46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
47  * deltas are filtered to provide monotonicity while keeping it within an
48 * expected window.
49 *
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped).
52 *
53 *
54 * Notes:
55 *
56  * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59  * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */
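
A minimal sketch of the usage rules above, assuming the usual kernel headers already included by this file; example_time_section() is a hypothetical helper, and preemption is disabled so both timestamps come from the same CPU, which is what the per-CPU monotonicity guarantee requires:

static void example_time_section(void)
{
	u64 t0, t1;

	preempt_disable();		/* keep both reads on one CPU */
	t0 = local_clock();
	/* ... short piece of work to be timed ... */
	t1 = local_clock();
	preempt_enable();

	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}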
63#include <linux/spinlock.h>
64#include <linux/hardirq.h>
65#include <linux/export.h>
66#include <linux/percpu.h>
67#include <linux/ktime.h>
68#include <linux/sched.h>
69
70/*
71 * Scheduler clock - returns current time in nanosec units.
72  * This is the default implementation.
73 * Architectures and sub-architectures can override this.
74 */
75unsigned long long __attribute__((weak)) sched_clock(void)
76{
77 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
78 * (NSEC_PER_SEC / HZ);
79}
80EXPORT_SYMBOL_GPL(sched_clock);
81
82__read_mostly int sched_clock_running;
83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable;
86
87struct sched_clock_data {
88 u64 tick_raw;
89 u64 tick_gtod;
90 u64 clock;
91};
92
93static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
94
95static inline struct sched_clock_data *this_scd(void)
96{
97 return &__get_cpu_var(sched_clock_data);
98}
99
100static inline struct sched_clock_data *cpu_sdc(int cpu)
101{
102 return &per_cpu(sched_clock_data, cpu);
103}
104
105void sched_clock_init(void)
106{
107 u64 ktime_now = ktime_to_ns(ktime_get());
108 int cpu;
109
110 for_each_possible_cpu(cpu) {
111 struct sched_clock_data *scd = cpu_sdc(cpu);
112
113 scd->tick_raw = 0;
114 scd->tick_gtod = ktime_now;
115 scd->clock = ktime_now;
116 }
117
118 sched_clock_running = 1;
119}
120
121/*
122 * min, max except they take wrapping into account
123 */
124
125static inline u64 wrap_min(u64 x, u64 y)
126{
127 return (s64)(x - y) < 0 ? x : y;
128}
129
130static inline u64 wrap_max(u64 x, u64 y)
131{
132 return (s64)(x - y) > 0 ? x : y;
133}
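
A minimal self-test sketch of the wrapping-aware comparison, assuming ULLONG_MAX and WARN_ON() from the usual kernel headers; wrap_minmax_example() is a hypothetical name:

static void wrap_minmax_example(void)
{
	u64 early = ULLONG_MAX - 2;	/* just before the counter wraps */
	u64 late  = 5;			/* shortly after the wrap */

	/*
	 * early - late is -8 when viewed as s64, so both helpers treat
	 * 'late' as the more recent value even though late < early.
	 */
	WARN_ON(wrap_max(early, late) != late);
	WARN_ON(wrap_min(early, late) != early);
}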
134
135/*
136 * update the percpu scd from the raw @now value
137 *
138 * - filter out backward motion
139 * - use the GTOD tick value to create a window to filter crazy TSC values
140 */
141static u64 sched_clock_local(struct sched_clock_data *scd)
142{
143 u64 now, clock, old_clock, min_clock, max_clock;
144 s64 delta;
145
146again:
147 now = sched_clock();
148 delta = now - scd->tick_raw;
149 if (unlikely(delta < 0))
150 delta = 0;
151
152 old_clock = scd->clock;
153
154 /*
155 * scd->clock = clamp(scd->tick_gtod + delta,
156 * max(scd->tick_gtod, scd->clock),
157 * scd->tick_gtod + TICK_NSEC);
158 */
159
160 clock = scd->tick_gtod + delta;
161 min_clock = wrap_max(scd->tick_gtod, old_clock);
162 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
163
164 clock = wrap_max(clock, min_clock);
165 clock = wrap_min(clock, max_clock);
166
167 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
168 goto again;
169
170 return clock;
171}
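
A short walk-through of the clamp window above, with invented numbers:

/*
 * Example: tick_gtod = 1000, previous scd->clock = 1500, delta = 300,
 * TICK_NSEC = 1000000:
 *
 *	clock     = 1000 + 300         = 1300
 *	min_clock = max(1000, 1500)    = 1500
 *	max_clock = max(1500, 1001000) = 1001000
 *	result    = clamp(1300, 1500, 1001000) = 1500
 *
 * i.e. the filtered clock refuses to step backwards past the value
 * returned last time, and can advance at most one tick past tick_gtod.
 */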
172
173static u64 sched_clock_remote(struct sched_clock_data *scd)
174{
175 struct sched_clock_data *my_scd = this_scd();
176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val;
178
179 sched_clock_local(my_scd);
180again:
181 this_clock = my_scd->clock;
182 remote_clock = scd->clock;
183
184 /*
185 * Use the opportunity that we have both locks
186 * taken to couple the two clocks: we take the
187 * larger time as the latest time for both
188 * runqueues. (this creates monotonic movement)
189 */
190 if (likely((s64)(remote_clock - this_clock) < 0)) {
191 ptr = &scd->clock;
192 old_val = remote_clock;
193 val = this_clock;
194 } else {
195 /*
196 * Should be rare, but possible:
197 */
198 ptr = &my_scd->clock;
199 old_val = this_clock;
200 val = remote_clock;
201 }
202
203 if (cmpxchg64(ptr, old_val, val) != old_val)
204 goto again;
205
206 return val;
207}
208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
214u64 sched_clock_cpu(int cpu)
215{
216 struct sched_clock_data *scd;
217 u64 clock;
218
219 WARN_ON_ONCE(!irqs_disabled());
220
221 if (sched_clock_stable)
222 return sched_clock();
223
224 if (unlikely(!sched_clock_running))
225 return 0ull;
226
227 scd = cpu_sdc(cpu);
228
229 if (cpu != smp_processor_id())
230 clock = sched_clock_remote(scd);
231 else
232 clock = sched_clock_local(scd);
233
234 return clock;
235}
236
237void sched_clock_tick(void)
238{
239 struct sched_clock_data *scd;
240 u64 now, now_gtod;
241
242 if (sched_clock_stable)
243 return;
244
245 if (unlikely(!sched_clock_running))
246 return;
247
248 WARN_ON_ONCE(!irqs_disabled());
249
250 scd = this_scd();
251 now_gtod = ktime_to_ns(ktime_get());
252 now = sched_clock();
253
254 scd->tick_raw = now;
255 scd->tick_gtod = now_gtod;
256 sched_clock_local(scd);
257}
258
259/*
260 * We are going deep-idle (irqs are disabled):
261 */
262void sched_clock_idle_sleep_event(void)
263{
264 sched_clock_cpu(smp_processor_id());
265}
266EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
267
268/*
269 * We just idled delta nanoseconds (called with irqs disabled):
270 */
271void sched_clock_idle_wakeup_event(u64 delta_ns)
272{
273 if (timekeeping_suspended)
274 return;
275
276 sched_clock_tick();
277 touch_softlockup_watchdog();
278}
279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
280
281/*
282  * As outlined at the top, cpu_clock() provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
292{
293 u64 clock;
294 unsigned long flags;
295
296 local_irq_save(flags);
297 clock = sched_clock_cpu(cpu);
298 local_irq_restore(flags);
299
300 return clock;
301}
302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305  * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
323
324void sched_clock_init(void)
325{
326 sched_clock_running = 1;
327}
328
329u64 sched_clock_cpu(int cpu)
330{
331 if (unlikely(!sched_clock_running))
332 return 0;
333
334 return sched_clock();
335}
336
337u64 cpu_clock(int cpu)
338{
339 return sched_clock_cpu(cpu);
340}
341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
348
349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
deleted file mode 100644
index 257002c13bb..00000000000
--- a/kernel/sched/core.c
+++ /dev/null
@@ -1,8162 +0,0 @@
1/*
2 * kernel/sched/core.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
27 */
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76
77#include <asm/switch_to.h>
78#include <asm/tlb.h>
79#include <asm/irq_regs.h>
80#include <asm/mutex.h>
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#endif
84
85#include "sched.h"
86#include "../workqueue_sched.h"
87#include "../smpboot.h"
88
89#define CREATE_TRACE_POINTS
90#include <trace/events/sched.h>
91
92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
93{
94 unsigned long delta;
95 ktime_t soft, hard, now;
96
97 for (;;) {
98 if (hrtimer_active(period_timer))
99 break;
100
101 now = hrtimer_cb_get_time(period_timer);
102 hrtimer_forward(period_timer, now, period);
103
104 soft = hrtimer_get_softexpires(period_timer);
105 hard = hrtimer_get_expires(period_timer);
106 delta = ktime_to_ns(ktime_sub(hard, soft));
107 __hrtimer_start_range_ns(period_timer, soft, delta,
108 HRTIMER_MODE_ABS_PINNED, 0);
109 }
110}
111
112DEFINE_MUTEX(sched_domains_mutex);
113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
114
115static void update_rq_clock_task(struct rq *rq, s64 delta);
116
117void update_rq_clock(struct rq *rq)
118{
119 s64 delta;
120
121 if (rq->skip_clock_update > 0)
122 return;
123
124 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
125 rq->clock += delta;
126 update_rq_clock_task(rq, delta);
127}
128
129/*
130 * Debugging: various feature bits
131 */
132
133#define SCHED_FEAT(name, enabled) \
134 (1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138 0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled) \
144 #name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154 int i;
155
156 for (i = 0; i < __SCHED_FEAT_NR; i++) {
157 if (!(sysctl_sched_features & (1UL << i)))
158 seq_puts(m, "NO_");
159 seq_printf(m, "%s ", sched_feat_names[i]);
160 }
161 seq_puts(m, "\n");
162
163 return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled) \
172 jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
180static void sched_feat_disable(int i)
181{
182 if (static_key_enabled(&sched_feat_keys[i]))
183 static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188 if (!static_key_enabled(&sched_feat_keys[i]))
189 static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif /* HAVE_JUMP_LABEL */
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
240 if (i == __SCHED_FEAT_NR)
241 return -EINVAL;
242
243 *ppos += cnt;
244
245 return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250 return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254 .open = sched_feat_open,
255 .write = sched_feat_write,
256 .read = seq_read,
257 .llseek = seq_lseek,
258 .release = single_release,
259};
260
261static __init int sched_init_debug(void)
262{
263 debugfs_create_file("sched_features", 0644, NULL, NULL,
264 &sched_feat_fops);
265
266 return 0;
267}
268late_initcall(sched_init_debug);
269#endif /* CONFIG_SCHED_DEBUG */
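
Usage note: once debugfs is mounted, the sched_features file created above accepts the names printed by sched_feat_show(); writing a bare feature name such as TTWU_QUEUE enables it, and the same name prefixed with NO_ disables it, exactly as parsed by sched_feat_set().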
270
271/*
272 * Number of tasks to iterate in a single balance run.
273 * Limited because this is done with IRQs disabled.
274 */
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277/*
278 * period over which we average the RT time consumption, measured
279 * in ms.
280 *
281 * default: 1s
282 */
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285/*
286 * period over which we measure -rt task cpu usage in us.
287 * default: 1s
288 */
289unsigned int sysctl_sched_rt_period = 1000000;
290
291__read_mostly int scheduler_running;
292
293/*
294 * part of the period that we allow rt tasks to run in us.
295 * default: 0.95s
296 */
297int sysctl_sched_rt_runtime = 950000;
298
299
300
301/*
302 * __task_rq_lock - lock the rq @p resides on.
303 */
304static inline struct rq *__task_rq_lock(struct task_struct *p)
305 __acquires(rq->lock)
306{
307 struct rq *rq;
308
309 lockdep_assert_held(&p->pi_lock);
310
311 for (;;) {
312 rq = task_rq(p);
313 raw_spin_lock(&rq->lock);
314 if (likely(rq == task_rq(p)))
315 return rq;
316 raw_spin_unlock(&rq->lock);
317 }
318}
319
320/*
321 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
322 */
323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
324 __acquires(p->pi_lock)
325 __acquires(rq->lock)
326{
327 struct rq *rq;
328
329 for (;;) {
330 raw_spin_lock_irqsave(&p->pi_lock, *flags);
331 rq = task_rq(p);
332 raw_spin_lock(&rq->lock);
333 if (likely(rq == task_rq(p)))
334 return rq;
335 raw_spin_unlock(&rq->lock);
336 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
337 }
338}
339
340static void __task_rq_unlock(struct rq *rq)
341 __releases(rq->lock)
342{
343 raw_spin_unlock(&rq->lock);
344}
345
346static inline void
347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
348 __releases(rq->lock)
349 __releases(p->pi_lock)
350{
351 raw_spin_unlock(&rq->lock);
352 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
353}
354
355/*
356 * this_rq_lock - lock this runqueue and disable interrupts.
357 */
358static struct rq *this_rq_lock(void)
359 __acquires(rq->lock)
360{
361 struct rq *rq;
362
363 local_irq_disable();
364 rq = this_rq();
365 raw_spin_lock(&rq->lock);
366
367 return rq;
368}
369
370#ifdef CONFIG_SCHED_HRTICK
371/*
372 * Use HR-timers to deliver accurate preemption points.
373 *
374  * It's all a bit involved since we cannot program an hrt while holding the
375  * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */
381
382static void hrtick_clear(struct rq *rq)
383{
384 if (hrtimer_active(&rq->hrtick_timer))
385 hrtimer_cancel(&rq->hrtick_timer);
386}
387
388/*
389 * High-resolution timer tick.
390 * Runs from hardirq context with interrupts disabled.
391 */
392static enum hrtimer_restart hrtick(struct hrtimer *timer)
393{
394 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
395
396 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
397
398 raw_spin_lock(&rq->lock);
399 update_rq_clock(rq);
400 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
401 raw_spin_unlock(&rq->lock);
402
403 return HRTIMER_NORESTART;
404}
405
406#ifdef CONFIG_SMP
407/*
408 * called from hardirq (IPI) context
409 */
410static void __hrtick_start(void *arg)
411{
412 struct rq *rq = arg;
413
414 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer);
416 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock);
418}
419
420/*
421 * Called to set the hrtick timer state.
422 *
423 * called with rq->lock held and irqs disabled
424 */
425void hrtick_start(struct rq *rq, u64 delay)
426{
427 struct hrtimer *timer = &rq->hrtick_timer;
428 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
429
430 hrtimer_set_expires(timer, time);
431
432 if (rq == this_rq()) {
433 hrtimer_restart(timer);
434 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1;
437 }
438}
439
440static int
441hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
442{
443 int cpu = (int)(long)hcpu;
444
445 switch (action) {
446 case CPU_UP_CANCELED:
447 case CPU_UP_CANCELED_FROZEN:
448 case CPU_DOWN_PREPARE:
449 case CPU_DOWN_PREPARE_FROZEN:
450 case CPU_DEAD:
451 case CPU_DEAD_FROZEN:
452 hrtick_clear(cpu_rq(cpu));
453 return NOTIFY_OK;
454 }
455
456 return NOTIFY_DONE;
457}
458
459static __init void init_hrtick(void)
460{
461 hotcpu_notifier(hotplug_hrtick, 0);
462}
463#else
464/*
465 * Called to set the hrtick timer state.
466 *
467 * called with rq->lock held and irqs disabled
468 */
469void hrtick_start(struct rq *rq, u64 delay)
470{
471 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
472 HRTIMER_MODE_REL_PINNED, 0);
473}
474
475static inline void init_hrtick(void)
476{
477}
478#endif /* CONFIG_SMP */
479
480static void init_rq_hrtick(struct rq *rq)
481{
482#ifdef CONFIG_SMP
483 rq->hrtick_csd_pending = 0;
484
485 rq->hrtick_csd.flags = 0;
486 rq->hrtick_csd.func = __hrtick_start;
487 rq->hrtick_csd.info = rq;
488#endif
489
490 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
491 rq->hrtick_timer.function = hrtick;
492}
493#else /* CONFIG_SCHED_HRTICK */
494static inline void hrtick_clear(struct rq *rq)
495{
496}
497
498static inline void init_rq_hrtick(struct rq *rq)
499{
500}
501
502static inline void init_hrtick(void)
503{
504}
505#endif /* CONFIG_SCHED_HRTICK */
506
507/*
508 * resched_task - mark a task 'to be rescheduled now'.
509 *
510 * On UP this means the setting of the need_resched flag, on SMP it
511 * might also involve a cross-CPU call to trigger the scheduler on
512 * the target CPU.
513 */
514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
520void resched_task(struct task_struct *p)
521{
522 int cpu;
523
524 assert_raw_spin_locked(&task_rq(p)->lock);
525
526 if (test_tsk_need_resched(p))
527 return;
528
529 set_tsk_need_resched(p);
530
531 cpu = task_cpu(p);
532 if (cpu == smp_processor_id())
533 return;
534
535 /* NEED_RESCHED must be visible before we test polling */
536 smp_mb();
537 if (!tsk_is_polling(p))
538 smp_send_reschedule(cpu);
539}
540
541void resched_cpu(int cpu)
542{
543 struct rq *rq = cpu_rq(cpu);
544 unsigned long flags;
545
546 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
547 return;
548 resched_task(cpu_curr(cpu));
549 raw_spin_unlock_irqrestore(&rq->lock, flags);
550}
551
552#ifdef CONFIG_NO_HZ
553/*
554 * In the semi idle case, use the nearest busy cpu for migrating timers
555 * from an idle cpu. This is good for power-savings.
556 *
557 * We don't do similar optimization for completely idle system, as
558 * selecting an idle cpu will add more delays to the timers than intended
559 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
560 */
561int get_nohz_timer_target(void)
562{
563 int cpu = smp_processor_id();
564 int i;
565 struct sched_domain *sd;
566
567 rcu_read_lock();
568 for_each_domain(cpu, sd) {
569 for_each_cpu(i, sched_domain_span(sd)) {
570 if (!idle_cpu(i)) {
571 cpu = i;
572 goto unlock;
573 }
574 }
575 }
576unlock:
577 rcu_read_unlock();
578 return cpu;
579}
580/*
581 * When add_timer_on() enqueues a timer into the timer wheel of an
582 * idle CPU then this timer might expire before the next timer event
583 * which is scheduled to wake up that CPU. In case of a completely
584 * idle system the next event might even be infinite time into the
585 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
586 * leaves the inner idle loop so the newly added timer is taken into
587 * account when the CPU goes back to idle and evaluates the timer
588 * wheel for the next timer event.
589 */
590void wake_up_idle_cpu(int cpu)
591{
592 struct rq *rq = cpu_rq(cpu);
593
594 if (cpu == smp_processor_id())
595 return;
596
597 /*
598 * This is safe, as this function is called with the timer
599 * wheel base lock of (cpu) held. When the CPU is on the way
600 * to idle and has not yet set rq->curr to idle then it will
601 * be serialized on the timer wheel base lock and take the new
602 * timer into account automatically.
603 */
604 if (rq->curr != rq->idle)
605 return;
606
607 /*
608 * We can set TIF_RESCHED on the idle task of the other CPU
609 * lockless. The worst case is that the other CPU runs the
610 * idle task through an additional NOOP schedule()
611 */
612 set_tsk_need_resched(rq->idle);
613
614 /* NEED_RESCHED must be visible before we test polling */
615 smp_mb();
616 if (!tsk_is_polling(rq->idle))
617 smp_send_reschedule(cpu);
618}
619
620static inline bool got_nohz_idle_kick(void)
621{
622 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624}
625
626#else /* CONFIG_NO_HZ */
627
628static inline bool got_nohz_idle_kick(void)
629{
630 return false;
631}
632
633#endif /* CONFIG_NO_HZ */
634
635void sched_avg_update(struct rq *rq)
636{
637 s64 period = sched_avg_period();
638
639 while ((s64)(rq->clock - rq->age_stamp) > period) {
640 /*
641 * Inline assembly required to prevent the compiler
642 * optimising this loop into a divmod call.
643 * See __iter_div_u64_rem() for another example of this.
644 */
645 asm("" : "+rm" (rq->age_stamp));
646 rq->age_stamp += period;
647 rq->rt_avg /= 2;
648 }
649}
650
651#else /* !CONFIG_SMP */
652void resched_task(struct task_struct *p)
653{
654 assert_raw_spin_locked(&task_rq(p)->lock);
655 set_tsk_need_resched(p);
656}
657#endif /* CONFIG_SMP */
658
659#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
660 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
661/*
662 * Iterate task_group tree rooted at *from, calling @down when first entering a
663 * node and @up when leaving it for the final time.
664 *
665 * Caller must hold rcu_lock or sufficient equivalent.
666 */
667int walk_tg_tree_from(struct task_group *from,
668 tg_visitor down, tg_visitor up, void *data)
669{
670 struct task_group *parent, *child;
671 int ret;
672
673 parent = from;
674
675down:
676 ret = (*down)(parent, data);
677 if (ret)
678 goto out;
679 list_for_each_entry_rcu(child, &parent->children, siblings) {
680 parent = child;
681 goto down;
682
683up:
684 continue;
685 }
686 ret = (*up)(parent, data);
687 if (ret || parent == from)
688 goto out;
689
690 child = parent;
691 parent = parent->parent;
692 if (parent)
693 goto up;
694out:
695 return ret;
696}
697
698int tg_nop(struct task_group *tg, void *data)
699{
700 return 0;
701}
702#endif
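
A minimal sketch of how a caller might use walk_tg_tree_from() with a down/up visitor pair; tg_count_one() and example_count_groups() are hypothetical names, and tg_nop() above serves as the no-op up callback:

static int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* returning non-zero would abort the walk */
}

static int example_count_groups(struct task_group *from)
{
	int count = 0;

	rcu_read_lock();	/* the walk requires rcu_lock or equivalent */
	walk_tg_tree_from(from, tg_count_one, tg_nop, &count);
	rcu_read_unlock();

	return count;
}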
703
704static void set_load_weight(struct task_struct *p)
705{
706 int prio = p->static_prio - MAX_RT_PRIO;
707 struct load_weight *load = &p->se.load;
708
709 /*
710 * SCHED_IDLE tasks get minimal weight:
711 */
712 if (p->policy == SCHED_IDLE) {
713 load->weight = scale_load(WEIGHT_IDLEPRIO);
714 load->inv_weight = WMULT_IDLEPRIO;
715 return;
716 }
717
718 load->weight = scale_load(prio_to_weight[prio]);
719 load->inv_weight = prio_to_wmult[prio];
720}
721
722static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
723{
724 update_rq_clock(rq);
725 sched_info_queued(p);
726 p->sched_class->enqueue_task(rq, p, flags);
727}
728
729static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
730{
731 update_rq_clock(rq);
732 sched_info_dequeued(p);
733 p->sched_class->dequeue_task(rq, p, flags);
734}
735
736void activate_task(struct rq *rq, struct task_struct *p, int flags)
737{
738 if (task_contributes_to_load(p))
739 rq->nr_uninterruptible--;
740
741 enqueue_task(rq, p, flags);
742}
743
744void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
745{
746 if (task_contributes_to_load(p))
747 rq->nr_uninterruptible++;
748
749 dequeue_task(rq, p, flags);
750}
751
752static void update_rq_clock_task(struct rq *rq, s64 delta)
753{
754/*
755  * In theory, the compiler should just see 0 here, and optimize out the call
756 * to sched_rt_avg_update. But I don't trust it...
757 */
758#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
759 s64 steal = 0, irq_delta = 0;
760#endif
761#ifdef CONFIG_IRQ_TIME_ACCOUNTING
762 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
763
764 /*
765 * Since irq_time is only updated on {soft,}irq_exit, we might run into
766 * this case when a previous update_rq_clock() happened inside a
767 * {soft,}irq region.
768 *
769 * When this happens, we stop ->clock_task and only update the
770 * prev_irq_time stamp to account for the part that fit, so that a next
771 * update will consume the rest. This ensures ->clock_task is
772 * monotonic.
773 *
774  * It does however cause some slight misattribution of {soft,}irq
775 * time, a more accurate solution would be to update the irq_time using
776 * the current rq->clock timestamp, except that would require using
777 * atomic ops.
778 */
779 if (irq_delta > delta)
780 irq_delta = delta;
781
782 rq->prev_irq_time += irq_delta;
783 delta -= irq_delta;
784#endif
785#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
786 if (static_key_false((&paravirt_steal_rq_enabled))) {
787 u64 st;
788
789 steal = paravirt_steal_clock(cpu_of(rq));
790 steal -= rq->prev_steal_time_rq;
791
792 if (unlikely(steal > delta))
793 steal = delta;
794
795 st = steal_ticks(steal);
796 steal = st * TICK_NSEC;
797
798 rq->prev_steal_time_rq += steal;
799
800 delta -= steal;
801 }
802#endif
803
804 rq->clock_task += delta;
805
806#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
807 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
808 sched_rt_avg_update(rq, irq_delta + steal);
809#endif
810}
811
812void sched_set_stop_task(int cpu, struct task_struct *stop)
813{
814 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
815 struct task_struct *old_stop = cpu_rq(cpu)->stop;
816
817 if (stop) {
818 /*
819	 * Make it appear like a SCHED_FIFO task, it's something
820 * userspace knows about and won't get confused about.
821 *
822 * Also, it will make PI more or less work without too
823 * much confusion -- but then, stop work should not
824 * rely on PI working anyway.
825 */
826 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
827
828 stop->sched_class = &stop_sched_class;
829 }
830
831 cpu_rq(cpu)->stop = stop;
832
833 if (old_stop) {
834 /*
835 * Reset it back to a normal scheduling class so that
836 * it can die in pieces.
837 */
838 old_stop->sched_class = &rt_sched_class;
839 }
840}
841
842/*
843 * __normal_prio - return the priority that is based on the static prio
844 */
845static inline int __normal_prio(struct task_struct *p)
846{
847 return p->static_prio;
848}
849
850/*
851 * Calculate the expected normal priority: i.e. priority
852 * without taking RT-inheritance into account. Might be
853 * boosted by interactivity modifiers. Changes upon fork,
854 * setprio syscalls, and whenever the interactivity
855 * estimator recalculates.
856 */
857static inline int normal_prio(struct task_struct *p)
858{
859 int prio;
860
861 if (task_has_rt_policy(p))
862 prio = MAX_RT_PRIO-1 - p->rt_priority;
863 else
864 prio = __normal_prio(p);
865 return prio;
866}
867
868/*
869 * Calculate the current priority, i.e. the priority
870 * taken into account by the scheduler. This value might
871 * be boosted by RT tasks, or might be boosted by
872 * interactivity modifiers. Will be RT if the task got
873 * RT-boosted. If not then it returns p->normal_prio.
874 */
875static int effective_prio(struct task_struct *p)
876{
877 p->normal_prio = normal_prio(p);
878 /*
879 * If we are RT tasks or we were boosted to RT priority,
880 * keep the priority unchanged. Otherwise, update priority
881 * to the normal priority:
882 */
883 if (!rt_prio(p->prio))
884 return p->normal_prio;
885 return p->prio;
886}
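
A worked example of the priority mapping above, assuming the usual MAX_RT_PRIO of 100 and a nice-0 static priority of 120:

/*
 *  - SCHED_FIFO task with rt_priority 50:
 *	normal_prio() = MAX_RT_PRIO - 1 - 50 = 49 (an RT priority)
 *  - SCHED_NORMAL task at nice 0:
 *	normal_prio() = __normal_prio() = static_prio = 120
 *  - the same nice-0 task while PI-boosted to RT priority 49:
 *	effective_prio() keeps p->prio at 49 until the boost is dropped.
 */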
887
888/**
889 * task_curr - is this task currently executing on a CPU?
890 * @p: the task in question.
891 */
892inline int task_curr(const struct task_struct *p)
893{
894 return cpu_curr(task_cpu(p)) == p;
895}
896
897static inline void check_class_changed(struct rq *rq, struct task_struct *p,
898 const struct sched_class *prev_class,
899 int oldprio)
900{
901 if (prev_class != p->sched_class) {
902 if (prev_class->switched_from)
903 prev_class->switched_from(rq, p);
904 p->sched_class->switched_to(rq, p);
905 } else if (oldprio != p->prio)
906 p->sched_class->prio_changed(rq, p, oldprio);
907}
908
909void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
910{
911 const struct sched_class *class;
912
913 if (p->sched_class == rq->curr->sched_class) {
914 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
915 } else {
916 for_each_class(class) {
917 if (class == rq->curr->sched_class)
918 break;
919 if (class == p->sched_class) {
920 resched_task(rq->curr);
921 break;
922 }
923 }
924 }
925
926 /*
927 * A queue event has occurred, and we're going to schedule. In
928 * this case, we can save a useless back to back clock update.
929 */
930 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
931 rq->skip_clock_update = 1;
932}
933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
941#ifdef CONFIG_SMP
942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
943{
944#ifdef CONFIG_SCHED_DEBUG
945 /*
946 * We should never call set_task_cpu() on a blocked task,
947 * ttwu() will sort out the placement.
948 */
949 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
950 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
951
952#ifdef CONFIG_LOCKDEP
953 /*
954 * The caller should hold either p->pi_lock or rq->lock, when changing
955 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
956 *
957 * sched_move_task() holds both and thus holding either pins the cgroup,
958 * see task_group().
959 *
960 * Furthermore, all task_rq users should acquire both locks, see
961 * task_rq_lock().
962 */
963 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
964 lockdep_is_held(&task_rq(p)->lock)));
965#endif
966#endif
967
968 trace_sched_migrate_task(p, new_cpu);
969
970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
975 p->se.nr_migrations++;
976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
983 }
984
985 __set_task_cpu(p, new_cpu);
986}
987
988struct migration_arg {
989 struct task_struct *task;
990 int dest_cpu;
991};
992
993static int migration_cpu_stop(void *data);
994
995/*
996 * wait_task_inactive - wait for a thread to unschedule.
997 *
998 * If @match_state is nonzero, it's the @p->state value just checked and
999 * not expected to change. If it changes, i.e. @p might have woken up,
1000 * then return zero. When we succeed in waiting for @p to be off its CPU,
1001 * we return a positive number (its total switch count). If a second call
1002 * a short while later returns the same number, the caller can be sure that
1003 * @p has remained unscheduled the whole time.
1004 *
1005 * The caller must ensure that the task *will* unschedule sometime soon,
1006 * else this function might spin for a *long* time. This function can't
1007 * be called with interrupts off, or it may introduce deadlock with
1008 * smp_call_function() if an IPI is sent by the same process we are
1009 * waiting to become inactive.
1010 */
1011unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1012{
1013 unsigned long flags;
1014 int running, on_rq;
1015 unsigned long ncsw;
1016 struct rq *rq;
1017
1018 for (;;) {
1019 /*
1020 * We do the initial early heuristics without holding
1021 * any task-queue locks at all. We'll only try to get
1022 * the runqueue lock when things look like they will
1023 * work out!
1024 */
1025 rq = task_rq(p);
1026
1027 /*
1028 * If the task is actively running on another CPU
1029 * still, just relax and busy-wait without holding
1030 * any locks.
1031 *
1032 * NOTE! Since we don't hold any locks, it's not
1033 * even sure that "rq" stays as the right runqueue!
1034 * But we don't care, since "task_running()" will
1035 * return false if the runqueue has changed and p
1036 * is actually now running somewhere else!
1037 */
1038 while (task_running(rq, p)) {
1039 if (match_state && unlikely(p->state != match_state))
1040 return 0;
1041 cpu_relax();
1042 }
1043
1044 /*
1045 * Ok, time to look more closely! We need the rq
1046 * lock now, to be *sure*. If we're wrong, we'll
1047 * just go back and repeat.
1048 */
1049 rq = task_rq_lock(p, &flags);
1050 trace_sched_wait_task(p);
1051 running = task_running(rq, p);
1052 on_rq = p->on_rq;
1053 ncsw = 0;
1054 if (!match_state || p->state == match_state)
1055 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1056 task_rq_unlock(rq, p, &flags);
1057
1058 /*
1059 * If it changed from the expected state, bail out now.
1060 */
1061 if (unlikely(!ncsw))
1062 break;
1063
1064 /*
1065 * Was it really running after all now that we
1066 * checked with the proper locks actually held?
1067 *
1068 * Oops. Go back and try again..
1069 */
1070 if (unlikely(running)) {
1071 cpu_relax();
1072 continue;
1073 }
1074
1075 /*
1076 * It's not enough that it's not actively running,
1077 * it must be off the runqueue _entirely_, and not
1078 * preempted!
1079 *
1080 * So if it was still runnable (but just not actively
1081 * running right now), it's preempted, and we should
1082 * yield - it could be a while.
1083 */
1084 if (unlikely(on_rq)) {
1085 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1086
1087 set_current_state(TASK_UNINTERRUPTIBLE);
1088 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1089 continue;
1090 }
1091
1092 /*
1093 * Ahh, all good. It wasn't running, and it wasn't
1094 * runnable, which means that it will never become
1095 * running in the future either. We're all done!
1096 */
1097 break;
1098 }
1099
1100 return ncsw;
1101}
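
A minimal sketch of the double-sample pattern described in the comment above; example_stayed_off_cpu() is a hypothetical caller:

static bool example_stayed_off_cpu(struct task_struct *p)
{
	unsigned long ncsw;

	ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
	if (!ncsw)
		return false;	/* @p changed state, give up */

	/* ... act on @p while it is expected to stay blocked ... */

	return wait_task_inactive(p, TASK_UNINTERRUPTIBLE) == ncsw;
}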
1102
1103/***
1104 * kick_process - kick a running thread to enter/exit the kernel
1105 * @p: the to-be-kicked thread
1106 *
1107 * Cause a process which is running on another CPU to enter
1108 * kernel-mode, without any delay. (to get signals handled.)
1109 *
1110 * NOTE: this function doesn't have to take the runqueue lock,
1111 * because all it wants to ensure is that the remote task enters
1112 * the kernel. If the IPI races and the task has been migrated
1113 * to another CPU then no harm is done and the purpose has been
1114 * achieved as well.
1115 */
1116void kick_process(struct task_struct *p)
1117{
1118 int cpu;
1119
1120 preempt_disable();
1121 cpu = task_cpu(p);
1122 if ((cpu != smp_processor_id()) && task_curr(p))
1123 smp_send_reschedule(cpu);
1124 preempt_enable();
1125}
1126EXPORT_SYMBOL_GPL(kick_process);
1127#endif /* CONFIG_SMP */
1128
1129#ifdef CONFIG_SMP
1130/*
1131 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1132 */
1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1136 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu;
1138
1139 /* Look for allowed, online CPU in same node. */
1140 for_each_cpu(dest_cpu, nodemask) {
1141 if (!cpu_online(dest_cpu))
1142 continue;
1143 if (!cpu_active(dest_cpu))
1144 continue;
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1146 return dest_cpu;
1147 }
1148
1149 for (;;) {
1150 /* Any allowed, online CPU? */
1151 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1152 if (!cpu_online(dest_cpu))
1153 continue;
1154 if (!cpu_active(dest_cpu))
1155 continue;
1156 goto out;
1157 }
1158
1159 switch (state) {
1160 case cpuset:
1161 /* No more Mr. Nice Guy. */
1162 cpuset_cpus_allowed_fallback(p);
1163 state = possible;
1164 break;
1165
1166 case possible:
1167 do_set_cpus_allowed(p, cpu_possible_mask);
1168 state = fail;
1169 break;
1170
1171 case fail:
1172 BUG();
1173 break;
1174 }
1175 }
1176
1177out:
1178 if (state != cpuset) {
1179 /*
1180 * Don't tell them about moving exiting tasks or
1181 * kernel threads (both mm NULL), since they never
1182 * leave kernel.
1183 */
1184 if (p->mm && printk_ratelimit()) {
1185 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1186 task_pid_nr(p), p->comm, cpu);
1187 }
1188 }
1189
1190 return dest_cpu;
1191}
1192
1193/*
1194 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1195 */
1196static inline
1197int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1198{
1199 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1200
1201 /*
1202 * In order not to call set_task_cpu() on a blocking task we need
1203 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1204 * cpu.
1205 *
1206 * Since this is common to all placement strategies, this lives here.
1207 *
1208 * [ this allows ->select_task() to simply return task_cpu(p) and
1209 * not worry about this generic constraint ]
1210 */
1211 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1212 !cpu_online(cpu)))
1213 cpu = select_fallback_rq(task_cpu(p), p);
1214
1215 return cpu;
1216}
1217
1218static void update_avg(u64 *avg, u64 sample)
1219{
1220 s64 diff = sample - *avg;
1221 *avg += diff >> 3;
1222}
1223#endif
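
update_avg() above is an exponential moving average with weight 1/8; for example:

/*
 * With *avg == 1000 and sample == 1800: diff == 800 and the new
 * average is 1000 + (800 >> 3) == 1100, i.e. each sample pulls the
 * average one eighth of the way towards itself.
 */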
1224
1225static void
1226ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1227{
1228#ifdef CONFIG_SCHEDSTATS
1229 struct rq *rq = this_rq();
1230
1231#ifdef CONFIG_SMP
1232 int this_cpu = smp_processor_id();
1233
1234 if (cpu == this_cpu) {
1235 schedstat_inc(rq, ttwu_local);
1236 schedstat_inc(p, se.statistics.nr_wakeups_local);
1237 } else {
1238 struct sched_domain *sd;
1239
1240 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1241 rcu_read_lock();
1242 for_each_domain(this_cpu, sd) {
1243 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1244 schedstat_inc(sd, ttwu_wake_remote);
1245 break;
1246 }
1247 }
1248 rcu_read_unlock();
1249 }
1250
1251 if (wake_flags & WF_MIGRATED)
1252 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1253
1254#endif /* CONFIG_SMP */
1255
1256 schedstat_inc(rq, ttwu_count);
1257 schedstat_inc(p, se.statistics.nr_wakeups);
1258
1259 if (wake_flags & WF_SYNC)
1260 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1261
1262#endif /* CONFIG_SCHEDSTATS */
1263}
1264
1265static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1266{
1267 activate_task(rq, p, en_flags);
1268 p->on_rq = 1;
1269
1270 /* if a worker is waking up, notify workqueue */
1271 if (p->flags & PF_WQ_WORKER)
1272 wq_worker_waking_up(p, cpu_of(rq));
1273}
1274
1275/*
1276 * Mark the task runnable and perform wakeup-preemption.
1277 */
1278static void
1279ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1280{
1281 trace_sched_wakeup(p, true);
1282 check_preempt_curr(rq, p, wake_flags);
1283
1284 p->state = TASK_RUNNING;
1285#ifdef CONFIG_SMP
1286 if (p->sched_class->task_woken)
1287 p->sched_class->task_woken(rq, p);
1288
1289 if (rq->idle_stamp) {
1290 u64 delta = rq->clock - rq->idle_stamp;
1291 u64 max = 2*sysctl_sched_migration_cost;
1292
1293 if (delta > max)
1294 rq->avg_idle = max;
1295 else
1296 update_avg(&rq->avg_idle, delta);
1297 rq->idle_stamp = 0;
1298 }
1299#endif
1300}
1301
1302static void
1303ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1304{
1305#ifdef CONFIG_SMP
1306 if (p->sched_contributes_to_load)
1307 rq->nr_uninterruptible--;
1308#endif
1309
1310 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1311 ttwu_do_wakeup(rq, p, wake_flags);
1312}
1313
1314/*
1315  * Called in case the task @p isn't fully descheduled from its runqueue;
1316  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1317 * since all we need to do is flip p->state to TASK_RUNNING, since
1318 * the task is still ->on_rq.
1319 */
1320static int ttwu_remote(struct task_struct *p, int wake_flags)
1321{
1322 struct rq *rq;
1323 int ret = 0;
1324
1325 rq = __task_rq_lock(p);
1326 if (p->on_rq) {
1327 ttwu_do_wakeup(rq, p, wake_flags);
1328 ret = 1;
1329 }
1330 __task_rq_unlock(rq);
1331
1332 return ret;
1333}
1334
1335#ifdef CONFIG_SMP
1336static void sched_ttwu_pending(void)
1337{
1338 struct rq *rq = this_rq();
1339 struct llist_node *llist = llist_del_all(&rq->wake_list);
1340 struct task_struct *p;
1341
1342 raw_spin_lock(&rq->lock);
1343
1344 while (llist) {
1345 p = llist_entry(llist, struct task_struct, wake_entry);
1346 llist = llist_next(llist);
1347 ttwu_do_activate(rq, p, 0);
1348 }
1349
1350 raw_spin_unlock(&rq->lock);
1351}
1352
1353void scheduler_ipi(void)
1354{
1355 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1356 return;
1357
1358 /*
1359 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1360 * traditionally all their work was done from the interrupt return
1361 * path. Now that we actually do some work, we need to make sure
1362 * we do call them.
1363 *
1364 * Some archs already do call them, luckily irq_enter/exit nest
1365 * properly.
1366 *
1367 * Arguably we should visit all archs and update all handlers,
1368 * however a fair share of IPIs are still resched only so this would
1369 * somewhat pessimize the simple resched case.
1370 */
1371 irq_enter();
1372 sched_ttwu_pending();
1373
1374 /*
1375 * Check if someone kicked us for doing the nohz idle load balance.
1376 */
1377 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1378 this_rq()->idle_balance = 1;
1379 raise_softirq_irqoff(SCHED_SOFTIRQ);
1380 }
1381 irq_exit();
1382}
1383
1384static void ttwu_queue_remote(struct task_struct *p, int cpu)
1385{
1386 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1387 smp_send_reschedule(cpu);
1388}
1389
1390bool cpus_share_cache(int this_cpu, int that_cpu)
1391{
1392 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1393}
1394#endif /* CONFIG_SMP */
1395
1396static void ttwu_queue(struct task_struct *p, int cpu)
1397{
1398 struct rq *rq = cpu_rq(cpu);
1399
1400#if defined(CONFIG_SMP)
1401 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1402 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1403 ttwu_queue_remote(p, cpu);
1404 return;
1405 }
1406#endif
1407
1408 raw_spin_lock(&rq->lock);
1409 ttwu_do_activate(rq, p, 0);
1410 raw_spin_unlock(&rq->lock);
1411}
1412
1413/**
1414 * try_to_wake_up - wake up a thread
1415 * @p: the thread to be awakened
1416 * @state: the mask of task states that can be woken
1417 * @wake_flags: wake modifier flags (WF_*)
1418 *
1419 * Put it on the run-queue if it's not already there. The "current"
1420 * thread is always on the run-queue (except when the actual
1421 * re-schedule is in progress), and as such you're allowed to do
1422 * the simpler "current->state = TASK_RUNNING" to mark yourself
1423 * runnable without the overhead of this.
1424 *
1425 * Returns %true if @p was woken up, %false if it was already running
1426 * or @state didn't match @p's state.
1427 */
1428static int
1429try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1430{
1431 unsigned long flags;
1432 int cpu, success = 0;
1433
1434 smp_wmb();
1435 raw_spin_lock_irqsave(&p->pi_lock, flags);
1436 if (!(p->state & state))
1437 goto out;
1438
1439 success = 1; /* we're going to change ->state */
1440 cpu = task_cpu(p);
1441
1442 if (p->on_rq && ttwu_remote(p, wake_flags))
1443 goto stat;
1444
1445#ifdef CONFIG_SMP
1446 /*
1447 * If the owning (remote) cpu is still in the middle of schedule() with
1448	 * this task as prev, wait until it's done referencing the task.
1449 */
1450 while (p->on_cpu)
1451 cpu_relax();
1452 /*
1453 * Pairs with the smp_wmb() in finish_lock_switch().
1454 */
1455 smp_rmb();
1456
1457 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1458 p->state = TASK_WAKING;
1459
1460 if (p->sched_class->task_waking)
1461 p->sched_class->task_waking(p);
1462
1463 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1464 if (task_cpu(p) != cpu) {
1465 wake_flags |= WF_MIGRATED;
1466 set_task_cpu(p, cpu);
1467 }
1468#endif /* CONFIG_SMP */
1469
1470 ttwu_queue(p, cpu);
1471stat:
1472 ttwu_stat(p, cpu, wake_flags);
1473out:
1474 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1475
1476 return success;
1477}
1478
1479/**
1480 * try_to_wake_up_local - try to wake up a local task with rq lock held
1481 * @p: the thread to be awakened
1482 *
1483 * Put @p on the run-queue if it's not already there. The caller must
1484 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1485 * the current task.
1486 */
1487static void try_to_wake_up_local(struct task_struct *p)
1488{
1489 struct rq *rq = task_rq(p);
1490
1491 BUG_ON(rq != this_rq());
1492 BUG_ON(p == current);
1493 lockdep_assert_held(&rq->lock);
1494
1495 if (!raw_spin_trylock(&p->pi_lock)) {
1496 raw_spin_unlock(&rq->lock);
1497 raw_spin_lock(&p->pi_lock);
1498 raw_spin_lock(&rq->lock);
1499 }
1500
1501 if (!(p->state & TASK_NORMAL))
1502 goto out;
1503
1504 if (!p->on_rq)
1505 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1506
1507 ttwu_do_wakeup(rq, p, 0);
1508 ttwu_stat(p, smp_processor_id(), 0);
1509out:
1510 raw_spin_unlock(&p->pi_lock);
1511}
1512
1513/**
1514 * wake_up_process - Wake up a specific process
1515 * @p: The process to be woken up.
1516 *
1517 * Attempt to wake up the nominated process and move it to the set of runnable
1518 * processes. Returns 1 if the process was woken up, 0 if it was already
1519 * running.
1520 *
1521 * It may be assumed that this function implies a write memory barrier before
1522 * changing the task state if and only if any tasks are woken up.
1523 */
1524int wake_up_process(struct task_struct *p)
1525{
1526 return try_to_wake_up(p, TASK_ALL, 0);
1527}
1528EXPORT_SYMBOL(wake_up_process);
1529
1530int wake_up_state(struct task_struct *p, unsigned int state)
1531{
1532 return try_to_wake_up(p, state, 0);
1533}
1534
1535/*
1536 * Perform scheduler related setup for a newly forked process p.
1537 * p is forked by current.
1538 *
1539 * __sched_fork() is basic setup used by init_idle() too:
1540 */
1541static void __sched_fork(struct task_struct *p)
1542{
1543 p->on_rq = 0;
1544
1545 p->se.on_rq = 0;
1546 p->se.exec_start = 0;
1547 p->se.sum_exec_runtime = 0;
1548 p->se.prev_sum_exec_runtime = 0;
1549 p->se.nr_migrations = 0;
1550 p->se.vruntime = 0;
1551 INIT_LIST_HEAD(&p->se.group_node);
1552
1553/*
1554  * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may be
1555  * removed when it becomes useful for applications beyond shares distribution (e.g.
1556 * load-balance).
1557 */
1558#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1559 p->se.avg.runnable_avg_period = 0;
1560 p->se.avg.runnable_avg_sum = 0;
1561#endif
1562#ifdef CONFIG_SCHEDSTATS
1563 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1564#endif
1565
1566 INIT_LIST_HEAD(&p->rt.run_list);
1567
1568#ifdef CONFIG_PREEMPT_NOTIFIERS
1569 INIT_HLIST_HEAD(&p->preempt_notifiers);
1570#endif
1571
1572#ifdef CONFIG_NUMA_BALANCING
1573 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1574 p->mm->numa_next_scan = jiffies;
1575 p->mm->numa_next_reset = jiffies;
1576 p->mm->numa_scan_seq = 0;
1577 }
1578
1579 p->node_stamp = 0ULL;
1580 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1581 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1582 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1583 p->numa_work.next = &p->numa_work;
1584#endif /* CONFIG_NUMA_BALANCING */
1585}
1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1595}
1596#else
1597__read_mostly bool numabalancing_enabled;
1598
1599void set_numabalancing_state(bool enabled)
1600{
1601 numabalancing_enabled = enabled;
1602}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1605
1606/*
1607 * fork()/clone()-time setup:
1608 */
1609void sched_fork(struct task_struct *p)
1610{
1611 unsigned long flags;
1612 int cpu = get_cpu();
1613
1614 __sched_fork(p);
1615 /*
1616 * We mark the process as running here. This guarantees that
1617 * nobody will actually run it, and a signal or other external
1618 * event cannot wake it up and insert it on the runqueue either.
1619 */
1620 p->state = TASK_RUNNING;
1621
1622 /*
1623 * Make sure we do not leak PI boosting priority to the child.
1624 */
1625 p->prio = current->normal_prio;
1626
1627 /*
1628 * Revert to default priority/policy on fork if requested.
1629 */
1630 if (unlikely(p->sched_reset_on_fork)) {
1631 if (task_has_rt_policy(p)) {
1632 p->policy = SCHED_NORMAL;
1633 p->static_prio = NICE_TO_PRIO(0);
1634 p->rt_priority = 0;
1635 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1636 p->static_prio = NICE_TO_PRIO(0);
1637
1638 p->prio = p->normal_prio = __normal_prio(p);
1639 set_load_weight(p);
1640
1641 /*
1642 * We don't need the reset flag anymore after the fork. It has
1643 * fulfilled its duty:
1644 */
1645 p->sched_reset_on_fork = 0;
1646 }
1647
1648 if (!rt_prio(p->prio))
1649 p->sched_class = &fair_sched_class;
1650
1651 if (p->sched_class->task_fork)
1652 p->sched_class->task_fork(p);
1653
1654 /*
1655 * The child is not yet in the pid-hash so no cgroup attach races,
1656 * and the cgroup is pinned to this child because cgroup_fork()
1657 * runs before sched_fork().
1658 *
1659 * Silence PROVE_RCU.
1660 */
1661 raw_spin_lock_irqsave(&p->pi_lock, flags);
1662 set_task_cpu(p, cpu);
1663 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1664
1665#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1666 if (likely(sched_info_on()))
1667 memset(&p->sched_info, 0, sizeof(p->sched_info));
1668#endif
1669#if defined(CONFIG_SMP)
1670 p->on_cpu = 0;
1671#endif
1672#ifdef CONFIG_PREEMPT_COUNT
1673 /* Want to start with kernel preemption disabled. */
1674 task_thread_info(p)->preempt_count = 1;
1675#endif
1676#ifdef CONFIG_SMP
1677 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1678#endif
1679
1680 put_cpu();
1681}
1682
1683/*
1684 * wake_up_new_task - wake up a newly created task for the first time.
1685 *
1686 * This function will do some initial scheduler statistics housekeeping
1687 * that must be done for every newly created context, then puts the task
1688 * on the runqueue and wakes it.
1689 */
1690void wake_up_new_task(struct task_struct *p)
1691{
1692 unsigned long flags;
1693 struct rq *rq;
1694
1695 raw_spin_lock_irqsave(&p->pi_lock, flags);
1696#ifdef CONFIG_SMP
1697 /*
1698 * Fork balancing, do it here and not earlier because:
1699 * - cpus_allowed can change in the fork path
1700 * - any previously selected cpu might disappear through hotplug
1701 */
1702 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1703#endif
1704
1705 rq = __task_rq_lock(p);
1706 activate_task(rq, p, 0);
1707 p->on_rq = 1;
1708 trace_sched_wakeup_new(p, true);
1709 check_preempt_curr(rq, p, WF_FORK);
1710#ifdef CONFIG_SMP
1711 if (p->sched_class->task_woken)
1712 p->sched_class->task_woken(rq, p);
1713#endif
1714 task_rq_unlock(rq, p, &flags);
1715}
1716
1717#ifdef CONFIG_PREEMPT_NOTIFIERS
1718
1719/**
1720 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1721 * @notifier: notifier struct to register
1722 */
1723void preempt_notifier_register(struct preempt_notifier *notifier)
1724{
1725 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1726}
1727EXPORT_SYMBOL_GPL(preempt_notifier_register);
1728
1729/**
1730 * preempt_notifier_unregister - no longer interested in preemption notifications
1731 * @notifier: notifier struct to unregister
1732 *
1733 * This is safe to call from within a preemption notifier.
1734 */
1735void preempt_notifier_unregister(struct preempt_notifier *notifier)
1736{
1737 hlist_del(&notifier->link);
1738}
1739EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
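
/*
 * A minimal usage sketch, not part of this file: a caller supplies its own
 * preempt_ops and registers a notifier for the current task (the my_*
 * names below are placeholders; preempt_notifier_init() is the
 * <linux/preempt.h> helper that points the notifier at its ops):
 *
 *	static void my_sched_in(struct preempt_notifier *pn, int cpu) { ... }
 *	static void my_sched_out(struct preempt_notifier *pn,
 *				 struct task_struct *next) { ... }
 *	static struct preempt_ops my_ops = {
 *		.sched_in	= my_sched_in,
 *		.sched_out	= my_sched_out,
 *	};
 *
 *	preempt_notifier_init(&my_notifier, &my_ops);
 *	preempt_notifier_register(&my_notifier);
 *
 * This mirrors how KVM tracks vcpu load/put across preemption.
 */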
1740
1741static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1742{
1743 struct preempt_notifier *notifier;
1744 struct hlist_node *node;
1745
1746 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1747 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1748}
1749
1750static void
1751fire_sched_out_preempt_notifiers(struct task_struct *curr,
1752 struct task_struct *next)
1753{
1754 struct preempt_notifier *notifier;
1755 struct hlist_node *node;
1756
1757 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1758 notifier->ops->sched_out(notifier, next);
1759}
1760
1761#else /* !CONFIG_PREEMPT_NOTIFIERS */
1762
1763static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1764{
1765}
1766
1767static void
1768fire_sched_out_preempt_notifiers(struct task_struct *curr,
1769 struct task_struct *next)
1770{
1771}
1772
1773#endif /* CONFIG_PREEMPT_NOTIFIERS */
1774
1775/**
1776 * prepare_task_switch - prepare to switch tasks
1777 * @rq: the runqueue preparing to switch
1778 * @prev: the current task that is being switched out
1779 * @next: the task we are going to switch to.
1780 *
1781 * This is called with the rq lock held and interrupts off. It must
1782 * be paired with a subsequent finish_task_switch after the context
1783 * switch.
1784 *
1785 * prepare_task_switch sets up locking and calls architecture specific
1786 * hooks.
1787 */
1788static inline void
1789prepare_task_switch(struct rq *rq, struct task_struct *prev,
1790 struct task_struct *next)
1791{
1792 trace_sched_switch(prev, next);
1793 sched_info_switch(prev, next);
1794 perf_event_task_sched_out(prev, next);
1795 fire_sched_out_preempt_notifiers(prev, next);
1796 prepare_lock_switch(rq, next);
1797 prepare_arch_switch(next);
1798}
1799
1800/**
1801 * finish_task_switch - clean up after a task-switch
1802 * @rq: runqueue associated with task-switch
1803 * @prev: the thread we just switched away from.
1804 *
1805 * finish_task_switch must be called after the context switch, paired
1806 * with a prepare_task_switch call before the context switch.
1807 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1808 * and do any other architecture-specific cleanup actions.
1809 *
1810 * Note that we may have delayed dropping an mm in context_switch(). If
1811 * so, we finish that here outside of the runqueue lock. (Doing it
1812 * with the lock held can cause deadlocks; see schedule() for
1813 * details.)
1814 */
1815static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1816 __releases(rq->lock)
1817{
1818 struct mm_struct *mm = rq->prev_mm;
1819 long prev_state;
1820
1821 rq->prev_mm = NULL;
1822
1823 /*
1824 * A task struct has one reference for the use as "current".
1825 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1826 * schedule one last time. The schedule call will never return, and
1827 * the scheduled task must drop that reference.
1828 * The test for TASK_DEAD must occur while the runqueue locks are
1829 * still held, otherwise prev could be scheduled on another cpu, die
1830 * there before we look at prev->state, and then the reference would
1831 * be dropped twice.
1832 * Manfred Spraul <manfred@colorfullife.com>
1833 */
1834 prev_state = prev->state;
1835 vtime_task_switch(prev);
1836 finish_arch_switch(prev);
1837 perf_event_task_sched_in(prev, current);
1838 finish_lock_switch(rq, prev);
1839 finish_arch_post_lock_switch();
1840
1841 fire_sched_in_preempt_notifiers(current);
1842 if (mm)
1843 mmdrop(mm);
1844 if (unlikely(prev_state == TASK_DEAD)) {
1845 /*
1846 * Remove function-return probe instances associated with this
1847 * task and put them back on the free list.
1848 */
1849 kprobe_flush_task(prev);
1850 put_task_struct(prev);
1851 }
1852}
1853
1854#ifdef CONFIG_SMP
1855
1856/* assumes rq->lock is held */
1857static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1858{
1859 if (prev->sched_class->pre_schedule)
1860 prev->sched_class->pre_schedule(rq, prev);
1861}
1862
1863/* rq->lock is NOT held, but preemption is disabled */
1864static inline void post_schedule(struct rq *rq)
1865{
1866 if (rq->post_schedule) {
1867 unsigned long flags;
1868
1869 raw_spin_lock_irqsave(&rq->lock, flags);
1870 if (rq->curr->sched_class->post_schedule)
1871 rq->curr->sched_class->post_schedule(rq);
1872 raw_spin_unlock_irqrestore(&rq->lock, flags);
1873
1874 rq->post_schedule = 0;
1875 }
1876}
1877
1878#else
1879
1880static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1881{
1882}
1883
1884static inline void post_schedule(struct rq *rq)
1885{
1886}
1887
1888#endif
1889
1890/**
1891 * schedule_tail - first thing a freshly forked thread must call.
1892 * @prev: the thread we just switched away from.
1893 */
1894asmlinkage void schedule_tail(struct task_struct *prev)
1895 __releases(rq->lock)
1896{
1897 struct rq *rq = this_rq();
1898
1899 finish_task_switch(rq, prev);
1900
1901 /*
1902 * FIXME: do we need to worry about rq being invalidated by the
1903 * task_switch?
1904 */
1905 post_schedule(rq);
1906
1907#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1908 /* In this case, finish_task_switch does not reenable preemption */
1909 preempt_enable();
1910#endif
1911 if (current->set_child_tid)
1912 put_user(task_pid_vnr(current), current->set_child_tid);
1913}
1914
1915/*
1916 * context_switch - switch to the new MM and the new
1917 * thread's register state.
1918 */
1919static inline void
1920context_switch(struct rq *rq, struct task_struct *prev,
1921 struct task_struct *next)
1922{
1923 struct mm_struct *mm, *oldmm;
1924
1925 prepare_task_switch(rq, prev, next);
1926
1927 mm = next->mm;
1928 oldmm = prev->active_mm;
1929 /*
1930 * For paravirt, this is coupled with an exit in switch_to to
1931 * combine the page table reload and the switch backend into
1932 * one hypercall.
1933 */
1934 arch_start_context_switch(prev);
1935
1936 if (!mm) {
1937 next->active_mm = oldmm;
1938 atomic_inc(&oldmm->mm_count);
1939 enter_lazy_tlb(oldmm, next);
1940 } else
1941 switch_mm(oldmm, mm, next);
1942
1943 if (!prev->mm) {
1944 prev->active_mm = NULL;
1945 rq->prev_mm = oldmm;
1946 }
1947 /*
1948 * The runqueue lock will be released by the next
1949 * task (which is an invalid locking op, but in the case
1950 * of the scheduler it's an obvious special case), so we
1951 * do an early lockdep release here:
1952 */
1953#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1954 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1955#endif
1956
1957 context_tracking_task_switch(prev, next);
1958 /* Here we just switch the register state and the stack. */
1959 switch_to(prev, next, prev);
1960
1961 barrier();
1962 /*
1963 * this_rq must be evaluated again because prev may have moved
1964 * CPUs since it called schedule(), thus the 'rq' on its stack
1965 * frame will be invalid.
1966 */
1967 finish_task_switch(this_rq(), prev);
1968}
1969
1970/*
1971 * nr_running, nr_uninterruptible and nr_context_switches:
1972 *
1973 * externally visible scheduler statistics: current number of runnable
1974 * threads, current number of uninterruptible-sleeping threads, total
1975 * number of context switches performed since bootup.
1976 */
1977unsigned long nr_running(void)
1978{
1979 unsigned long i, sum = 0;
1980
1981 for_each_online_cpu(i)
1982 sum += cpu_rq(i)->nr_running;
1983
1984 return sum;
1985}
1986
1987unsigned long nr_uninterruptible(void)
1988{
1989 unsigned long i, sum = 0;
1990
1991 for_each_possible_cpu(i)
1992 sum += cpu_rq(i)->nr_uninterruptible;
1993
1994 /*
1995 * Since we read the counters lockless, it might be slightly
1996 * inaccurate. Do not allow it to go below zero though:
1997 */
1998 if (unlikely((long)sum < 0))
1999 sum = 0;
2000
2001 return sum;
2002}
2003
2004unsigned long long nr_context_switches(void)
2005{
2006 int i;
2007 unsigned long long sum = 0;
2008
2009 for_each_possible_cpu(i)
2010 sum += cpu_rq(i)->nr_switches;
2011
2012 return sum;
2013}
2014
2015unsigned long nr_iowait(void)
2016{
2017 unsigned long i, sum = 0;
2018
2019 for_each_possible_cpu(i)
2020 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2021
2022 return sum;
2023}
2024
2025unsigned long nr_iowait_cpu(int cpu)
2026{
2027 struct rq *this = cpu_rq(cpu);
2028 return atomic_read(&this->nr_iowait);
2029}
2030
2031unsigned long this_cpu_load(void)
2032{
2033 struct rq *this = this_rq();
2034 return this->cpu_load[0];
2035}
2036
2037
2038/*
2039 * Global load-average calculations
2040 *
2041 * We take a distributed and async approach to calculating the global load-avg
2042 * in order to minimize overhead.
2043 *
2044 * The global load average is an exponentially decaying average of nr_running +
2045 * nr_uninterruptible.
2046 *
2047 * Once every LOAD_FREQ:
2048 *
2049 * nr_active = 0;
2050 * for_each_possible_cpu(cpu)
2051 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2052 *
2053 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2054 *
2055 * For a number of reasons the above turns into the mess below:
2056 *
2057 * - for_each_possible_cpu() is prohibitively expensive on machines with
2058 * a serious number of cpus, therefore we need to take a distributed approach
2059 * to calculating nr_active.
2060 *
2061 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2062 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2063 *
2064 * So assuming nr_active := 0 when we start out -- true by definition -- we
2065 * can simply take per-cpu deltas and fold those into a global accumulate
2066 * to obtain the same result. See calc_load_fold_active().
2067 *
2068 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2069 * across the machine, we assume 10 ticks is sufficient time for every
2070 * cpu to have completed this task.
2071 *
2072 * This places an upper-bound on the IRQ-off latency of the machine. Then
2073 * again, being late doesn't lose the delta, it just wrecks the sample.
2074 *
2075 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2076 * this would add another cross-cpu cacheline miss and atomic operation
2077 * to the wakeup path. Instead we increment on whatever cpu the task ran
2078 * when it went into uninterruptible state and decrement on whatever cpu
2079 * did the wakeup. This means that only the sum of nr_uninterruptible over
2080 * all cpus yields the correct result.
2081 *
2082 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2083 */
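
/*
 * A worked instance of the folding identity above (illustrative only):
 * a cpu whose nr_active goes 0 -> 2 -> 5 -> 3 over successive LOAD_FREQ
 * samples contributes the deltas +2, +3, -2. Summing the deltas into the
 * global accumulator yields 3, exactly that cpu's current contribution,
 * without iterating all cpus at sample time.
 */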
2084
2085/* Variables and functions for calc_load */
2086static atomic_long_t calc_load_tasks;
2087static unsigned long calc_load_update;
2088unsigned long avenrun[3];
2089EXPORT_SYMBOL(avenrun); /* should be removed */
2090
2091/**
2092 * get_avenrun - get the load average array
2093 * @loads: pointer to dest load array
2094 * @offset: offset to add
2095 * @shift: shift count to shift the result left
2096 *
2097 * These values are estimates at best, so no need for locking.
2098 */
2099void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2100{
2101 loads[0] = (avenrun[0] + offset) << shift;
2102 loads[1] = (avenrun[1] + offset) << shift;
2103 loads[2] = (avenrun[2] + offset) << shift;
2104}
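
/*
 * A usage sketch (mirrors fs/proc/loadavg.c, shown here for illustration):
 * passing offset = FIXED_1/200 rounds to the nearest 1/100 before the
 * fixed-point value is split with the LOAD_INT()/LOAD_FRAC() helpers:
 *
 *	unsigned long avnrun[3];
 *
 *	get_avenrun(avnrun, FIXED_1/200, 0);
 *	pr_info("load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
 *		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
 *		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 *		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
 */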
2105
2106static long calc_load_fold_active(struct rq *this_rq)
2107{
2108 long nr_active, delta = 0;
2109
2110 nr_active = this_rq->nr_running;
2111 nr_active += (long) this_rq->nr_uninterruptible;
2112
2113 if (nr_active != this_rq->calc_load_active) {
2114 delta = nr_active - this_rq->calc_load_active;
2115 this_rq->calc_load_active = nr_active;
2116 }
2117
2118 return delta;
2119}
2120
2121/*
2122 * a1 = a0 * e + a * (1 - e)
2123 */
2124static unsigned long
2125calc_load(unsigned long load, unsigned long exp, unsigned long active)
2126{
2127 load *= exp;
2128 load += active * (FIXED_1 - exp);
2129 load += 1UL << (FSHIFT - 1);
2130 return load >> FSHIFT;
2131}
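
/*
 * A worked example of the update above (illustrative only), using the
 * <linux/sched.h> constants FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884
 * (1-minute average, LOAD_FREQ ~= 5s). Starting from load = 0 with two
 * runnable tasks, active = 2 * FIXED_1 = 4096 and:
 *
 *	load = (0 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11
 *	     = 672768 >> 11 = 328
 *
 * i.e. 328/2048 ~= 0.16, matching the continuous form 2 * (1 - e^(-5/60)).
 */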
2132
2133#ifdef CONFIG_NO_HZ
2134/*
2135 * Handle NO_HZ for the global load-average.
2136 *
2137 * Since the above described distributed algorithm to compute the global
2138 * load-average relies on per-cpu sampling from the tick, it is affected by
2139 * NO_HZ.
2140 *
2141 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2142 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2143 * when we read the global state.
2144 *
2145 * Obviously reality has to ruin such a delightfully simple scheme:
2146 *
2147 * - When we go NO_HZ idle during the window, we can negate our sample
2148 * contribution, causing under-accounting.
2149 *
2150 * We avoid this by keeping two idle-delta counters and flipping them
2151 * when the window starts, thus separating old and new NO_HZ load.
2152 *
2153 * The only trick is the slight shift in index flip for read vs write.
2154 *
2155 * 0s 5s 10s 15s
2156 * +10 +10 +10 +10
2157 * |-|-----------|-|-----------|-|-----------|-|
2158 * r:0 0 1 1 0 0 1 1 0
2159 * w:0 1 1 0 0 1 1 0 0
2160 *
2161 * This ensures we'll fold the old idle contribution in this window while
2162 * accumulating the new one.
2163 *
2164 * - When we wake up from NO_HZ idle during the window, we push up our
2165 * contribution, since we effectively move our sample point to a known
2166 * busy state.
2167 *
2168 * This is solved by pushing the window forward, and thus skipping the
2169 * sample, for this cpu (effectively using the idle-delta for this cpu which
2170 * was in effect at the time the window opened). This also solves the issue
2171 * of having to deal with a cpu having been in NOHZ idle for multiple
2172 * LOAD_FREQ intervals.
2173 *
2174 * When making the ILB scale, we should try to pull this in as well.
2175 */
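
/*
 * Illustrative trace of the flip (not part of the original comment): with
 * calc_load_idx == 0, a cpu that goes idle after the window has started
 * (jiffies >= calc_load_update) writes its delta into calc_load_idle[1],
 * while calc_global_load() still folds and clears calc_load_idle[0] for
 * the window being closed. Once calc_global_nohz() increments
 * calc_load_idx, readers and writers agree on slot 1 until the next window.
 */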
2176static atomic_long_t calc_load_idle[2];
2177static int calc_load_idx;
2178
2179static inline int calc_load_write_idx(void)
2180{
2181 int idx = calc_load_idx;
2182
2183 /*
2184 * See calc_global_nohz(): if we observe the new index, we also
2185 * need to observe the new update time.
2186 */
2187 smp_rmb();
2188
2189 /*
2190 * If the folding window started, make sure we start writing in the
2191 * next idle-delta.
2192 */
2193 if (!time_before(jiffies, calc_load_update))
2194 idx++;
2195
2196 return idx & 1;
2197}
2198
2199static inline int calc_load_read_idx(void)
2200{
2201 return calc_load_idx & 1;
2202}
2203
2204void calc_load_enter_idle(void)
2205{
2206 struct rq *this_rq = this_rq();
2207 long delta;
2208
2209 /*
2210 * We're going into NOHZ mode, if there's any pending delta, fold it
2211 * into the pending idle delta.
2212 */
2213 delta = calc_load_fold_active(this_rq);
2214 if (delta) {
2215 int idx = calc_load_write_idx();
2216 atomic_long_add(delta, &calc_load_idle[idx]);
2217 }
2218}
2219
2220void calc_load_exit_idle(void)
2221{
2222 struct rq *this_rq = this_rq();
2223
2224 /*
2225 * If we're still before the sample window, we're done.
2226 */
2227 if (time_before(jiffies, this_rq->calc_load_update))
2228 return;
2229
2230 /*
2231 * We woke up inside or after the sample window, which means we're already
2232 * accounted through the nohz accounting, so skip the entire deal and
2233 * sync up for the next window.
2234 */
2235 this_rq->calc_load_update = calc_load_update;
2236 if (time_before(jiffies, this_rq->calc_load_update + 10))
2237 this_rq->calc_load_update += LOAD_FREQ;
2238}
2239
2240static long calc_load_fold_idle(void)
2241{
2242 int idx = calc_load_read_idx();
2243 long delta = 0;
2244
2245 if (atomic_long_read(&calc_load_idle[idx]))
2246 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2247
2248 return delta;
2249}
2250
2251/**
2252 * fixed_power_int - compute: x^n, in O(log n) time
2253 *
2254 * @x: base of the power
2255 * @frac_bits: fractional bits of @x
2256 * @n: power to raise @x to.
2257 *
2258 * By exploiting the relation between the definition of the natural power
2259 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2260 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2261 * (where: n_i \elem {0, 1}, the binary vector representing n),
2262 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2263 * of course trivially computable in O(log_2 n), the length of our binary
2264 * vector.
2265 */
2266static unsigned long
2267fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2268{
2269 unsigned long result = 1UL << frac_bits;
2270
2271 if (n) for (;;) {
2272 if (n & 1) {
2273 result *= x;
2274 result += 1UL << (frac_bits - 1);
2275 result >>= frac_bits;
2276 }
2277 n >>= 1;
2278 if (!n)
2279 break;
2280 x *= x;
2281 x += 1UL << (frac_bits - 1);
2282 x >>= frac_bits;
2283 }
2284
2285 return result;
2286}
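
/*
 * A worked example (rounding steps omitted for clarity): for n = 5,
 * binary 101, the loop multiplies the result by x (bit 0 set), squares x
 * to x^2 and skips it (bit 1 clear), then squares again to x^4 and
 * multiplies it in (bit 2 set), giving x * x^4 = x^5 in three iterations
 * instead of five multiplications.
 */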
2287
2288/*
2289 * a1 = a0 * e + a * (1 - e)
2290 *
2291 * a2 = a1 * e + a * (1 - e)
2292 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2293 * = a0 * e^2 + a * (1 - e) * (1 + e)
2294 *
2295 * a3 = a2 * e + a * (1 - e)
2296 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2297 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2298 *
2299 * ...
2300 *
2301 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
2302 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2303 * = a0 * e^n + a * (1 - e^n)
2304 *
2305 * [1] application of the geometric series:
2306 *
2307 *
2308 * S_n := \Sum_{i=0}^{n} x^i = (1 - x^(n+1)) / (1 - x)
2309 *
2310 */
2311static unsigned long
2312calc_load_n(unsigned long load, unsigned long exp,
2313 unsigned long active, unsigned int n)
2314{
2315
2316 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2317}
2318
2319/*
2320 * NO_HZ can leave us missing all per-cpu ticks calling
2321 * calc_load_account_active(), but since an idle CPU folds its delta into
2322 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
2323 * in the pending idle delta if our idle period crossed a load cycle boundary.
2324 *
2325 * Once we've updated the global active value, we need to apply the exponential
2326 * weights adjusted to the number of cycles missed.
2327 */
2328static void calc_global_nohz(void)
2329{
2330 long delta, active, n;
2331
2332 if (!time_before(jiffies, calc_load_update + 10)) {
2333 /*
2334 * Catch-up, fold however many we are behind still
2335 */
2336 delta = jiffies - calc_load_update - 10;
2337 n = 1 + (delta / LOAD_FREQ);
2338
2339 active = atomic_long_read(&calc_load_tasks);
2340 active = active > 0 ? active * FIXED_1 : 0;
2341
2342 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2343 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2344 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2345
2346 calc_load_update += n * LOAD_FREQ;
2347 }
2348
2349 /*
2350 * Flip the idle index...
2351 *
2352 * Make sure we first write the new time then flip the index, so that
2353 * calc_load_write_idx() will see the new time when it reads the new
2354 * index, this avoids a double flip messing things up.
2355 */
2356 smp_wmb();
2357 calc_load_idx++;
2358}
2359#else /* !CONFIG_NO_HZ */
2360
2361static inline long calc_load_fold_idle(void) { return 0; }
2362static inline void calc_global_nohz(void) { }
2363
2364#endif /* CONFIG_NO_HZ */
2365
2366/*
2367 * calc_global_load - update the avenrun load estimates 10 ticks after the
2368 * CPUs have updated calc_load_tasks.
2369 */
2370void calc_global_load(unsigned long ticks)
2371{
2372 long active, delta;
2373
2374 if (time_before(jiffies, calc_load_update + 10))
2375 return;
2376
2377 /*
2378 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2379 */
2380 delta = calc_load_fold_idle();
2381 if (delta)
2382 atomic_long_add(delta, &calc_load_tasks);
2383
2384 active = atomic_long_read(&calc_load_tasks);
2385 active = active > 0 ? active * FIXED_1 : 0;
2386
2387 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2388 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2389 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2390
2391 calc_load_update += LOAD_FREQ;
2392
2393 /*
2394 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2395 */
2396 calc_global_nohz();
2397}
2398
2399/*
2400 * Called from update_cpu_load() to periodically update this CPU's
2401 * active count.
2402 */
2403static void calc_load_account_active(struct rq *this_rq)
2404{
2405 long delta;
2406
2407 if (time_before(jiffies, this_rq->calc_load_update))
2408 return;
2409
2410 delta = calc_load_fold_active(this_rq);
2411 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks);
2413
2414 this_rq->calc_load_update += LOAD_FREQ;
2415}
2416
2417/*
2418 * End of global load-average stuff
2419 */
2420
2421/*
2422 * The exact cpuload at various idx values, calculated at every tick would be
2423 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2424 *
2425 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2426 * on the nth tick, when the cpu may be busy, then we have:
2427 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2428 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2429 *
2430 * decay_load_missed() below does efficient calculation of
2431 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2432 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2433 *
2434 * The calculation is approximated on a 128 point scale.
2435 * degrade_zero_ticks is the number of ticks after which load at any
2436 * particular idx is approximated to be zero.
2437 * degrade_factor is a precomputed table, a row for each load idx.
2438 * Each column corresponds to the degradation factor for a power-of-two
2439 * number of ticks, based on the 128 point scale.
2440 * Example:
2441 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2442 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2443 *
2444 * With these power-of-2 load factors, we can degrade the load n times
2445 * by looking at the 1 bits in n and doing as many mult/shifts instead of
2446 * the n mult/shifts needed by the exact degradation.
2447 */
2448#define DEGRADE_SHIFT 7
2449static const unsigned char
2450 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2451static const unsigned char
2452 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2453 {0, 0, 0, 0, 0, 0, 0, 0},
2454 {64, 32, 8, 0, 0, 0, 0, 0},
2455 {96, 72, 40, 12, 1, 0, 0},
2456 {112, 98, 75, 43, 15, 1, 0},
2457 {120, 112, 98, 76, 45, 16, 2} };
2458
2459/*
2460 * Update cpu_load for any missed ticks due to tickless idle. The backlog
2461 * only builds up while the CPU is idle, so we just decay the old load
2462 * without adding any new load.
2463 */
2464static unsigned long
2465decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2466{
2467 int j = 0;
2468
2469 if (!missed_updates)
2470 return load;
2471
2472 if (missed_updates >= degrade_zero_ticks[idx])
2473 return 0;
2474
2475 if (idx == 1)
2476 return load >> missed_updates;
2477
2478 while (missed_updates) {
2479 if (missed_updates % 2)
2480 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2481
2482 missed_updates >>= 1;
2483 j++;
2484 }
2485 return load;
2486}
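
/*
 * A worked example (illustrative only): idx = 2, missed_updates = 10
 * (binary 1010). The loop applies degrade_factor[2][1] = 72 for the '2'
 * bit and degrade_factor[2][3] = 12 for the '8' bit, so
 * load ~= load * 72/128 * 12/128 ~= 0.053 * load, close to the exact
 * (3/4)^10 ~= 0.056 while using only two multiply/shift pairs.
 */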
2487
2488/*
2489 * Update rq->cpu_load[] statistics. This function is usually called every
2490 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2491 * every tick. We fix it up based on jiffies.
2492 */
2493static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2494 unsigned long pending_updates)
2495{
2496 int i, scale;
2497
2498 this_rq->nr_load_updates++;
2499
2500 /* Update our load: */
2501 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2502 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2503 unsigned long old_load, new_load;
2504
2505 /* scale is effectively 1 << i now, and >> i divides by scale */
2506
2507 old_load = this_rq->cpu_load[i];
2508 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2509 new_load = this_load;
2510 /*
2511 * Round up the averaging division if load is increasing. This
2512 * prevents us from getting stuck on 9 if the load is 10, for
2513 * example.
2514 */
2515 if (new_load > old_load)
2516 new_load += scale - 1;
2517
2518 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2519 }
2520
2521 sched_avg_update(this_rq);
2522}
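
/*
 * A worked example of the rounding above (illustrative only): for i = 2
 * (scale = 4), old_load = 9 and this_load = 10, the increase bumps
 * new_load to 13 and cpu_load[2] = (9 * 3 + 13) >> 2 = 10, whereas
 * without the "scale - 1" bump the result would stay stuck at
 * (9 * 3 + 10) >> 2 = 9.
 */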
2523
2524#ifdef CONFIG_NO_HZ
2525/*
2526 * There is no sane way to deal with nohz on smp when using jiffies because the
2527 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading,
2528 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2529 *
2530 * Therefore we cannot use the delta approach from the regular tick since that
2531 * would seriously skew the load calculation. However we'll make do for those
2532 * updates happening while idle (nohz_idle_balance) or coming out of idle
2533 * (tick_nohz_idle_exit).
2534 *
2535 * This means we might still be one tick off for nohz periods.
2536 */
2537
2538/*
2539 * Called from nohz_idle_balance() to update the load ratings before doing the
2540 * idle balance.
2541 */
2542void update_idle_cpu_load(struct rq *this_rq)
2543{
2544 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2545 unsigned long load = this_rq->load.weight;
2546 unsigned long pending_updates;
2547
2548 /*
2549 * bail if there's load or we're actually up-to-date.
2550 */
2551 if (load || curr_jiffies == this_rq->last_load_update_tick)
2552 return;
2553
2554 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2555 this_rq->last_load_update_tick = curr_jiffies;
2556
2557 __update_cpu_load(this_rq, load, pending_updates);
2558}
2559
2560/*
2561 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2562 */
2563void update_cpu_load_nohz(void)
2564{
2565 struct rq *this_rq = this_rq();
2566 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2567 unsigned long pending_updates;
2568
2569 if (curr_jiffies == this_rq->last_load_update_tick)
2570 return;
2571
2572 raw_spin_lock(&this_rq->lock);
2573 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2574 if (pending_updates) {
2575 this_rq->last_load_update_tick = curr_jiffies;
2576 /*
2577 * We were idle, which means a load of 0; the current load might be
2578 * !0 due to remote wakeups and the like.
2579 */
2580 __update_cpu_load(this_rq, 0, pending_updates);
2581 }
2582 raw_spin_unlock(&this_rq->lock);
2583}
2584#endif /* CONFIG_NO_HZ */
2585
2586/*
2587 * Called from scheduler_tick()
2588 */
2589static void update_cpu_load_active(struct rq *this_rq)
2590{
2591 /*
2592 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2593 */
2594 this_rq->last_load_update_tick = jiffies;
2595 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2596
2597 calc_load_account_active(this_rq);
2598}
2599
2600#ifdef CONFIG_SMP
2601
2602/*
2603 * sched_exec - execve() is a valuable balancing opportunity, because at
2604 * this point the task has the smallest effective memory and cache footprint.
2605 */
2606void sched_exec(void)
2607{
2608 struct task_struct *p = current;
2609 unsigned long flags;
2610 int dest_cpu;
2611
2612 raw_spin_lock_irqsave(&p->pi_lock, flags);
2613 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2614 if (dest_cpu == smp_processor_id())
2615 goto unlock;
2616
2617 if (likely(cpu_active(dest_cpu))) {
2618 struct migration_arg arg = { p, dest_cpu };
2619
2620 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2621 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2622 return;
2623 }
2624unlock:
2625 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2626}
2627
2628#endif
2629
2630DEFINE_PER_CPU(struct kernel_stat, kstat);
2631DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2632
2633EXPORT_PER_CPU_SYMBOL(kstat);
2634EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2635
2636/*
2637 * Return any ns on the sched_clock that have not yet been accounted in
2638 * @p in case that task is currently running.
2639 *
2640 * Called with task_rq_lock() held on @rq.
2641 */
2642static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2643{
2644 u64 ns = 0;
2645
2646 if (task_current(rq, p)) {
2647 update_rq_clock(rq);
2648 ns = rq->clock_task - p->se.exec_start;
2649 if ((s64)ns < 0)
2650 ns = 0;
2651 }
2652
2653 return ns;
2654}
2655
2656unsigned long long task_delta_exec(struct task_struct *p)
2657{
2658 unsigned long flags;
2659 struct rq *rq;
2660 u64 ns = 0;
2661
2662 rq = task_rq_lock(p, &flags);
2663 ns = do_task_delta_exec(p, rq);
2664 task_rq_unlock(rq, p, &flags);
2665
2666 return ns;
2667}
2668
2669/*
2670 * Return accounted runtime for the task.
2671 * In case the task is currently running, return the runtime plus current's
2672 * pending runtime that has not been accounted yet.
2673 */
2674unsigned long long task_sched_runtime(struct task_struct *p)
2675{
2676 unsigned long flags;
2677 struct rq *rq;
2678 u64 ns = 0;
2679
2680 rq = task_rq_lock(p, &flags);
2681 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2682 task_rq_unlock(rq, p, &flags);
2683
2684 return ns;
2685}
2686
2687/*
2688 * This function gets called by the timer code, with HZ frequency.
2689 * We call it with interrupts disabled.
2690 */
2691void scheduler_tick(void)
2692{
2693 int cpu = smp_processor_id();
2694 struct rq *rq = cpu_rq(cpu);
2695 struct task_struct *curr = rq->curr;
2696
2697 sched_clock_tick();
2698
2699 raw_spin_lock(&rq->lock);
2700 update_rq_clock(rq);
2701 update_cpu_load_active(rq);
2702 curr->sched_class->task_tick(rq, curr, 0);
2703 raw_spin_unlock(&rq->lock);
2704
2705 perf_event_task_tick();
2706
2707#ifdef CONFIG_SMP
2708 rq->idle_balance = idle_cpu(cpu);
2709 trigger_load_balance(rq, cpu);
2710#endif
2711}
2712
2713notrace unsigned long get_parent_ip(unsigned long addr)
2714{
2715 if (in_lock_functions(addr)) {
2716 addr = CALLER_ADDR2;
2717 if (in_lock_functions(addr))
2718 addr = CALLER_ADDR3;
2719 }
2720 return addr;
2721}
2722
2723#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2724 defined(CONFIG_PREEMPT_TRACER))
2725
2726void __kprobes add_preempt_count(int val)
2727{
2728#ifdef CONFIG_DEBUG_PREEMPT
2729 /*
2730 * Underflow?
2731 */
2732 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2733 return;
2734#endif
2735 preempt_count() += val;
2736#ifdef CONFIG_DEBUG_PREEMPT
2737 /*
2738 * Spinlock count overflowing soon?
2739 */
2740 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2741 PREEMPT_MASK - 10);
2742#endif
2743 if (preempt_count() == val)
2744 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2745}
2746EXPORT_SYMBOL(add_preempt_count);
2747
2748void __kprobes sub_preempt_count(int val)
2749{
2750#ifdef CONFIG_DEBUG_PREEMPT
2751 /*
2752 * Underflow?
2753 */
2754 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2755 return;
2756 /*
2757 * Is the spinlock portion underflowing?
2758 */
2759 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2760 !(preempt_count() & PREEMPT_MASK)))
2761 return;
2762#endif
2763
2764 if (preempt_count() == val)
2765 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2766 preempt_count() -= val;
2767}
2768EXPORT_SYMBOL(sub_preempt_count);
2769
2770#endif
2771
2772/*
2773 * Print scheduling while atomic bug:
2774 */
2775static noinline void __schedule_bug(struct task_struct *prev)
2776{
2777 if (oops_in_progress)
2778 return;
2779
2780 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2781 prev->comm, prev->pid, preempt_count());
2782
2783 debug_show_held_locks(prev);
2784 print_modules();
2785 if (irqs_disabled())
2786 print_irqtrace_events(prev);
2787 dump_stack();
2788 add_taint(TAINT_WARN);
2789}
2790
2791/*
2792 * Various schedule()-time debugging checks and statistics:
2793 */
2794static inline void schedule_debug(struct task_struct *prev)
2795{
2796 /*
2797 * Test if we are atomic. Since do_exit() needs to call into
2798 * schedule() atomically, we ignore that path for now.
2799 * Otherwise, whine if we are scheduling when we should not be.
2800 */
2801 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
2802 __schedule_bug(prev);
2803 rcu_sleep_check();
2804
2805 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2806
2807 schedstat_inc(this_rq(), sched_count);
2808}
2809
2810static void put_prev_task(struct rq *rq, struct task_struct *prev)
2811{
2812 if (prev->on_rq || rq->skip_clock_update < 0)
2813 update_rq_clock(rq);
2814 prev->sched_class->put_prev_task(rq, prev);
2815}
2816
2817/*
2818 * Pick up the highest-prio task:
2819 */
2820static inline struct task_struct *
2821pick_next_task(struct rq *rq)
2822{
2823 const struct sched_class *class;
2824 struct task_struct *p;
2825
2826 /*
2827 * Optimization: we know that if all tasks are in
2828 * the fair class we can call that function directly:
2829 */
2830 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
2831 p = fair_sched_class.pick_next_task(rq);
2832 if (likely(p))
2833 return p;
2834 }
2835
2836 for_each_class(class) {
2837 p = class->pick_next_task(rq);
2838 if (p)
2839 return p;
2840 }
2841
2842 BUG(); /* the idle class will always have a runnable task */
2843}
2844
2845/*
2846 * __schedule() is the main scheduler function.
2847 *
2848 * The main means of driving the scheduler and thus entering this function are:
2849 *
2850 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2851 *
2852 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2853 * paths. For example, see arch/x86/entry_64.S.
2854 *
2855 * To drive preemption between tasks, the scheduler sets the flag in timer
2856 * interrupt handler scheduler_tick().
2857 *
2858 * 3. Wakeups don't really cause entry into schedule(). They add a
2859 * task to the run-queue and that's it.
2860 *
2861 * Now, if the new task added to the run-queue preempts the current
2862 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2863 * called on the nearest possible occasion:
2864 *
2865 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2866 *
2867 * - in syscall or exception context, at the next outermost
2868 * preempt_enable(). (this might be as soon as the wake_up()'s
2869 * spin_unlock()!)
2870 *
2871 * - in IRQ context, return from interrupt-handler to
2872 * preemptible context
2873 *
2874 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2875 * then at the next:
2876 *
2877 * - cond_resched() call
2878 * - explicit schedule() call
2879 * - return from syscall or exception to user-space
2880 * - return from interrupt-handler to user-space
2881 */
2882static void __sched __schedule(void)
2883{
2884 struct task_struct *prev, *next;
2885 unsigned long *switch_count;
2886 struct rq *rq;
2887 int cpu;
2888
2889need_resched:
2890 preempt_disable();
2891 cpu = smp_processor_id();
2892 rq = cpu_rq(cpu);
2893 rcu_note_context_switch(cpu);
2894 prev = rq->curr;
2895
2896 schedule_debug(prev);
2897
2898 if (sched_feat(HRTICK))
2899 hrtick_clear(rq);
2900
2901 raw_spin_lock_irq(&rq->lock);
2902
2903 switch_count = &prev->nivcsw;
2904 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2905 if (unlikely(signal_pending_state(prev->state, prev))) {
2906 prev->state = TASK_RUNNING;
2907 } else {
2908 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2909 prev->on_rq = 0;
2910
2911 /*
2912 * If a worker went to sleep, notify and ask workqueue
2913 * whether it wants to wake up a task to maintain
2914 * concurrency.
2915 */
2916 if (prev->flags & PF_WQ_WORKER) {
2917 struct task_struct *to_wakeup;
2918
2919 to_wakeup = wq_worker_sleeping(prev, cpu);
2920 if (to_wakeup)
2921 try_to_wake_up_local(to_wakeup);
2922 }
2923 }
2924 switch_count = &prev->nvcsw;
2925 }
2926
2927 pre_schedule(rq, prev);
2928
2929 if (unlikely(!rq->nr_running))
2930 idle_balance(cpu, rq);
2931
2932 put_prev_task(rq, prev);
2933 next = pick_next_task(rq);
2934 clear_tsk_need_resched(prev);
2935 rq->skip_clock_update = 0;
2936
2937 if (likely(prev != next)) {
2938 rq->nr_switches++;
2939 rq->curr = next;
2940 ++*switch_count;
2941
2942 context_switch(rq, prev, next); /* unlocks the rq */
2943 /*
2944 * The context switch has flipped the stack from under us
2945 * and restored the local variables which were saved when
2946 * this task called schedule() in the past. prev == current
2947 * is still correct, but it can be moved to another cpu/rq.
2948 */
2949 cpu = smp_processor_id();
2950 rq = cpu_rq(cpu);
2951 } else
2952 raw_spin_unlock_irq(&rq->lock);
2953
2954 post_schedule(rq);
2955
2956 sched_preempt_enable_no_resched();
2957 if (need_resched())
2958 goto need_resched;
2959}
2960
2961static inline void sched_submit_work(struct task_struct *tsk)
2962{
2963 if (!tsk->state || tsk_is_pi_blocked(tsk))
2964 return;
2965 /*
2966 * If we are going to sleep and we have plugged IO queued,
2967 * make sure to submit it to avoid deadlocks.
2968 */
2969 if (blk_needs_flush_plug(tsk))
2970 blk_schedule_flush_plug(tsk);
2971}
2972
2973asmlinkage void __sched schedule(void)
2974{
2975 struct task_struct *tsk = current;
2976
2977 sched_submit_work(tsk);
2978 __schedule();
2979}
2980EXPORT_SYMBOL(schedule);
2981
2982#ifdef CONFIG_CONTEXT_TRACKING
2983asmlinkage void __sched schedule_user(void)
2984{
2985 /*
2986 * If we come here after a random call to set_need_resched(),
2987 * or we have been woken up remotely but the IPI has not yet arrived,
2988 * we haven't yet exited the RCU idle mode. Do it here manually until
2989 * we find a better solution.
2990 */
2991 user_exit();
2992 schedule();
2993 user_enter();
2994}
2995#endif
2996
2997/**
2998 * schedule_preempt_disabled - called with preemption disabled
2999 *
3000 * Returns with preemption disabled. Note: preempt_count must be 1
3001 */
3002void __sched schedule_preempt_disabled(void)
3003{
3004 sched_preempt_enable_no_resched();
3005 schedule();
3006 preempt_disable();
3007}
3008
3009#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3010
3011static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3012{
3013 if (lock->owner != owner)
3014 return false;
3015
3016 /*
3017 * Ensure we emit the owner->on_cpu dereference _after_ checking that
3018 * lock->owner still matches owner. If that fails, owner might
3019 * point to free()d memory; if it still matches, the rcu_read_lock()
3020 * ensures the memory stays valid.
3021 */
3022 barrier();
3023
3024 return owner->on_cpu;
3025}
3026
3027/*
3028 * Look out! "owner" is an entirely speculative pointer
3029 * access and not reliable.
3030 */
3031int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3032{
3033 if (!sched_feat(OWNER_SPIN))
3034 return 0;
3035
3036 rcu_read_lock();
3037 while (owner_running(lock, owner)) {
3038 if (need_resched())
3039 break;
3040
3041 arch_mutex_cpu_relax();
3042 }
3043 rcu_read_unlock();
3044
3045 /*
3046 * We break out of the loop above on need_resched() and when the
3047 * owner changed, which is a sign of heavy contention. Return
3048 * success only when lock->owner is NULL.
3049 */
3050 return lock->owner == NULL;
3051}
3052#endif
3053
3054#ifdef CONFIG_PREEMPT
3055/*
3056 * This is the entry point to schedule() from in-kernel preemption
3057 * off of preempt_enable(). Kernel preemption off the return-from-interrupt
3058 * path happens in preempt_schedule_irq() instead, which calls the scheduler directly.
3059 */
3060asmlinkage void __sched notrace preempt_schedule(void)
3061{
3062 struct thread_info *ti = current_thread_info();
3063
3064 /*
3065 * If there is a non-zero preempt_count or interrupts are disabled,
3066 * we do not want to preempt the current task. Just return..
3067 */
3068 if (likely(ti->preempt_count || irqs_disabled()))
3069 return;
3070
3071 do {
3072 add_preempt_count_notrace(PREEMPT_ACTIVE);
3073 __schedule();
3074 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3075
3076 /*
3077 * Check again in case we missed a preemption opportunity
3078 * between schedule and now.
3079 */
3080 barrier();
3081 } while (need_resched());
3082}
3083EXPORT_SYMBOL(preempt_schedule);
3084
3085/*
3086 * This is the entry point to schedule() from kernel preemption
3087 * off of irq context.
3088 * Note that this is called and returns with irqs disabled. This
3089 * protects us against recursive calls from irq context.
3090 */
3091asmlinkage void __sched preempt_schedule_irq(void)
3092{
3093 struct thread_info *ti = current_thread_info();
3094
3095 /* Catch callers which need to be fixed */
3096 BUG_ON(ti->preempt_count || !irqs_disabled());
3097
3098 user_exit();
3099 do {
3100 add_preempt_count(PREEMPT_ACTIVE);
3101 local_irq_enable();
3102 __schedule();
3103 local_irq_disable();
3104 sub_preempt_count(PREEMPT_ACTIVE);
3105
3106 /*
3107 * Check again in case we missed a preemption opportunity
3108 * between schedule and now.
3109 */
3110 barrier();
3111 } while (need_resched());
3112}
3113
3114#endif /* CONFIG_PREEMPT */
3115
3116int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3117 void *key)
3118{
3119 return try_to_wake_up(curr->private, mode, wake_flags);
3120}
3121EXPORT_SYMBOL(default_wake_function);
3122
3123/*
3124 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3125 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3126 * number) then we wake all the non-exclusive tasks and one exclusive task.
3127 *
3128 * There are circumstances in which we can try to wake a task which has already
3129 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3130 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3131 */
3132static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3133 int nr_exclusive, int wake_flags, void *key)
3134{
3135 wait_queue_t *curr, *next;
3136
3137 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3138 unsigned flags = curr->flags;
3139
3140 if (curr->func(curr, mode, wake_flags, key) &&
3141 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3142 break;
3143 }
3144}
3145
3146/**
3147 * __wake_up - wake up threads blocked on a waitqueue.
3148 * @q: the waitqueue
3149 * @mode: which threads
3150 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3151 * @key: is directly passed to the wakeup function
3152 *
3153 * It may be assumed that this function implies a write memory barrier before
3154 * changing the task state if and only if any tasks are woken up.
3155 */
3156void __wake_up(wait_queue_head_t *q, unsigned int mode,
3157 int nr_exclusive, void *key)
3158{
3159 unsigned long flags;
3160
3161 spin_lock_irqsave(&q->lock, flags);
3162 __wake_up_common(q, mode, nr_exclusive, 0, key);
3163 spin_unlock_irqrestore(&q->lock, flags);
3164}
3165EXPORT_SYMBOL(__wake_up);
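
/*
 * A usage sketch, for illustration only: __wake_up() is normally reached
 * through the wake_up*() macros in <linux/wait.h>. A typical pairing, with
 * 'my_wq' and 'data_ready' as placeholder names:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *
 *	// consumer
 *	wait_event_interruptible(my_wq, data_ready);
 *
 *	// producer
 *	data_ready = true;
 *	wake_up(&my_wq);	// __wake_up(&my_wq, TASK_NORMAL, 1, NULL)
 */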
3166
3167/*
3168 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3169 */
3170void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3171{
3172 __wake_up_common(q, mode, nr, 0, NULL);
3173}
3174EXPORT_SYMBOL_GPL(__wake_up_locked);
3175
3176void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3177{
3178 __wake_up_common(q, mode, 1, 0, key);
3179}
3180EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3181
3182/**
3183 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3184 * @q: the waitqueue
3185 * @mode: which threads
3186 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3187 * @key: opaque value to be passed to wakeup targets
3188 *
3189 * The sync wakeup differs in that the waker knows that it will schedule
3190 * away soon, so while the target thread will be woken up, it will not
3191 * be migrated to another CPU - ie. the two threads are 'synchronized'
3192 * with each other. This can prevent needless bouncing between CPUs.
3193 *
3194 * On UP it can prevent extra preemption.
3195 *
3196 * It may be assumed that this function implies a write memory barrier before
3197 * changing the task state if and only if any tasks are woken up.
3198 */
3199void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3200 int nr_exclusive, void *key)
3201{
3202 unsigned long flags;
3203 int wake_flags = WF_SYNC;
3204
3205 if (unlikely(!q))
3206 return;
3207
3208 if (unlikely(!nr_exclusive))
3209 wake_flags = 0;
3210
3211 spin_lock_irqsave(&q->lock, flags);
3212 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3213 spin_unlock_irqrestore(&q->lock, flags);
3214}
3215EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3216
3217/*
3218 * __wake_up_sync - see __wake_up_sync_key()
3219 */
3220void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3221{
3222 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3223}
3224EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3225
3226/**
3227 * complete: - signals a single thread waiting on this completion
3228 * @x: holds the state of this particular completion
3229 *
3230 * This will wake up a single thread waiting on this completion. Threads will be
3231 * awakened in the same order in which they were queued.
3232 *
3233 * See also complete_all(), wait_for_completion() and related routines.
3234 *
3235 * It may be assumed that this function implies a write memory barrier before
3236 * changing the task state if and only if any tasks are woken up.
3237 */
3238void complete(struct completion *x)
3239{
3240 unsigned long flags;
3241
3242 spin_lock_irqsave(&x->wait.lock, flags);
3243 x->done++;
3244 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3245 spin_unlock_irqrestore(&x->wait.lock, flags);
3246}
3247EXPORT_SYMBOL(complete);
3248
3249/**
3250 * complete_all: - signals all threads waiting on this completion
3251 * @x: holds the state of this particular completion
3252 *
3253 * This will wake up all threads waiting on this particular completion event.
3254 *
3255 * It may be assumed that this function implies a write memory barrier before
3256 * changing the task state if and only if any tasks are woken up.
3257 */
3258void complete_all(struct completion *x)
3259{
3260 unsigned long flags;
3261
3262 spin_lock_irqsave(&x->wait.lock, flags);
3263 x->done += UINT_MAX/2;
3264 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3265 spin_unlock_irqrestore(&x->wait.lock, flags);
3266}
3267EXPORT_SYMBOL(complete_all);
3268
3269static inline long __sched
3270do_wait_for_common(struct completion *x, long timeout, int state)
3271{
3272 if (!x->done) {
3273 DECLARE_WAITQUEUE(wait, current);
3274
3275 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3276 do {
3277 if (signal_pending_state(state, current)) {
3278 timeout = -ERESTARTSYS;
3279 break;
3280 }
3281 __set_current_state(state);
3282 spin_unlock_irq(&x->wait.lock);
3283 timeout = schedule_timeout(timeout);
3284 spin_lock_irq(&x->wait.lock);
3285 } while (!x->done && timeout);
3286 __remove_wait_queue(&x->wait, &wait);
3287 if (!x->done)
3288 return timeout;
3289 }
3290 x->done--;
3291 return timeout ?: 1;
3292}
3293
3294static long __sched
3295wait_for_common(struct completion *x, long timeout, int state)
3296{
3297 might_sleep();
3298
3299 spin_lock_irq(&x->wait.lock);
3300 timeout = do_wait_for_common(x, timeout, state);
3301 spin_unlock_irq(&x->wait.lock);
3302 return timeout;
3303}
3304
3305/**
3306 * wait_for_completion: - waits for completion of a task
3307 * @x: holds the state of this particular completion
3308 *
3309 * This waits to be signaled for completion of a specific task. It is NOT
3310 * interruptible and there is no timeout.
3311 *
3312 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3313 * and interrupt capability. Also see complete().
3314 */
3315void __sched wait_for_completion(struct completion *x)
3316{
3317 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3318}
3319EXPORT_SYMBOL(wait_for_completion);
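
/*
 * A usage sketch, for illustration only ('done' and start_my_work() are
 * placeholders): the usual pairing of complete() with wait_for_completion()
 * when waiting for asynchronous work to finish:
 *
 *	DECLARE_COMPLETION_ONSTACK(done);
 *
 *	start_my_work(&done);		// worker calls complete(&done) when finished
 *	wait_for_completion(&done);	// sleeps in TASK_UNINTERRUPTIBLE until then
 */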
3320
3321/**
3322 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
3323 * @x: holds the state of this particular completion
3324 * @timeout: timeout value in jiffies
3325 *
3326 * This waits for either a completion of a specific task to be signaled or for a
3327 * specified timeout to expire. The timeout is in jiffies. It is not
3328 * interruptible.
3329 *
3330 * The return value is 0 if timed out, and positive (at least 1, or number of
3331 * jiffies left till timeout) if completed.
3332 */
3333unsigned long __sched
3334wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3335{
3336 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3337}
3338EXPORT_SYMBOL(wait_for_completion_timeout);
3339
3340/**
3341 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3342 * @x: holds the state of this particular completion
3343 *
3344 * This waits for completion of a specific task to be signaled. It is
3345 * interruptible.
3346 *
3347 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3348 */
3349int __sched wait_for_completion_interruptible(struct completion *x)
3350{
3351 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3352 if (t == -ERESTARTSYS)
3353 return t;
3354 return 0;
3355}
3356EXPORT_SYMBOL(wait_for_completion_interruptible);
3357
3358/**
3359 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
3360 * @x: holds the state of this particular completion
3361 * @timeout: timeout value in jiffies
3362 *
3363 * This waits for either a completion of a specific task to be signaled or for a
3364 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3365 *
3366 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3367 * positive (at least 1, or number of jiffies left till timeout) if completed.
3368 */
3369long __sched
3370wait_for_completion_interruptible_timeout(struct completion *x,
3371 unsigned long timeout)
3372{
3373 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3374}
3375EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3376
3377/**
3378 * wait_for_completion_killable: - waits for completion of a task (killable)
3379 * @x: holds the state of this particular completion
3380 *
3381 * This waits to be signaled for completion of a specific task. It can be
3382 * interrupted by a kill signal.
3383 *
3384 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3385 */
3386int __sched wait_for_completion_killable(struct completion *x)
3387{
3388 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3389 if (t == -ERESTARTSYS)
3390 return t;
3391 return 0;
3392}
3393EXPORT_SYMBOL(wait_for_completion_killable);
3394
3395/**
3396 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
3397 * @x: holds the state of this particular completion
3398 * @timeout: timeout value in jiffies
3399 *
3400 * This waits for either a completion of a specific task to be
3401 * signaled or for a specified timeout to expire. It can be
3402 * interrupted by a kill signal. The timeout is in jiffies.
3403 *
3404 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3405 * positive (at least 1, or number of jiffies left till timeout) if completed.
3406 */
3407long __sched
3408wait_for_completion_killable_timeout(struct completion *x,
3409 unsigned long timeout)
3410{
3411 return wait_for_common(x, timeout, TASK_KILLABLE);
3412}
3413EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3414
3415/**
3416 * try_wait_for_completion - try to decrement a completion without blocking
3417 * @x: completion structure
3418 *
3419 * Returns: 0 if a decrement cannot be done without blocking
3420 * 1 if a decrement succeeded.
3421 *
3422 * If a completion is being used as a counting completion,
3423 * attempt to decrement the counter without blocking. This
3424 * enables us to avoid waiting if the resource the completion
3425 * is protecting is not available.
3426 */
3427bool try_wait_for_completion(struct completion *x)
3428{
3429 unsigned long flags;
3430 int ret = 1;
3431
3432 spin_lock_irqsave(&x->wait.lock, flags);
3433 if (!x->done)
3434 ret = 0;
3435 else
3436 x->done--;
3437 spin_unlock_irqrestore(&x->wait.lock, flags);
3438 return ret;
3439}
3440EXPORT_SYMBOL(try_wait_for_completion);
3441
3442/**
3443 * completion_done - Test to see if a completion has any waiters
3444 * @x: completion structure
3445 *
3446 * Returns: 0 if there are waiters (wait_for_completion() in progress)
3447 * 1 if there are no waiters.
3448 *
3449 */
3450bool completion_done(struct completion *x)
3451{
3452 unsigned long flags;
3453 int ret = 1;
3454
3455 spin_lock_irqsave(&x->wait.lock, flags);
3456 if (!x->done)
3457 ret = 0;
3458 spin_unlock_irqrestore(&x->wait.lock, flags);
3459 return ret;
3460}
3461EXPORT_SYMBOL(completion_done);
3462
3463static long __sched
3464sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3465{
3466 unsigned long flags;
3467 wait_queue_t wait;
3468
3469 init_waitqueue_entry(&wait, current);
3470
3471 __set_current_state(state);
3472
3473 spin_lock_irqsave(&q->lock, flags);
3474 __add_wait_queue(q, &wait);
3475 spin_unlock(&q->lock);
3476 timeout = schedule_timeout(timeout);
3477 spin_lock_irq(&q->lock);
3478 __remove_wait_queue(q, &wait);
3479 spin_unlock_irqrestore(&q->lock, flags);
3480
3481 return timeout;
3482}
3483
3484void __sched interruptible_sleep_on(wait_queue_head_t *q)
3485{
3486 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3487}
3488EXPORT_SYMBOL(interruptible_sleep_on);
3489
3490long __sched
3491interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3492{
3493 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3494}
3495EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3496
3497void __sched sleep_on(wait_queue_head_t *q)
3498{
3499 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3500}
3501EXPORT_SYMBOL(sleep_on);
3502
3503long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3504{
3505 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3506}
3507EXPORT_SYMBOL(sleep_on_timeout);
3508
3509#ifdef CONFIG_RT_MUTEXES
3510
3511/*
3512 * rt_mutex_setprio - set the current priority of a task
3513 * @p: task
3514 * @prio: prio value (kernel-internal form)
3515 *
3516 * This function changes the 'effective' priority of a task. It does
3517 * not touch ->normal_prio like __setscheduler().
3518 *
3519 * Used by the rt_mutex code to implement priority inheritance logic.
3520 */
3521void rt_mutex_setprio(struct task_struct *p, int prio)
3522{
3523 int oldprio, on_rq, running;
3524 struct rq *rq;
3525 const struct sched_class *prev_class;
3526
3527 BUG_ON(prio < 0 || prio > MAX_PRIO);
3528
3529 rq = __task_rq_lock(p);
3530
3531 /*
3532 * Idle task boosting is a no-no in general. There is one
3533 * exception, when PREEMPT_RT and NOHZ is active:
3534 *
3535 * The idle task calls get_next_timer_interrupt() and holds
3536 * the timer wheel base->lock on the CPU and another CPU wants
3537 * to access the timer (probably to cancel it). We can safely
3538 * ignore the boosting request, as the idle CPU runs this code
3539 * with interrupts disabled and will complete the lock
3540 * protected section without being interrupted. So there is no
3541 * real need to boost.
3542 */
3543 if (unlikely(p == rq->idle)) {
3544 WARN_ON(p != rq->curr);
3545 WARN_ON(p->pi_blocked_on);
3546 goto out_unlock;
3547 }
3548
3549 trace_sched_pi_setprio(p, prio);
3550 oldprio = p->prio;
3551 prev_class = p->sched_class;
3552 on_rq = p->on_rq;
3553 running = task_current(rq, p);
3554 if (on_rq)
3555 dequeue_task(rq, p, 0);
3556 if (running)
3557 p->sched_class->put_prev_task(rq, p);
3558
3559 if (rt_prio(prio))
3560 p->sched_class = &rt_sched_class;
3561 else
3562 p->sched_class = &fair_sched_class;
3563
3564 p->prio = prio;
3565
3566 if (running)
3567 p->sched_class->set_curr_task(rq);
3568 if (on_rq)
3569 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3570
3571 check_class_changed(rq, p, prev_class, oldprio);
3572out_unlock:
3573 __task_rq_unlock(rq);
3574}
3575#endif
3576void set_user_nice(struct task_struct *p, long nice)
3577{
3578 int old_prio, delta, on_rq;
3579 unsigned long flags;
3580 struct rq *rq;
3581
3582 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3583 return;
3584 /*
3585 * We have to be careful, if called from sys_setpriority(),
3586 * the task might be in the middle of scheduling on another CPU.
3587 */
3588 rq = task_rq_lock(p, &flags);
3589 /*
3590 * The RT priorities are set via sched_setscheduler(), but we still
3591 * allow the 'normal' nice value to be set - but as expected
3592 * it won't have any effect on scheduling while the task remains
3593 * SCHED_FIFO/SCHED_RR:
3594 */
3595 if (task_has_rt_policy(p)) {
3596 p->static_prio = NICE_TO_PRIO(nice);
3597 goto out_unlock;
3598 }
3599 on_rq = p->on_rq;
3600 if (on_rq)
3601 dequeue_task(rq, p, 0);
3602
3603 p->static_prio = NICE_TO_PRIO(nice);
3604 set_load_weight(p);
3605 old_prio = p->prio;
3606 p->prio = effective_prio(p);
3607 delta = p->prio - old_prio;
3608
3609 if (on_rq) {
3610 enqueue_task(rq, p, 0);
3611 /*
3612 * If the task increased its priority or is running and
3613 * lowered its priority, then reschedule its CPU:
3614 */
3615 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3616 resched_task(rq->curr);
3617 }
3618out_unlock:
3619 task_rq_unlock(rq, p, &flags);
3620}
3621EXPORT_SYMBOL(set_user_nice);
3622
3623/*
3624 * can_nice - check if a task can reduce its nice value
3625 * @p: task
3626 * @nice: nice value
3627 */
3628int can_nice(const struct task_struct *p, const int nice)
3629{
3630 /* convert nice value [19,-20] to rlimit style value [1,40] */
3631 int nice_rlim = 20 - nice;
3632
3633 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3634 capable(CAP_SYS_NICE));
3635}
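/*
 * Worked example for the conversion above (editorial addition): a request
 * for nice -5 gives nice_rlim = 20 - (-5) = 25, so the change is permitted
 * only if RLIMIT_NICE is at least 25 or the task has CAP_SYS_NICE; the most
 * favourable value, nice -20, therefore needs RLIMIT_NICE = 40.
 */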
3636
3637#ifdef __ARCH_WANT_SYS_NICE
3638
3639/*
3640 * sys_nice - change the priority of the current process.
3641 * @increment: priority increment
3642 *
3643 * sys_setpriority is a more generic, but much slower function that
3644 * does similar things.
3645 */
3646SYSCALL_DEFINE1(nice, int, increment)
3647{
3648 long nice, retval;
3649
3650 /*
3651 * Setpriority might change our priority at the same moment.
3652 * We don't have to worry. Conceptually one call occurs first
3653 * and we have a single winner.
3654 */
3655 if (increment < -40)
3656 increment = -40;
3657 if (increment > 40)
3658 increment = 40;
3659
3660 nice = TASK_NICE(current) + increment;
3661 if (nice < -20)
3662 nice = -20;
3663 if (nice > 19)
3664 nice = 19;
3665
3666 if (increment < 0 && !can_nice(current, nice))
3667 return -EPERM;
3668
3669 retval = security_task_setnice(current, nice);
3670 if (retval)
3671 return retval;
3672
3673 set_user_nice(current, nice);
3674 return 0;
3675}
3676
3677#endif
3678
3679/**
3680 * task_prio - return the priority value of a given task.
3681 * @p: the task in question.
3682 *
3683 * This is the priority value as seen by users in /proc.
3684 * Normal tasks map to the range 0..39 (nice -20..19); RT tasks map
3685 * to negative values, -2 down to -100.
3686 */
3687int task_prio(const struct task_struct *p)
3688{
3689 return p->prio - MAX_RT_PRIO;
3690}
3691
3692/**
3693 * task_nice - return the nice value of a given task.
3694 * @p: the task in question.
3695 */
3696int task_nice(const struct task_struct *p)
3697{
3698 return TASK_NICE(p);
3699}
3700EXPORT_SYMBOL(task_nice);
3701
3702/**
3703 * idle_cpu - is a given cpu idle currently?
3704 * @cpu: the processor in question.
3705 */
3706int idle_cpu(int cpu)
3707{
3708 struct rq *rq = cpu_rq(cpu);
3709
3710 if (rq->curr != rq->idle)
3711 return 0;
3712
3713 if (rq->nr_running)
3714 return 0;
3715
3716#ifdef CONFIG_SMP
3717 if (!llist_empty(&rq->wake_list))
3718 return 0;
3719#endif
3720
3721 return 1;
3722}
3723
3724/**
3725 * idle_task - return the idle task for a given cpu.
3726 * @cpu: the processor in question.
3727 */
3728struct task_struct *idle_task(int cpu)
3729{
3730 return cpu_rq(cpu)->idle;
3731}
3732
3733/**
3734 * find_process_by_pid - find a process with a matching PID value.
3735 * @pid: the pid in question.
3736 */
3737static struct task_struct *find_process_by_pid(pid_t pid)
3738{
3739 return pid ? find_task_by_vpid(pid) : current;
3740}
3741
3742/* Actually do priority change: must hold rq lock. */
3743static void
3744__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3745{
3746 p->policy = policy;
3747 p->rt_priority = prio;
3748 p->normal_prio = normal_prio(p);
3749 /* we are holding p->pi_lock already */
3750 p->prio = rt_mutex_getprio(p);
3751 if (rt_prio(p->prio))
3752 p->sched_class = &rt_sched_class;
3753 else
3754 p->sched_class = &fair_sched_class;
3755 set_load_weight(p);
3756}
3757
3758/*
3759 * Check whether the target process has a UID that matches the current process's.
3760 */
3761static bool check_same_owner(struct task_struct *p)
3762{
3763 const struct cred *cred = current_cred(), *pcred;
3764 bool match;
3765
3766 rcu_read_lock();
3767 pcred = __task_cred(p);
3768 match = (uid_eq(cred->euid, pcred->euid) ||
3769 uid_eq(cred->euid, pcred->uid));
3770 rcu_read_unlock();
3771 return match;
3772}
3773
3774static int __sched_setscheduler(struct task_struct *p, int policy,
3775 const struct sched_param *param, bool user)
3776{
3777 int retval, oldprio, oldpolicy = -1, on_rq, running;
3778 unsigned long flags;
3779 const struct sched_class *prev_class;
3780 struct rq *rq;
3781 int reset_on_fork;
3782
3783 /* may grab non-irq protected spin_locks */
3784 BUG_ON(in_interrupt());
3785recheck:
3786 /* double check policy once rq lock held */
3787 if (policy < 0) {
3788 reset_on_fork = p->sched_reset_on_fork;
3789 policy = oldpolicy = p->policy;
3790 } else {
3791 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
3792 policy &= ~SCHED_RESET_ON_FORK;
3793
3794 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3795 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3796 policy != SCHED_IDLE)
3797 return -EINVAL;
3798 }
3799
3800 /*
3801 * Valid priorities for SCHED_FIFO and SCHED_RR are
3802 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3803 * SCHED_BATCH and SCHED_IDLE is 0.
3804 */
3805 if (param->sched_priority < 0 ||
3806 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3807 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3808 return -EINVAL;
3809 if (rt_policy(policy) != (param->sched_priority != 0))
3810 return -EINVAL;
3811
3812 /*
3813 * Allow unprivileged RT tasks to decrease priority:
3814 */
3815 if (user && !capable(CAP_SYS_NICE)) {
3816 if (rt_policy(policy)) {
3817 unsigned long rlim_rtprio =
3818 task_rlimit(p, RLIMIT_RTPRIO);
3819
3820 /* can't set/change the rt policy */
3821 if (policy != p->policy && !rlim_rtprio)
3822 return -EPERM;
3823
3824 /* can't increase priority */
3825 if (param->sched_priority > p->rt_priority &&
3826 param->sched_priority > rlim_rtprio)
3827 return -EPERM;
3828 }
3829
3830 /*
3831 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3832 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3833 */
3834 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3835 if (!can_nice(p, TASK_NICE(p)))
3836 return -EPERM;
3837 }
3838
3839 /* can't change other user's priorities */
3840 if (!check_same_owner(p))
3841 return -EPERM;
3842
3843 /* Normal users shall not reset the sched_reset_on_fork flag */
3844 if (p->sched_reset_on_fork && !reset_on_fork)
3845 return -EPERM;
3846 }
3847
3848 if (user) {
3849 retval = security_task_setscheduler(p);
3850 if (retval)
3851 return retval;
3852 }
3853
3854 /*
3855 * make sure no PI-waiters arrive (or leave) while we are
3856 * changing the priority of the task:
3857 *
3858 * To be able to change p->policy safely, the appropriate
3859 * runqueue lock must be held.
3860 */
3861 rq = task_rq_lock(p, &flags);
3862
3863 /*
3864 * Changing the policy of the stop thread is a very bad idea.
3865 */
3866 if (p == rq->stop) {
3867 task_rq_unlock(rq, p, &flags);
3868 return -EINVAL;
3869 }
3870
3871 /*
3872 * If not changing anything there's no need to proceed further:
3873 */
3874 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
3875 param->sched_priority == p->rt_priority))) {
3876 task_rq_unlock(rq, p, &flags);
3877 return 0;
3878 }
3879
3880#ifdef CONFIG_RT_GROUP_SCHED
3881 if (user) {
3882 /*
3883 * Do not allow realtime tasks into groups that have no runtime
3884 * assigned.
3885 */
3886 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3887 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3888 !task_group_is_autogroup(task_group(p))) {
3889 task_rq_unlock(rq, p, &flags);
3890 return -EPERM;
3891 }
3892 }
3893#endif
3894
3895 /* recheck policy now with rq lock held */
3896 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3897 policy = oldpolicy = -1;
3898 task_rq_unlock(rq, p, &flags);
3899 goto recheck;
3900 }
3901 on_rq = p->on_rq;
3902 running = task_current(rq, p);
3903 if (on_rq)
3904 dequeue_task(rq, p, 0);
3905 if (running)
3906 p->sched_class->put_prev_task(rq, p);
3907
3908 p->sched_reset_on_fork = reset_on_fork;
3909
3910 oldprio = p->prio;
3911 prev_class = p->sched_class;
3912 __setscheduler(rq, p, policy, param->sched_priority);
3913
3914 if (running)
3915 p->sched_class->set_curr_task(rq);
3916 if (on_rq)
3917 enqueue_task(rq, p, 0);
3918
3919 check_class_changed(rq, p, prev_class, oldprio);
3920 task_rq_unlock(rq, p, &flags);
3921
3922 rt_mutex_adjust_pi(p);
3923
3924 return 0;
3925}
3926
3927/**
3928 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3929 * @p: the task in question.
3930 * @policy: new policy.
3931 * @param: structure containing the new RT priority.
3932 *
3933 * NOTE that the task may already be dead.
3934 */
3935int sched_setscheduler(struct task_struct *p, int policy,
3936 const struct sched_param *param)
3937{
3938 return __sched_setscheduler(p, policy, param, true);
3939}
3940EXPORT_SYMBOL_GPL(sched_setscheduler);
3941
3942/**
3943 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3944 * @p: the task in question.
3945 * @policy: new policy.
3946 * @param: structure containing the new RT priority.
3947 *
3948 * Just like sched_setscheduler, only don't bother checking if the
3949 * current context has permission. For example, this is needed in
3950 * stop_machine(): we create temporary high priority worker threads,
3951 * but our caller might not have that capability.
3952 */
3953int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3954 const struct sched_param *param)
3955{
3956 return __sched_setscheduler(p, policy, param, false);
3957}
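/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * kernel code that spawns a helper thread and must make it RT regardless of
 * the caller's credentials would use the _nocheck variant roughly like this
 * ('my_fn' and the chosen priority are assumptions for the example):
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *	struct task_struct *tsk = kthread_create(my_fn, NULL, "my-rt-helper");
 *
 *	if (!IS_ERR(tsk)) {
 *		sched_setscheduler_nocheck(tsk, SCHED_FIFO, &param);
 *		wake_up_process(tsk);
 *	}
 */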
3958
3959static int
3960do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3961{
3962 struct sched_param lparam;
3963 struct task_struct *p;
3964 int retval;
3965
3966 if (!param || pid < 0)
3967 return -EINVAL;
3968 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3969 return -EFAULT;
3970
3971 rcu_read_lock();
3972 retval = -ESRCH;
3973 p = find_process_by_pid(pid);
3974 if (p != NULL)
3975 retval = sched_setscheduler(p, policy, &lparam);
3976 rcu_read_unlock();
3977
3978 return retval;
3979}
3980
3981/**
3982 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3983 * @pid: the pid in question.
3984 * @policy: new policy.
3985 * @param: structure containing the new RT priority.
3986 */
3987SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3988 struct sched_param __user *, param)
3989{
3990 /* negative values for policy are not valid */
3991 if (policy < 0)
3992 return -EINVAL;
3993
3994 return do_sched_setscheduler(pid, policy, param);
3995}
3996
3997/**
3998 * sys_sched_setparam - set/change the RT priority of a thread
3999 * @pid: the pid in question.
4000 * @param: structure containing the new RT priority.
4001 */
4002SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4003{
4004 return do_sched_setscheduler(pid, -1, param);
4005}
4006
4007/**
4008 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4009 * @pid: the pid in question.
4010 */
4011SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4012{
4013 struct task_struct *p;
4014 int retval;
4015
4016 if (pid < 0)
4017 return -EINVAL;
4018
4019 retval = -ESRCH;
4020 rcu_read_lock();
4021 p = find_process_by_pid(pid);
4022 if (p) {
4023 retval = security_task_getscheduler(p);
4024 if (!retval)
4025 retval = p->policy
4026 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4027 }
4028 rcu_read_unlock();
4029 return retval;
4030}
4031
4032/**
4033 * sys_sched_getparam - get the RT priority of a thread
4034 * @pid: the pid in question.
4035 * @param: structure containing the RT priority.
4036 */
4037SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4038{
4039 struct sched_param lp;
4040 struct task_struct *p;
4041 int retval;
4042
4043 if (!param || pid < 0)
4044 return -EINVAL;
4045
4046 rcu_read_lock();
4047 p = find_process_by_pid(pid);
4048 retval = -ESRCH;
4049 if (!p)
4050 goto out_unlock;
4051
4052 retval = security_task_getscheduler(p);
4053 if (retval)
4054 goto out_unlock;
4055
4056 lp.sched_priority = p->rt_priority;
4057 rcu_read_unlock();
4058
4059 /*
4060 * This one might sleep, we cannot do it with a spinlock held ...
4061 */
4062 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4063
4064 return retval;
4065
4066out_unlock:
4067 rcu_read_unlock();
4068 return retval;
4069}
4070
4071long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4072{
4073 cpumask_var_t cpus_allowed, new_mask;
4074 struct task_struct *p;
4075 int retval;
4076
4077 get_online_cpus();
4078 rcu_read_lock();
4079
4080 p = find_process_by_pid(pid);
4081 if (!p) {
4082 rcu_read_unlock();
4083 put_online_cpus();
4084 return -ESRCH;
4085 }
4086
4087 /* Prevent p going away */
4088 get_task_struct(p);
4089 rcu_read_unlock();
4090
4091 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4092 retval = -ENOMEM;
4093 goto out_put_task;
4094 }
4095 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4096 retval = -ENOMEM;
4097 goto out_free_cpus_allowed;
4098 }
4099 retval = -EPERM;
4100 if (!check_same_owner(p)) {
4101 rcu_read_lock();
4102 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4103 rcu_read_unlock();
4104 goto out_unlock;
4105 }
4106 rcu_read_unlock();
4107 }
4108
4109 retval = security_task_setscheduler(p);
4110 if (retval)
4111 goto out_unlock;
4112
4113 cpuset_cpus_allowed(p, cpus_allowed);
4114 cpumask_and(new_mask, in_mask, cpus_allowed);
4115again:
4116 retval = set_cpus_allowed_ptr(p, new_mask);
4117
4118 if (!retval) {
4119 cpuset_cpus_allowed(p, cpus_allowed);
4120 if (!cpumask_subset(new_mask, cpus_allowed)) {
4121 /*
4122 * We must have raced with a concurrent cpuset
4123 * update. Just reset the cpus_allowed to the
4124 * cpuset's cpus_allowed
4125 */
4126 cpumask_copy(new_mask, cpus_allowed);
4127 goto again;
4128 }
4129 }
4130out_unlock:
4131 free_cpumask_var(new_mask);
4132out_free_cpus_allowed:
4133 free_cpumask_var(cpus_allowed);
4134out_put_task:
4135 put_task_struct(p);
4136 put_online_cpus();
4137 return retval;
4138}
4139
4140static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4141 struct cpumask *new_mask)
4142{
4143 if (len < cpumask_size())
4144 cpumask_clear(new_mask);
4145 else if (len > cpumask_size())
4146 len = cpumask_size();
4147
4148 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4149}
4150
4151/**
4152 * sys_sched_setaffinity - set the cpu affinity of a process
4153 * @pid: pid of the process
4154 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4155 * @user_mask_ptr: user-space pointer to the new cpu mask
4156 */
4157SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4158 unsigned long __user *, user_mask_ptr)
4159{
4160 cpumask_var_t new_mask;
4161 int retval;
4162
4163 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4164 return -ENOMEM;
4165
4166 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4167 if (retval == 0)
4168 retval = sched_setaffinity(pid, new_mask);
4169 free_cpumask_var(new_mask);
4170 return retval;
4171}
4172
4173long sched_getaffinity(pid_t pid, struct cpumask *mask)
4174{
4175 struct task_struct *p;
4176 unsigned long flags;
4177 int retval;
4178
4179 get_online_cpus();
4180 rcu_read_lock();
4181
4182 retval = -ESRCH;
4183 p = find_process_by_pid(pid);
4184 if (!p)
4185 goto out_unlock;
4186
4187 retval = security_task_getscheduler(p);
4188 if (retval)
4189 goto out_unlock;
4190
4191 raw_spin_lock_irqsave(&p->pi_lock, flags);
4192 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4193 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4194
4195out_unlock:
4196 rcu_read_unlock();
4197 put_online_cpus();
4198
4199 return retval;
4200}
4201
4202/**
4203 * sys_sched_getaffinity - get the cpu affinity of a process
4204 * @pid: pid of the process
4205 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4206 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4207 */
4208SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4209 unsigned long __user *, user_mask_ptr)
4210{
4211 int ret;
4212 cpumask_var_t mask;
4213
4214 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4215 return -EINVAL;
4216 if (len & (sizeof(unsigned long)-1))
4217 return -EINVAL;
4218
4219 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4220 return -ENOMEM;
4221
4222 ret = sched_getaffinity(pid, mask);
4223 if (ret == 0) {
4224 size_t retlen = min_t(size_t, len, cpumask_size());
4225
4226 if (copy_to_user(user_mask_ptr, mask, retlen))
4227 ret = -EFAULT;
4228 else
4229 ret = retlen;
4230 }
4231 free_cpumask_var(mask);
4232
4233 return ret;
4234}
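/*
 * Note (editorial addition): unlike the glibc wrapper, which returns 0 on
 * success, the raw syscall above returns the number of bytes copied into
 * @user_mask_ptr (at most cpumask_size()). A minimal raw user-space sketch,
 * assuming <sched.h>, <sys/syscall.h> and a libc providing CPU_ISSET():
 *
 *	cpu_set_t set;
 *	long n = syscall(SYS_sched_getaffinity, 0, sizeof(set), &set);
 *	// n > 0: the first 'n' bytes of 'set' are valid, test bits with
 *	// CPU_ISSET(); n < 0: errno holds the error
 */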
4235
4236/**
4237 * sys_sched_yield - yield the current processor to other threads.
4238 *
4239 * This function yields the current CPU to other tasks. If there are no
4240 * other threads running on this CPU then this function will return.
4241 */
4242SYSCALL_DEFINE0(sched_yield)
4243{
4244 struct rq *rq = this_rq_lock();
4245
4246 schedstat_inc(rq, yld_count);
4247 current->sched_class->yield_task(rq);
4248
4249 /*
4250 * Since we are going to call schedule() anyway, there's
4251 * no need to preempt or enable interrupts:
4252 */
4253 __release(rq->lock);
4254 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4255 do_raw_spin_unlock(&rq->lock);
4256 sched_preempt_enable_no_resched();
4257
4258 schedule();
4259
4260 return 0;
4261}
4262
4263static inline int should_resched(void)
4264{
4265 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4266}
4267
4268static void __cond_resched(void)
4269{
4270 add_preempt_count(PREEMPT_ACTIVE);
4271 __schedule();
4272 sub_preempt_count(PREEMPT_ACTIVE);
4273}
4274
4275int __sched _cond_resched(void)
4276{
4277 if (should_resched()) {
4278 __cond_resched();
4279 return 1;
4280 }
4281 return 0;
4282}
4283EXPORT_SYMBOL(_cond_resched);
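/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * long-running kernel loops call cond_resched(), which expands to a call to
 * _cond_resched() above, to offer a voluntary preemption point:
 *
 *	for (i = 0; i < nr_items; i++) {
 *		process_item(&items[i]);	// assumed per-item work
 *		cond_resched();			// let higher-priority tasks run
 *	}
 */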
4284
4285/*
4286 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4287 * call schedule, and on return reacquire the lock.
4288 *
4289 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4290 * operations here to prevent schedule() from being called twice (once via
4291 * spin_unlock(), once by hand).
4292 */
4293int __cond_resched_lock(spinlock_t *lock)
4294{
4295 int resched = should_resched();
4296 int ret = 0;
4297
4298 lockdep_assert_held(lock);
4299
4300 if (spin_needbreak(lock) || resched) {
4301 spin_unlock(lock);
4302 if (resched)
4303 __cond_resched();
4304 else
4305 cpu_relax();
4306 ret = 1;
4307 spin_lock(lock);
4308 }
4309 return ret;
4310}
4311EXPORT_SYMBOL(__cond_resched_lock);
4312
4313int __sched __cond_resched_softirq(void)
4314{
4315 BUG_ON(!in_softirq());
4316
4317 if (should_resched()) {
4318 local_bh_enable();
4319 __cond_resched();
4320 local_bh_disable();
4321 return 1;
4322 }
4323 return 0;
4324}
4325EXPORT_SYMBOL(__cond_resched_softirq);
4326
4327/**
4328 * yield - yield the current processor to other threads.
4329 *
4330 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4331 *
4332 * The scheduler is at all times free to pick the calling task as the most
4333 * eligible task to run; if removing the yield() call from your code breaks
4334 * it, it's already broken.
4335 *
4336 * Typical broken usage is:
4337 *
4338 * while (!event)
4339 * yield();
4340 *
4341 * where one assumes that yield() will let 'the other' process run that will
4342 * make event true. If the current task is a SCHED_FIFO task that will never
4343 * happen. Never use yield() as a progress guarantee!!
4344 *
4345 * If you want to use yield() to wait for something, use wait_event().
4346 * If you want to use yield() to be 'nice' for others, use cond_resched().
4347 * If you still want to use yield(), do not!
4348 */
4349void __sched yield(void)
4350{
4351 set_current_state(TASK_RUNNING);
4352 sys_sched_yield();
4353}
4354EXPORT_SYMBOL(yield);
4355
4356/**
4357 * yield_to - yield the current processor to another thread in
4358 * your thread group, or accelerate that thread toward the
4359 * processor it's on.
4360 * @p: target task
4361 * @preempt: whether task preemption is allowed or not
4362 *
4363 * It's the caller's job to ensure that the target task struct
4364 * can't go away on us before we can do any checks.
4365 *
4366 * Returns true if we indeed boosted the target task.
4367 */
4368bool __sched yield_to(struct task_struct *p, bool preempt)
4369{
4370 struct task_struct *curr = current;
4371 struct rq *rq, *p_rq;
4372 unsigned long flags;
4373 bool yielded = false;
4374
4375 local_irq_save(flags);
4376 rq = this_rq();
4377
4378again:
4379 p_rq = task_rq(p);
4380 double_rq_lock(rq, p_rq);
4381 while (task_rq(p) != p_rq) {
4382 double_rq_unlock(rq, p_rq);
4383 goto again;
4384 }
4385
4386 if (!curr->sched_class->yield_to_task)
4387 goto out;
4388
4389 if (curr->sched_class != p->sched_class)
4390 goto out;
4391
4392 if (task_running(p_rq, p) || p->state)
4393 goto out;
4394
4395 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4396 if (yielded) {
4397 schedstat_inc(rq, yld_count);
4398 /*
4399 * Make p's CPU reschedule; pick_next_entity takes care of
4400 * fairness.
4401 */
4402 if (preempt && rq != p_rq)
4403 resched_task(p_rq->curr);
4404 }
4405
4406out:
4407 double_rq_unlock(rq, p_rq);
4408 local_irq_restore(flags);
4409
4410 if (yielded)
4411 schedule();
4412
4413 return yielded;
4414}
4415EXPORT_SYMBOL_GPL(yield_to);
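/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a hypervisor-style caller that has identified the task presumed to hold a
 * contended lock can boost it with yield_to(). 'target_pid' is an assumed
 * struct pid pointer; the task reference keeps it from going away:
 *
 *	struct task_struct *task = get_pid_task(target_pid, PIDTYPE_PID);
 *
 *	if (task) {
 *		yield_to(task, true);	// boost the presumed lock holder
 *		put_task_struct(task);
 *	}
 */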
4416
4417/*
4418 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4419 * that process accounting knows that this is a task in IO wait state.
4420 */
4421void __sched io_schedule(void)
4422{
4423 struct rq *rq = raw_rq();
4424
4425 delayacct_blkio_start();
4426 atomic_inc(&rq->nr_iowait);
4427 blk_flush_plug(current);
4428 current->in_iowait = 1;
4429 schedule();
4430 current->in_iowait = 0;
4431 atomic_dec(&rq->nr_iowait);
4432 delayacct_blkio_end();
4433}
4434EXPORT_SYMBOL(io_schedule);
4435
4436long __sched io_schedule_timeout(long timeout)
4437{
4438 struct rq *rq = raw_rq();
4439 long ret;
4440
4441 delayacct_blkio_start();
4442 atomic_inc(&rq->nr_iowait);
4443 blk_flush_plug(current);
4444 current->in_iowait = 1;
4445 ret = schedule_timeout(timeout);
4446 current->in_iowait = 0;
4447 atomic_dec(&rq->nr_iowait);
4448 delayacct_blkio_end();
4449 return ret;
4450}
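/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a bounded, I/O-accounted sleep in the style of congestion_wait().
 * 'my_waitqueue' is an assumed wait_queue_head_t the sleeper is woken on:
 *
 *	long remaining;
 *	DEFINE_WAIT(wait);
 *
 *	prepare_to_wait(&my_waitqueue, &wait, TASK_UNINTERRUPTIBLE);
 *	remaining = io_schedule_timeout(HZ / 10);
 *	finish_wait(&my_waitqueue, &wait);
 *	// remaining == 0 means the 100ms budget expired without a wakeup
 */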
4451
4452/**
4453 * sys_sched_get_priority_max - return maximum RT priority.
4454 * @policy: scheduling class.
4455 *
4456 * this syscall returns the maximum rt_priority that can be used
4457 * by a given scheduling class.
4458 */
4459SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4460{
4461 int ret = -EINVAL;
4462
4463 switch (policy) {
4464 case SCHED_FIFO:
4465 case SCHED_RR:
4466 ret = MAX_USER_RT_PRIO-1;
4467 break;
4468 case SCHED_NORMAL:
4469 case SCHED_BATCH:
4470 case SCHED_IDLE:
4471 ret = 0;
4472 break;
4473 }
4474 return ret;
4475}
4476
4477/**
4478 * sys_sched_get_priority_min - return minimum RT priority.
4479 * @policy: scheduling class.
4480 *
4481 * this syscall returns the minimum rt_priority that can be used
4482 * by a given scheduling class.
4483 */
4484SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4485{
4486 int ret = -EINVAL;
4487
4488 switch (policy) {
4489 case SCHED_FIFO:
4490 case SCHED_RR:
4491 ret = 1;
4492 break;
4493 case SCHED_NORMAL:
4494 case SCHED_BATCH:
4495 case SCHED_IDLE:
4496 ret = 0;
4497 }
4498 return ret;
4499}
4500
4501/**
4502 * sys_sched_rr_get_interval - return the default timeslice of a process.
4503 * @pid: pid of the process.
4504 * @interval: userspace pointer to the timeslice value.
4505 *
4506 * this syscall writes the default timeslice value of a given process
4507 * into the user-space timespec buffer. A value of '0' means infinity.
4508 */
4509SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4510 struct timespec __user *, interval)
4511{
4512 struct task_struct *p;
4513 unsigned int time_slice;
4514 unsigned long flags;
4515 struct rq *rq;
4516 int retval;
4517 struct timespec t;
4518
4519 if (pid < 0)
4520 return -EINVAL;
4521
4522 retval = -ESRCH;
4523 rcu_read_lock();
4524 p = find_process_by_pid(pid);
4525 if (!p)
4526 goto out_unlock;
4527
4528 retval = security_task_getscheduler(p);
4529 if (retval)
4530 goto out_unlock;
4531
4532 rq = task_rq_lock(p, &flags);
4533 time_slice = p->sched_class->get_rr_interval(rq, p);
4534 task_rq_unlock(rq, p, &flags);
4535
4536 rcu_read_unlock();
4537 jiffies_to_timespec(time_slice, &t);
4538 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4539 return retval;
4540
4541out_unlock:
4542 rcu_read_unlock();
4543 return retval;
4544}
4545
4546static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4547
4548void sched_show_task(struct task_struct *p)
4549{
4550 unsigned long free = 0;
4551 int ppid;
4552 unsigned state;
4553
4554 state = p->state ? __ffs(p->state) + 1 : 0;
4555 printk(KERN_INFO "%-15.15s %c", p->comm,
4556 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4557#if BITS_PER_LONG == 32
4558 if (state == TASK_RUNNING)
4559 printk(KERN_CONT " running ");
4560 else
4561 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4562#else
4563 if (state == TASK_RUNNING)
4564 printk(KERN_CONT " running task ");
4565 else
4566 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4567#endif
4568#ifdef CONFIG_DEBUG_STACK_USAGE
4569 free = stack_not_used(p);
4570#endif
4571 rcu_read_lock();
4572 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4573 rcu_read_unlock();
4574 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4575 task_pid_nr(p), ppid,
4576 (unsigned long)task_thread_info(p)->flags);
4577
4578 show_stack(p, NULL);
4579}
4580
4581void show_state_filter(unsigned long state_filter)
4582{
4583 struct task_struct *g, *p;
4584
4585#if BITS_PER_LONG == 32
4586 printk(KERN_INFO
4587 " task PC stack pid father\n");
4588#else
4589 printk(KERN_INFO
4590 " task PC stack pid father\n");
4591#endif
4592 rcu_read_lock();
4593 do_each_thread(g, p) {
4594 /*
4595 * reset the NMI-timeout, listing all files on a slow
4596 * console might take a lot of time:
4597 */
4598 touch_nmi_watchdog();
4599 if (!state_filter || (p->state & state_filter))
4600 sched_show_task(p);
4601 } while_each_thread(g, p);
4602
4603 touch_all_softlockup_watchdogs();
4604
4605#ifdef CONFIG_SCHED_DEBUG
4606 sysrq_sched_debug_show();
4607#endif
4608 rcu_read_unlock();
4609 /*
4610 * Only show locks if all tasks are dumped:
4611 */
4612 if (!state_filter)
4613 debug_show_all_locks();
4614}
4615
4616void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4617{
4618 idle->sched_class = &idle_sched_class;
4619}
4620
4621/**
4622 * init_idle - set up an idle thread for a given CPU
4623 * @idle: task in question
4624 * @cpu: cpu the idle task belongs to
4625 *
4626 * NOTE: this function does not set the idle thread's NEED_RESCHED
4627 * flag, to make booting more robust.
4628 */
4629void __cpuinit init_idle(struct task_struct *idle, int cpu)
4630{
4631 struct rq *rq = cpu_rq(cpu);
4632 unsigned long flags;
4633
4634 raw_spin_lock_irqsave(&rq->lock, flags);
4635
4636 __sched_fork(idle);
4637 idle->state = TASK_RUNNING;
4638 idle->se.exec_start = sched_clock();
4639
4640 do_set_cpus_allowed(idle, cpumask_of(cpu));
4641 /*
4642 * We're having a chicken and egg problem, even though we are
4643 * holding rq->lock, the cpu isn't yet set to this cpu so the
4644 * lockdep check in task_group() will fail.
4645 *
4646 * Similar case to sched_fork(); alternatively we could
4647 * use task_rq_lock() here and obtain the other rq->lock.
4648 *
4649 * Silence PROVE_RCU
4650 */
4651 rcu_read_lock();
4652 __set_task_cpu(idle, cpu);
4653 rcu_read_unlock();
4654
4655 rq->curr = rq->idle = idle;
4656#if defined(CONFIG_SMP)
4657 idle->on_cpu = 1;
4658#endif
4659 raw_spin_unlock_irqrestore(&rq->lock, flags);
4660
4661 /* Set the preempt count _outside_ the spinlocks! */
4662 task_thread_info(idle)->preempt_count = 0;
4663
4664 /*
4665 * The idle tasks have their own, simple scheduling class:
4666 */
4667 idle->sched_class = &idle_sched_class;
4668 ftrace_graph_init_idle_task(idle, cpu);
4669#if defined(CONFIG_SMP)
4670 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4671#endif
4672}
4673
4674#ifdef CONFIG_SMP
4675void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4676{
4677 if (p->sched_class && p->sched_class->set_cpus_allowed)
4678 p->sched_class->set_cpus_allowed(p, new_mask);
4679
4680 cpumask_copy(&p->cpus_allowed, new_mask);
4681 p->nr_cpus_allowed = cpumask_weight(new_mask);
4682}
4683
4684/*
4685 * This is how migration works:
4686 *
4687 * 1) we invoke migration_cpu_stop() on the target CPU using
4688 * stop_one_cpu().
4689 * 2) stopper starts to run (implicitly forcing the migrated thread
4690 * off the CPU)
4691 * 3) it checks whether the migrated task is still in the wrong runqueue.
4692 * 4) if it's in the wrong runqueue then the migration thread removes
4693 * it and puts it into the right queue.
4694 * 5) stopper completes and stop_one_cpu() returns and the migration
4695 * is done.
4696 */
4697
4698/*
4699 * Change a given task's CPU affinity. Migrate the thread to a
4700 * proper CPU and schedule it away if the CPU it's executing on
4701 * is removed from the allowed bitmask.
4702 *
4703 * NOTE: the caller must have a valid reference to the task, the
4704 * task must not exit() & deallocate itself prematurely. The
4705 * call is not atomic; no spinlocks may be held.
4706 */
4707int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4708{
4709 unsigned long flags;
4710 struct rq *rq;
4711 unsigned int dest_cpu;
4712 int ret = 0;
4713
4714 rq = task_rq_lock(p, &flags);
4715
4716 if (cpumask_equal(&p->cpus_allowed, new_mask))
4717 goto out;
4718
4719 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4720 ret = -EINVAL;
4721 goto out;
4722 }
4723
4724 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4725 ret = -EINVAL;
4726 goto out;
4727 }
4728
4729 do_set_cpus_allowed(p, new_mask);
4730
4731 /* Can the task run on the task's current CPU? If so, we're done */
4732 if (cpumask_test_cpu(task_cpu(p), new_mask))
4733 goto out;
4734
4735 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4736 if (p->on_rq) {
4737 struct migration_arg arg = { p, dest_cpu };
4738 /* Need help from migration thread: drop lock and wait. */
4739 task_rq_unlock(rq, p, &flags);
4740 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4741 tlb_migrate_finish(p->mm);
4742 return 0;
4743 }
4744out:
4745 task_rq_unlock(rq, p, &flags);
4746
4747 return ret;
4748}
4749EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
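/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * restricting an already-running helper thread ('worker' is an assumed
 * task_struct pointer the caller holds a reference to) to one CPU:
 *
 *	if (set_cpus_allowed_ptr(worker, cpumask_of(target_cpu)))
 *		pr_warn("could not move worker to CPU %d\n", target_cpu);
 *
 * For a kthread that has not started running yet, kthread_bind() is the
 * usual interface; this is the general-purpose path that also migrates a
 * running task.
 */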
4750
4751/*
4752 * Move a (non-current) task off this cpu, onto the dest cpu. We're doing
4753 * this because either it can't run here any more (set_cpus_allowed()
4754 * away from this CPU, or CPU going down), or because we're
4755 * attempting to rebalance this task on exec (sched_exec).
4756 *
4757 * So we race with normal scheduler movements, but that's OK, as long
4758 * as the task is no longer on this CPU.
4759 *
4760 * Returns non-zero if task was successfully migrated.
4761 */
4762static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4763{
4764 struct rq *rq_dest, *rq_src;
4765 int ret = 0;
4766
4767 if (unlikely(!cpu_active(dest_cpu)))
4768 return ret;
4769
4770 rq_src = cpu_rq(src_cpu);
4771 rq_dest = cpu_rq(dest_cpu);
4772
4773 raw_spin_lock(&p->pi_lock);
4774 double_rq_lock(rq_src, rq_dest);
4775 /* Already moved. */
4776 if (task_cpu(p) != src_cpu)
4777 goto done;
4778 /* Affinity changed (again). */
4779 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4780 goto fail;
4781
4782 /*
4783 * If we're not on a rq, the next wake-up will ensure we're
4784 * placed properly.
4785 */
4786 if (p->on_rq) {
4787 dequeue_task(rq_src, p, 0);
4788 set_task_cpu(p, dest_cpu);
4789 enqueue_task(rq_dest, p, 0);
4790 check_preempt_curr(rq_dest, p, 0);
4791 }
4792done:
4793 ret = 1;
4794fail:
4795 double_rq_unlock(rq_src, rq_dest);
4796 raw_spin_unlock(&p->pi_lock);
4797 return ret;
4798}
4799
4800/*
4801 * migration_cpu_stop - this will be executed by a highprio stopper thread
4802 * and performs thread migration by bumping thread off CPU then
4803 * 'pushing' onto another runqueue.
4804 */
4805static int migration_cpu_stop(void *data)
4806{
4807 struct migration_arg *arg = data;
4808
4809 /*
4810 * The original target cpu might have gone down and we might
4811 * be on another cpu but it doesn't matter.
4812 */
4813 local_irq_disable();
4814 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4815 local_irq_enable();
4816 return 0;
4817}
4818
4819#ifdef CONFIG_HOTPLUG_CPU
4820
4821/*
4822 * Ensures that the idle task is using init_mm right before its cpu goes
4823 * offline.
4824 */
4825void idle_task_exit(void)
4826{
4827 struct mm_struct *mm = current->active_mm;
4828
4829 BUG_ON(cpu_online(smp_processor_id()));
4830
4831 if (mm != &init_mm)
4832 switch_mm(mm, &init_mm, current);
4833 mmdrop(mm);
4834}
4835
4836/*
4837 * Since this CPU is going 'away' for a while, fold any nr_active delta
4838 * we might have. Assumes we're called after migrate_tasks() so that the
4839 * nr_active count is stable.
4840 *
4841 * Also see the comment "Global load-average calculations".
4842 */
4843static void calc_load_migrate(struct rq *rq)
4844{
4845 long delta = calc_load_fold_active(rq);
4846 if (delta)
4847 atomic_long_add(delta, &calc_load_tasks);
4848}
4849
4850/*
4851 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4852 * try_to_wake_up()->select_task_rq().
4853 *
4854 * Called with rq->lock held even though we're in stop_machine() and
4855 * there's no concurrency possible, we hold the required locks anyway
4856 * because of lock validation efforts.
4857 */
4858static void migrate_tasks(unsigned int dead_cpu)
4859{
4860 struct rq *rq = cpu_rq(dead_cpu);
4861 struct task_struct *next, *stop = rq->stop;
4862 int dest_cpu;
4863
4864 /*
4865 * Fudge the rq selection such that the task selection loop below
4866 * doesn't get stuck on the currently eligible stop task.
4867 *
4868 * We're currently inside stop_machine() and the rq is either stuck
4869 * in the stop_machine_cpu_stop() loop, or we're executing this code,
4870 * either way we should never end up calling schedule() until we're
4871 * done here.
4872 */
4873 rq->stop = NULL;
4874
4875 for ( ; ; ) {
4876 /*
4877 * There's this thread running, bail when that's the only
4878 * remaining thread.
4879 */
4880 if (rq->nr_running == 1)
4881 break;
4882
4883 next = pick_next_task(rq);
4884 BUG_ON(!next);
4885 next->sched_class->put_prev_task(rq, next);
4886
4887 /* Find suitable destination for @next, with force if needed. */
4888 dest_cpu = select_fallback_rq(dead_cpu, next);
4889 raw_spin_unlock(&rq->lock);
4890
4891 __migrate_task(next, dead_cpu, dest_cpu);
4892
4893 raw_spin_lock(&rq->lock);
4894 }
4895
4896 rq->stop = stop;
4897}
4898
4899#endif /* CONFIG_HOTPLUG_CPU */
4900
4901#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4902
4903static struct ctl_table sd_ctl_dir[] = {
4904 {
4905 .procname = "sched_domain",
4906 .mode = 0555,
4907 },
4908 {}
4909};
4910
4911static struct ctl_table sd_ctl_root[] = {
4912 {
4913 .procname = "kernel",
4914 .mode = 0555,
4915 .child = sd_ctl_dir,
4916 },
4917 {}
4918};
4919
4920static struct ctl_table *sd_alloc_ctl_entry(int n)
4921{
4922 struct ctl_table *entry =
4923 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4924
4925 return entry;
4926}
4927
4928static void sd_free_ctl_entry(struct ctl_table **tablep)
4929{
4930 struct ctl_table *entry;
4931
4932 /*
4933 * In the intermediate directories, both the child directory and
4934 * procname are dynamically allocated and could fail but the mode
4935 * will always be set. In the lowest directory the names are
4936 * static strings and all have proc handlers.
4937 */
4938 for (entry = *tablep; entry->mode; entry++) {
4939 if (entry->child)
4940 sd_free_ctl_entry(&entry->child);
4941 if (entry->proc_handler == NULL)
4942 kfree(entry->procname);
4943 }
4944
4945 kfree(*tablep);
4946 *tablep = NULL;
4947}
4948
4949static int min_load_idx = 0;
4950static int max_load_idx = CPU_LOAD_IDX_MAX;
4951
4952static void
4953set_table_entry(struct ctl_table *entry,
4954 const char *procname, void *data, int maxlen,
4955 umode_t mode, proc_handler *proc_handler,
4956 bool load_idx)
4957{
4958 entry->procname = procname;
4959 entry->data = data;
4960 entry->maxlen = maxlen;
4961 entry->mode = mode;
4962 entry->proc_handler = proc_handler;
4963
4964 if (load_idx) {
4965 entry->extra1 = &min_load_idx;
4966 entry->extra2 = &max_load_idx;
4967 }
4968}
4969
4970static struct ctl_table *
4971sd_alloc_ctl_domain_table(struct sched_domain *sd)
4972{
4973 struct ctl_table *table = sd_alloc_ctl_entry(13);
4974
4975 if (table == NULL)
4976 return NULL;
4977
4978 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4979 sizeof(long), 0644, proc_doulongvec_minmax, false);
4980 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4981 sizeof(long), 0644, proc_doulongvec_minmax, false);
4982 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4983 sizeof(int), 0644, proc_dointvec_minmax, true);
4984 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4985 sizeof(int), 0644, proc_dointvec_minmax, true);
4986 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4987 sizeof(int), 0644, proc_dointvec_minmax, true);
4988 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4989 sizeof(int), 0644, proc_dointvec_minmax, true);
4990 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4991 sizeof(int), 0644, proc_dointvec_minmax, true);
4992 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4993 sizeof(int), 0644, proc_dointvec_minmax, false);
4994 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4995 sizeof(int), 0644, proc_dointvec_minmax, false);
4996 set_table_entry(&table[9], "cache_nice_tries",
4997 &sd->cache_nice_tries,
4998 sizeof(int), 0644, proc_dointvec_minmax, false);
4999 set_table_entry(&table[10], "flags", &sd->flags,
5000 sizeof(int), 0644, proc_dointvec_minmax, false);
5001 set_table_entry(&table[11], "name", sd->name,
5002 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5003 /* &table[12] is terminator */
5004
5005 return table;
5006}
5007
5008static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5009{
5010 struct ctl_table *entry, *table;
5011 struct sched_domain *sd;
5012 int domain_num = 0, i;
5013 char buf[32];
5014
5015 for_each_domain(cpu, sd)
5016 domain_num++;
5017 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5018 if (table == NULL)
5019 return NULL;
5020
5021 i = 0;
5022 for_each_domain(cpu, sd) {
5023 snprintf(buf, 32, "domain%d", i);
5024 entry->procname = kstrdup(buf, GFP_KERNEL);
5025 entry->mode = 0555;
5026 entry->child = sd_alloc_ctl_domain_table(sd);
5027 entry++;
5028 i++;
5029 }
5030 return table;
5031}
5032
5033static struct ctl_table_header *sd_sysctl_header;
5034static void register_sched_domain_sysctl(void)
5035{
5036 int i, cpu_num = num_possible_cpus();
5037 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5038 char buf[32];
5039
5040 WARN_ON(sd_ctl_dir[0].child);
5041 sd_ctl_dir[0].child = entry;
5042
5043 if (entry == NULL)
5044 return;
5045
5046 for_each_possible_cpu(i) {
5047 snprintf(buf, 32, "cpu%d", i);
5048 entry->procname = kstrdup(buf, GFP_KERNEL);
5049 entry->mode = 0555;
5050 entry->child = sd_alloc_ctl_cpu_table(i);
5051 entry++;
5052 }
5053
5054 WARN_ON(sd_sysctl_header);
5055 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5056}
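/*
 * Editorial note (not part of the original file): with CONFIG_SCHED_DEBUG
 * and CONFIG_SYSCTL the code above exposes a tree of the form
 *
 *	/proc/sys/kernel/sched_domain/cpu<N>/domain<M>/{min_interval,
 *		max_interval, busy_idx, ..., flags, name}
 *
 * with one cpu<N> directory per possible CPU and one domain<M> level per
 * sched_domain attached to that CPU.
 */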
5057
5058/* may be called multiple times per register */
5059static void unregister_sched_domain_sysctl(void)
5060{
5061 if (sd_sysctl_header)
5062 unregister_sysctl_table(sd_sysctl_header);
5063 sd_sysctl_header = NULL;
5064 if (sd_ctl_dir[0].child)
5065 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5066}
5067#else
5068static void register_sched_domain_sysctl(void)
5069{
5070}
5071static void unregister_sched_domain_sysctl(void)
5072{
5073}
5074#endif
5075
5076static void set_rq_online(struct rq *rq)
5077{
5078 if (!rq->online) {
5079 const struct sched_class *class;
5080
5081 cpumask_set_cpu(rq->cpu, rq->rd->online);
5082 rq->online = 1;
5083
5084 for_each_class(class) {
5085 if (class->rq_online)
5086 class->rq_online(rq);
5087 }
5088 }
5089}
5090
5091static void set_rq_offline(struct rq *rq)
5092{
5093 if (rq->online) {
5094 const struct sched_class *class;
5095
5096 for_each_class(class) {
5097 if (class->rq_offline)
5098 class->rq_offline(rq);
5099 }
5100
5101 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5102 rq->online = 0;
5103 }
5104}
5105
5106/*
5107 * migration_call - callback that gets triggered when a CPU is added.
5108 * Here we can start up the necessary migration thread for the new CPU.
5109 */
5110static int __cpuinit
5111migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5112{
5113 int cpu = (long)hcpu;
5114 unsigned long flags;
5115 struct rq *rq = cpu_rq(cpu);
5116
5117 switch (action & ~CPU_TASKS_FROZEN) {
5118
5119 case CPU_UP_PREPARE:
5120 rq->calc_load_update = calc_load_update;
5121 break;
5122
5123 case CPU_ONLINE:
5124 /* Update our root-domain */
5125 raw_spin_lock_irqsave(&rq->lock, flags);
5126 if (rq->rd) {
5127 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5128
5129 set_rq_online(rq);
5130 }
5131 raw_spin_unlock_irqrestore(&rq->lock, flags);
5132 break;
5133
5134#ifdef CONFIG_HOTPLUG_CPU
5135 case CPU_DYING:
5136 sched_ttwu_pending();
5137 /* Update our root-domain */
5138 raw_spin_lock_irqsave(&rq->lock, flags);
5139 if (rq->rd) {
5140 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5141 set_rq_offline(rq);
5142 }
5143 migrate_tasks(cpu);
5144 BUG_ON(rq->nr_running != 1); /* the migration thread */
5145 raw_spin_unlock_irqrestore(&rq->lock, flags);
5146 break;
5147
5148 case CPU_DEAD:
5149 calc_load_migrate(rq);
5150 break;
5151#endif
5152 }
5153
5154 update_max_interval();
5155
5156 return NOTIFY_OK;
5157}
5158
5159/*
5160 * Register at high priority so that task migration (migrate_all_tasks)
5161 * happens before everything else. This has to be lower priority than
5162 * the notifier in the perf_event subsystem, though.
5163 */
5164static struct notifier_block __cpuinitdata migration_notifier = {
5165 .notifier_call = migration_call,
5166 .priority = CPU_PRI_MIGRATION,
5167};
5168
5169static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5170 unsigned long action, void *hcpu)
5171{
5172 switch (action & ~CPU_TASKS_FROZEN) {
5173 case CPU_STARTING:
5174 case CPU_DOWN_FAILED:
5175 set_cpu_active((long)hcpu, true);
5176 return NOTIFY_OK;
5177 default:
5178 return NOTIFY_DONE;
5179 }
5180}
5181
5182static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5183 unsigned long action, void *hcpu)
5184{
5185 switch (action & ~CPU_TASKS_FROZEN) {
5186 case CPU_DOWN_PREPARE:
5187 set_cpu_active((long)hcpu, false);
5188 return NOTIFY_OK;
5189 default:
5190 return NOTIFY_DONE;
5191 }
5192}
5193
5194static int __init migration_init(void)
5195{
5196 void *cpu = (void *)(long)smp_processor_id();
5197 int err;
5198
5199 /* Initialize migration for the boot CPU */
5200 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5201 BUG_ON(err == NOTIFY_BAD);
5202 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5203 register_cpu_notifier(&migration_notifier);
5204
5205 /* Register cpu active notifiers */
5206 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5207 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5208
5209 return 0;
5210}
5211early_initcall(migration_init);
5212#endif
5213
5214#ifdef CONFIG_SMP
5215
5216static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5217
5218#ifdef CONFIG_SCHED_DEBUG
5219
5220static __read_mostly int sched_debug_enabled;
5221
5222static int __init sched_debug_setup(char *str)
5223{
5224 sched_debug_enabled = 1;
5225
5226 return 0;
5227}
5228early_param("sched_debug", sched_debug_setup);
5229
5230static inline bool sched_debug(void)
5231{
5232 return sched_debug_enabled;
5233}
5234
5235static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5236 struct cpumask *groupmask)
5237{
5238 struct sched_group *group = sd->groups;
5239 char str[256];
5240
5241 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5242 cpumask_clear(groupmask);
5243
5244 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5245
5246 if (!(sd->flags & SD_LOAD_BALANCE)) {
5247 printk("does not load-balance\n");
5248 if (sd->parent)
5249 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5250 " has parent");
5251 return -1;
5252 }
5253
5254 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5255
5256 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5257 printk(KERN_ERR "ERROR: domain->span does not contain "
5258 "CPU%d\n", cpu);
5259 }
5260 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5261 printk(KERN_ERR "ERROR: domain->groups does not contain"
5262 " CPU%d\n", cpu);
5263 }
5264
5265 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5266 do {
5267 if (!group) {
5268 printk("\n");
5269 printk(KERN_ERR "ERROR: group is NULL\n");
5270 break;
5271 }
5272
5273 /*
5274 * Even though we initialize ->power to something semi-sane,
5275 * we leave power_orig unset. This allows us to detect if
5276 * domain iteration is still funny without causing /0 traps.
5277 */
5278 if (!group->sgp->power_orig) {
5279 printk(KERN_CONT "\n");
5280 printk(KERN_ERR "ERROR: domain->cpu_power not "
5281 "set\n");
5282 break;
5283 }
5284
5285 if (!cpumask_weight(sched_group_cpus(group))) {
5286 printk(KERN_CONT "\n");
5287 printk(KERN_ERR "ERROR: empty group\n");
5288 break;
5289 }
5290
5291 if (!(sd->flags & SD_OVERLAP) &&
5292 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5293 printk(KERN_CONT "\n");
5294 printk(KERN_ERR "ERROR: repeated CPUs\n");
5295 break;
5296 }
5297
5298 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5299
5300 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5301
5302 printk(KERN_CONT " %s", str);
5303 if (group->sgp->power != SCHED_POWER_SCALE) {
5304 printk(KERN_CONT " (cpu_power = %d)",
5305 group->sgp->power);
5306 }
5307
5308 group = group->next;
5309 } while (group != sd->groups);
5310 printk(KERN_CONT "\n");
5311
5312 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5313 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5314
5315 if (sd->parent &&
5316 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5317 printk(KERN_ERR "ERROR: parent span is not a superset "
5318 "of domain->span\n");
5319 return 0;
5320}
5321
5322static void sched_domain_debug(struct sched_domain *sd, int cpu)
5323{
5324 int level = 0;
5325
5326 if (!sched_debug_enabled)
5327 return;
5328
5329 if (!sd) {
5330 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5331 return;
5332 }
5333
5334 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5335
5336 for (;;) {
5337 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5338 break;
5339 level++;
5340 sd = sd->parent;
5341 if (!sd)
5342 break;
5343 }
5344}
5345#else /* !CONFIG_SCHED_DEBUG */
5346# define sched_domain_debug(sd, cpu) do { } while (0)
5347static inline bool sched_debug(void)
5348{
5349 return false;
5350}
5351#endif /* CONFIG_SCHED_DEBUG */
5352
5353static int sd_degenerate(struct sched_domain *sd)
5354{
5355 if (cpumask_weight(sched_domain_span(sd)) == 1)
5356 return 1;
5357
5358 /* Following flags need at least 2 groups */
5359 if (sd->flags & (SD_LOAD_BALANCE |
5360 SD_BALANCE_NEWIDLE |
5361 SD_BALANCE_FORK |
5362 SD_BALANCE_EXEC |
5363 SD_SHARE_CPUPOWER |
5364 SD_SHARE_PKG_RESOURCES)) {
5365 if (sd->groups != sd->groups->next)
5366 return 0;
5367 }
5368
5369 /* Following flags don't use groups */
5370 if (sd->flags & (SD_WAKE_AFFINE))
5371 return 0;
5372
5373 return 1;
5374}
5375
5376static int
5377sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5378{
5379 unsigned long cflags = sd->flags, pflags = parent->flags;
5380
5381 if (sd_degenerate(parent))
5382 return 1;
5383
5384 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5385 return 0;
5386
5387 /* Flags needing groups don't count if only 1 group in parent */
5388 if (parent->groups == parent->groups->next) {
5389 pflags &= ~(SD_LOAD_BALANCE |
5390 SD_BALANCE_NEWIDLE |
5391 SD_BALANCE_FORK |
5392 SD_BALANCE_EXEC |
5393 SD_SHARE_CPUPOWER |
5394 SD_SHARE_PKG_RESOURCES);
5395 if (nr_node_ids == 1)
5396 pflags &= ~SD_SERIALIZE;
5397 }
5398 if (~cflags & pflags)
5399 return 0;
5400
5401 return 1;
5402}
5403
5404static void free_rootdomain(struct rcu_head *rcu)
5405{
5406 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5407
5408 cpupri_cleanup(&rd->cpupri);
5409 free_cpumask_var(rd->rto_mask);
5410 free_cpumask_var(rd->online);
5411 free_cpumask_var(rd->span);
5412 kfree(rd);
5413}
5414
5415static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5416{
5417 struct root_domain *old_rd = NULL;
5418 unsigned long flags;
5419
5420 raw_spin_lock_irqsave(&rq->lock, flags);
5421
5422 if (rq->rd) {
5423 old_rd = rq->rd;
5424
5425 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5426 set_rq_offline(rq);
5427
5428 cpumask_clear_cpu(rq->cpu, old_rd->span);
5429
5430 /*
5431 * If we dont want to free the old_rt yet then
5432 * set old_rd to NULL to skip the freeing later
5433 * in this function:
5434 */
5435 if (!atomic_dec_and_test(&old_rd->refcount))
5436 old_rd = NULL;
5437 }
5438
5439 atomic_inc(&rd->refcount);
5440 rq->rd = rd;
5441
5442 cpumask_set_cpu(rq->cpu, rd->span);
5443 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5444 set_rq_online(rq);
5445
5446 raw_spin_unlock_irqrestore(&rq->lock, flags);
5447
5448 if (old_rd)
5449 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5450}
5451
5452static int init_rootdomain(struct root_domain *rd)
5453{
5454 memset(rd, 0, sizeof(*rd));
5455
5456 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5457 goto out;
5458 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5459 goto free_span;
5460 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5461 goto free_online;
5462
5463 if (cpupri_init(&rd->cpupri) != 0)
5464 goto free_rto_mask;
5465 return 0;
5466
5467free_rto_mask:
5468 free_cpumask_var(rd->rto_mask);
5469free_online:
5470 free_cpumask_var(rd->online);
5471free_span:
5472 free_cpumask_var(rd->span);
5473out:
5474 return -ENOMEM;
5475}
5476
5477/*
5478 * By default the system creates a single root-domain with all cpus as
5479 * members (mimicking the global state we have today).
5480 */
5481struct root_domain def_root_domain;
5482
5483static void init_defrootdomain(void)
5484{
5485 init_rootdomain(&def_root_domain);
5486
5487 atomic_set(&def_root_domain.refcount, 1);
5488}
5489
5490static struct root_domain *alloc_rootdomain(void)
5491{
5492 struct root_domain *rd;
5493
5494 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5495 if (!rd)
5496 return NULL;
5497
5498 if (init_rootdomain(rd) != 0) {
5499 kfree(rd);
5500 return NULL;
5501 }
5502
5503 return rd;
5504}
5505
5506static void free_sched_groups(struct sched_group *sg, int free_sgp)
5507{
5508 struct sched_group *tmp, *first;
5509
5510 if (!sg)
5511 return;
5512
5513 first = sg;
5514 do {
5515 tmp = sg->next;
5516
5517 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5518 kfree(sg->sgp);
5519
5520 kfree(sg);
5521 sg = tmp;
5522 } while (sg != first);
5523}
5524
5525static void free_sched_domain(struct rcu_head *rcu)
5526{
5527 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5528
5529 /*
5530 * If its an overlapping domain it has private groups, iterate and
5531 * nuke them all.
5532 */
5533 if (sd->flags & SD_OVERLAP) {
5534 free_sched_groups(sd->groups, 1);
5535 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5536 kfree(sd->groups->sgp);
5537 kfree(sd->groups);
5538 }
5539 kfree(sd);
5540}
5541
5542static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5543{
5544 call_rcu(&sd->rcu, free_sched_domain);
5545}
5546
5547static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5548{
5549 for (; sd; sd = sd->parent)
5550 destroy_sched_domain(sd, cpu);
5551}
5552
5553/*
5554 * Keep a special pointer to the highest sched_domain that has
5555 * SD_SHARE_PKG_RESOURCES set (the Last Level Cache Domain); this
5556 * allows us to avoid some pointer chasing in select_idle_sibling().
5557 *
5558 * Also keep a unique ID per domain (we use the first cpu number in
5559 * the cpumask of the domain), this allows us to quickly tell if
5560 * two cpus are in the same cache domain, see cpus_share_cache().
5561 */
5562DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5563DEFINE_PER_CPU(int, sd_llc_id);
5564
5565static void update_top_cache_domain(int cpu)
5566{
5567 struct sched_domain *sd;
5568 int id = cpu;
5569
5570 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5571 if (sd)
5572 id = cpumask_first(sched_domain_span(sd));
5573
5574 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5575 per_cpu(sd_llc_id, cpu) = id;
5576}
5577
5578/*
5579 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5580 * hold the hotplug lock.
5581 */
5582static void
5583cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5584{
5585 struct rq *rq = cpu_rq(cpu);
5586 struct sched_domain *tmp;
5587
5588 /* Remove the sched domains which do not contribute to scheduling. */
5589 for (tmp = sd; tmp; ) {
5590 struct sched_domain *parent = tmp->parent;
5591 if (!parent)
5592 break;
5593
5594 if (sd_parent_degenerate(tmp, parent)) {
5595 tmp->parent = parent->parent;
5596 if (parent->parent)
5597 parent->parent->child = tmp;
5598 destroy_sched_domain(parent, cpu);
5599 } else
5600 tmp = tmp->parent;
5601 }
5602
5603 if (sd && sd_degenerate(sd)) {
5604 tmp = sd;
5605 sd = sd->parent;
5606 destroy_sched_domain(tmp, cpu);
5607 if (sd)
5608 sd->child = NULL;
5609 }
5610
5611 sched_domain_debug(sd, cpu);
5612
5613 rq_attach_root(rq, rd);
5614 tmp = rq->sd;
5615 rcu_assign_pointer(rq->sd, sd);
5616 destroy_sched_domains(tmp, cpu);
5617
5618 update_top_cache_domain(cpu);
5619}
5620
5621/* cpus with isolated domains */
5622static cpumask_var_t cpu_isolated_map;
5623
5624/* Setup the mask of cpus configured for isolated domains */
5625static int __init isolated_cpu_setup(char *str)
5626{
5627 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5628 cpulist_parse(str, cpu_isolated_map);
5629 return 1;
5630}
5631
5632__setup("isolcpus=", isolated_cpu_setup);
5633
5634static const struct cpumask *cpu_cpu_mask(int cpu)
5635{
5636 return cpumask_of_node(cpu_to_node(cpu));
5637}
5638
5639struct sd_data {
5640 struct sched_domain **__percpu sd;
5641 struct sched_group **__percpu sg;
5642 struct sched_group_power **__percpu sgp;
5643};
5644
5645struct s_data {
5646 struct sched_domain ** __percpu sd;
5647 struct root_domain *rd;
5648};
5649
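/*
 * Stages reached by the incremental allocation in
 * __visit_domain_allocation_hell(); __free_domain_allocs() unwinds from
 * whichever stage was reported.
 */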
5650enum s_alloc {
5651 sa_rootdomain,
5652 sa_sd,
5653 sa_sd_storage,
5654 sa_none,
5655};
5656
5657struct sched_domain_topology_level;
5658
5659typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5660typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5661
5662#define SDTL_OVERLAP 0x01
5663
5664struct sched_domain_topology_level {
5665 sched_domain_init_f init;
5666 sched_domain_mask_f mask;
5667 int flags;
5668 int numa_level;
5669 struct sd_data data;
5670};
5671
5672/*
5673 * Build an iteration mask that can exclude certain CPUs from the upwards
5674 * domain traversal.
5675 *
5676 * Asymmetric node setups can result in situations where the domain tree is of
5677 * unequal depth; make sure to skip domains that already cover the entire
5678 * range.
5679 *
5680 * In that case build_sched_domains() will have terminated the iteration early
5681 * and our sibling sd spans will be empty. Domains should always include the
5682 * cpu they're built on, so check that.
5683 *
5684 */
5685static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5686{
5687 const struct cpumask *span = sched_domain_span(sd);
5688 struct sd_data *sdd = sd->private;
5689 struct sched_domain *sibling;
5690 int i;
5691
5692 for_each_cpu(i, span) {
5693 sibling = *per_cpu_ptr(sdd->sd, i);
5694 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5695 continue;
5696
5697 cpumask_set_cpu(i, sched_group_mask(sg));
5698 }
5699}
5700
5701/*
5702 * Return the canonical balance cpu for this group; this is the first cpu
5703 * of this group that's also in the iteration mask.
5704 */
5705int group_balance_cpu(struct sched_group *sg)
5706{
5707 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5708}
5709
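/*
 * Build the sched groups for an SD_OVERLAP domain: one group per
 * not-yet-covered child span, linked into a circular list; sd->groups is
 * set to the group containing the canonical balance cpu so the domain
 * iteration starts there.
 */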
5710static int
5711build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5712{
5713 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5714 const struct cpumask *span = sched_domain_span(sd);
5715 struct cpumask *covered = sched_domains_tmpmask;
5716 struct sd_data *sdd = sd->private;
5717 struct sched_domain *child;
5718 int i;
5719
5720 cpumask_clear(covered);
5721
5722 for_each_cpu(i, span) {
5723 struct cpumask *sg_span;
5724
5725 if (cpumask_test_cpu(i, covered))
5726 continue;
5727
5728 child = *per_cpu_ptr(sdd->sd, i);
5729
5730 /* See the comment near build_group_mask(). */
5731 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5732 continue;
5733
5734 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5735 GFP_KERNEL, cpu_to_node(cpu));
5736
5737 if (!sg)
5738 goto fail;
5739
5740 sg_span = sched_group_cpus(sg);
5741 if (child->child) {
5742 child = child->child;
5743 cpumask_copy(sg_span, sched_domain_span(child));
5744 } else
5745 cpumask_set_cpu(i, sg_span);
5746
5747 cpumask_or(covered, covered, sg_span);
5748
5749 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5750 if (atomic_inc_return(&sg->sgp->ref) == 1)
5751 build_group_mask(sd, sg);
5752
5753 /*
5754 * Initialize sgp->power such that even if we mess up the
5755 * domains and no possible iteration will get us here, we won't
5756 * die on a /0 trap.
5757 */
5758 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5759
5760 /*
5761 * Make sure the first group of this domain contains the
5762 * canonical balance cpu. Otherwise the sched_domain iteration
5763 * breaks. See update_sg_lb_stats().
5764 */
5765 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5766 group_balance_cpu(sg) == cpu)
5767 groups = sg;
5768
5769 if (!first)
5770 first = sg;
5771 if (last)
5772 last->next = sg;
5773 last = sg;
5774 last->next = first;
5775 }
5776 sd->groups = groups;
5777
5778 return 0;
5779
5780fail:
5781 free_sched_groups(first, 0);
5782
5783 return -ENOMEM;
5784}
5785
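/*
 * Return the representative cpu of the group covering @cpu at this level
 * and, when @sg is non-NULL, hand back that group with its power
 * structure attached.
 */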
5786static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5787{
5788 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5789 struct sched_domain *child = sd->child;
5790
5791 if (child)
5792 cpu = cpumask_first(sched_domain_span(child));
5793
5794 if (sg) {
5795 *sg = *per_cpu_ptr(sdd->sg, cpu);
5796 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5797 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
5798 }
5799
5800 return cpu;
5801}
5802
5803/*
5804 * build_sched_groups will build a circular linked list of the groups
5805 * covered by the given span, will set each group's ->cpumask correctly,
5806 * and will initialize each group's ->cpu_power to 0.
5807 *
5808 * Assumes the sched_domain tree is fully constructed.
5809 */
5810static int
5811build_sched_groups(struct sched_domain *sd, int cpu)
5812{
5813 struct sched_group *first = NULL, *last = NULL;
5814 struct sd_data *sdd = sd->private;
5815 const struct cpumask *span = sched_domain_span(sd);
5816 struct cpumask *covered;
5817 int i;
5818
5819 get_group(cpu, sdd, &sd->groups);
5820 atomic_inc(&sd->groups->ref);
5821
5822 if (cpu != cpumask_first(sched_domain_span(sd)))
5823 return 0;
5824
5825 lockdep_assert_held(&sched_domains_mutex);
5826 covered = sched_domains_tmpmask;
5827
5828 cpumask_clear(covered);
5829
5830 for_each_cpu(i, span) {
5831 struct sched_group *sg;
5832 int group = get_group(i, sdd, &sg);
5833 int j;
5834
5835 if (cpumask_test_cpu(i, covered))
5836 continue;
5837
5838 cpumask_clear(sched_group_cpus(sg));
5839 sg->sgp->power = 0;
5840 cpumask_setall(sched_group_mask(sg));
5841
5842 for_each_cpu(j, span) {
5843 if (get_group(j, sdd, NULL) != group)
5844 continue;
5845
5846 cpumask_set_cpu(j, covered);
5847 cpumask_set_cpu(j, sched_group_cpus(sg));
5848 }
5849
5850 if (!first)
5851 first = sg;
5852 if (last)
5853 last->next = sg;
5854 last = sg;
5855 }
5856 last->next = first;
5857
5858 return 0;
5859}
5860
5861/*
5862 * Initialize sched groups cpu_power.
5863 *
5864 * cpu_power indicates the capacity of a sched group, which is used while
5865 * distributing the load between different sched groups in a sched domain.
5866 * Typically cpu_power for all the groups in a sched domain will be the same
5867 * unless there are asymmetries in the topology. If there are asymmetries, the
5868 * group with more cpu_power will pick up more load than the group with
5869 * less cpu_power.
5870 */
5871static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5872{
5873 struct sched_group *sg = sd->groups;
5874
5875 WARN_ON(!sd || !sg);
5876
5877 do {
5878 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5879 sg = sg->next;
5880 } while (sg != sd->groups);
5881
5882 if (cpu != group_balance_cpu(sg))
5883 return;
5884
5885 update_group_power(sd, cpu);
5886 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5887}
5888
5889int __weak arch_sd_sibling_asym_packing(void)
5890{
5891 return 0*SD_ASYM_PACKING;
5892}
5893
5894/*
5895 * Initializers for sched domains
5896 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5897 */
5898
5899#ifdef CONFIG_SCHED_DEBUG
5900# define SD_INIT_NAME(sd, type) sd->name = #type
5901#else
5902# define SD_INIT_NAME(sd, type) do { } while (0)
5903#endif
5904
5905#define SD_INIT_FUNC(type) \
5906static noinline struct sched_domain * \
5907sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5908{ \
5909 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5910 *sd = SD_##type##_INIT; \
5911 SD_INIT_NAME(sd, type); \
5912 sd->private = &tl->data; \
5913 return sd; \
5914}
5915
5916SD_INIT_FUNC(CPU)
5917#ifdef CONFIG_SCHED_SMT
5918 SD_INIT_FUNC(SIBLING)
5919#endif
5920#ifdef CONFIG_SCHED_MC
5921 SD_INIT_FUNC(MC)
5922#endif
5923#ifdef CONFIG_SCHED_BOOK
5924 SD_INIT_FUNC(BOOK)
5925#endif
5926
5927static int default_relax_domain_level = -1;
5928int sched_domain_level_max;
5929
5930static int __init setup_relax_domain_level(char *str)
5931{
5932 if (kstrtoint(str, 0, &default_relax_domain_level))
5933 pr_warn("Unable to set relax_domain_level\n");
5934
5935 return 1;
5936}
5937__setup("relax_domain_level=", setup_relax_domain_level);
5938
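/*
 * Apply the requested relax_domain_level: domains above the requested
 * level get wake/newidle balancing turned off, the remaining domains get
 * it turned on.
 */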
5939static void set_domain_attribute(struct sched_domain *sd,
5940 struct sched_domain_attr *attr)
5941{
5942 int request;
5943
5944 if (!attr || attr->relax_domain_level < 0) {
5945 if (default_relax_domain_level < 0)
5946 return;
5947 else
5948 request = default_relax_domain_level;
5949 } else
5950 request = attr->relax_domain_level;
5951 if (request < sd->level) {
5952 /* turn off idle balance on this domain */
5953 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5954 } else {
5955 /* turn on idle balance on this domain */
5956 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5957 }
5958}
5959
5960static void __sdt_free(const struct cpumask *cpu_map);
5961static int __sdt_alloc(const struct cpumask *cpu_map);
5962
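/*
 * Undo the allocations done by __visit_domain_allocation_hell(), starting
 * at the stage indicated by @what and falling through to the earlier ones.
 */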
5963static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5964 const struct cpumask *cpu_map)
5965{
5966 switch (what) {
5967 case sa_rootdomain:
5968 if (!atomic_read(&d->rd->refcount))
5969 free_rootdomain(&d->rd->rcu); /* fall through */
5970 case sa_sd:
5971 free_percpu(d->sd); /* fall through */
5972 case sa_sd_storage:
5973 __sdt_free(cpu_map); /* fall through */
5974 case sa_none:
5975 break;
5976 }
5977}
5978
5979static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5980 const struct cpumask *cpu_map)
5981{
5982 memset(d, 0, sizeof(*d));
5983
5984 if (__sdt_alloc(cpu_map))
5985 return sa_sd_storage;
5986 d->sd = alloc_percpu(struct sched_domain *);
5987 if (!d->sd)
5988 return sa_sd_storage;
5989 d->rd = alloc_rootdomain();
5990 if (!d->rd)
5991 return sa_sd;
5992 return sa_rootdomain;
5993}
5994
5995/*
5996 * NULL the sd_data elements we've used to build the sched_domain and
5997 * sched_group structure so that the subsequent __free_domain_allocs()
5998 * will not free the data we're using.
5999 */
6000static void claim_allocations(int cpu, struct sched_domain *sd)
6001{
6002 struct sd_data *sdd = sd->private;
6003
6004 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6005 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6006
6007 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6008 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6009
6010 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6011 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6012}
6013
6014#ifdef CONFIG_SCHED_SMT
6015static const struct cpumask *cpu_smt_mask(int cpu)
6016{
6017 return topology_thread_cpumask(cpu);
6018}
6019#endif
6020
6021/*
6022 * Topology list, bottom-up.
6023 */
6024static struct sched_domain_topology_level default_topology[] = {
6025#ifdef CONFIG_SCHED_SMT
6026 { sd_init_SIBLING, cpu_smt_mask, },
6027#endif
6028#ifdef CONFIG_SCHED_MC
6029 { sd_init_MC, cpu_coregroup_mask, },
6030#endif
6031#ifdef CONFIG_SCHED_BOOK
6032 { sd_init_BOOK, cpu_book_mask, },
6033#endif
6034 { sd_init_CPU, cpu_cpu_mask, },
6035 { NULL, },
6036};
6037
6038static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6039
6040#ifdef CONFIG_NUMA
6041
6042static int sched_domains_numa_levels;
6043static int *sched_domains_numa_distance;
6044static struct cpumask ***sched_domains_numa_masks;
6045static int sched_domains_curr_level;
6046
6047static inline int sd_local_flags(int level)
6048{
6049 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6050 return 0;
6051
6052 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6053}
6054
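/*
 * Initializer for a NUMA topology level: the balancing intervals are
 * scaled by the weight of this level's NUMA mask, and the wake/exec
 * balancing flags come from sd_local_flags().
 */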
6055static struct sched_domain *
6056sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6057{
6058 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6059 int level = tl->numa_level;
6060 int sd_weight = cpumask_weight(
6061 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6062
6063 *sd = (struct sched_domain){
6064 .min_interval = sd_weight,
6065 .max_interval = 2*sd_weight,
6066 .busy_factor = 32,
6067 .imbalance_pct = 125,
6068 .cache_nice_tries = 2,
6069 .busy_idx = 3,
6070 .idle_idx = 2,
6071 .newidle_idx = 0,
6072 .wake_idx = 0,
6073 .forkexec_idx = 0,
6074
6075 .flags = 1*SD_LOAD_BALANCE
6076 | 1*SD_BALANCE_NEWIDLE
6077 | 0*SD_BALANCE_EXEC
6078 | 0*SD_BALANCE_FORK
6079 | 0*SD_BALANCE_WAKE
6080 | 0*SD_WAKE_AFFINE
6081 | 0*SD_SHARE_CPUPOWER
6082 | 0*SD_SHARE_PKG_RESOURCES
6083 | 1*SD_SERIALIZE
6084 | 0*SD_PREFER_SIBLING
6085 | sd_local_flags(level)
6086 ,
6087 .last_balance = jiffies,
6088 .balance_interval = sd_weight,
6089 };
6090 SD_INIT_NAME(sd, NUMA);
6091 sd->private = &tl->data;
6092
6093 /*
6094 * Ugly hack to pass state to sd_numa_mask()...
6095 */
6096 sched_domains_curr_level = tl->numa_level;
6097
6098 return sd;
6099}
6100
6101static const struct cpumask *sd_numa_mask(int cpu)
6102{
6103 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6104}
6105
6106static void sched_numa_warn(const char *str)
6107{
6108 static int done = false;
6109 int i,j;
6110
6111 if (done)
6112 return;
6113
6114 done = true;
6115
6116 printk(KERN_WARNING "ERROR: %s\n\n", str);
6117
6118 for (i = 0; i < nr_node_ids; i++) {
6119 printk(KERN_WARNING " ");
6120 for (j = 0; j < nr_node_ids; j++)
6121 printk(KERN_CONT "%02d ", node_distance(i,j));
6122 printk(KERN_CONT "\n");
6123 }
6124 printk(KERN_WARNING "\n");
6125}
6126
6127static bool find_numa_distance(int distance)
6128{
6129 int i;
6130
6131 if (distance == node_distance(0, 0))
6132 return true;
6133
6134 for (i = 0; i < sched_domains_numa_levels; i++) {
6135 if (sched_domains_numa_distance[i] == distance)
6136 return true;
6137 }
6138
6139 return false;
6140}
6141
6142static void sched_init_numa(void)
6143{
6144 int next_distance, curr_distance = node_distance(0, 0);
6145 struct sched_domain_topology_level *tl;
6146 int level = 0;
6147 int i, j, k;
6148
6149 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6150 if (!sched_domains_numa_distance)
6151 return;
6152
6153 /*
6154 * O(nr_nodes^2) deduplicating selection sort, used to find the
6155 * unique distances in the node_distance() table.
6156 *
6157 * Assumes node_distance(0,j) includes all distances found in
6158 * node_distance(i,j), which avoids cubic time.
6159 */
6160 next_distance = curr_distance;
6161 for (i = 0; i < nr_node_ids; i++) {
6162 for (j = 0; j < nr_node_ids; j++) {
6163 for (k = 0; k < nr_node_ids; k++) {
6164 int distance = node_distance(i, k);
6165
6166 if (distance > curr_distance &&
6167 (distance < next_distance ||
6168 next_distance == curr_distance))
6169 next_distance = distance;
6170
6171 /*
6172 * While not a strong assumption, it would be nice to know
6173 * about cases where node A is connected to B but B is not
6174 * equally connected to A.
6175 */
6176 if (sched_debug() && node_distance(k, i) != distance)
6177 sched_numa_warn("Node-distance not symmetric");
6178
6179 if (sched_debug() && i && !find_numa_distance(distance))
6180 sched_numa_warn("Node-0 not representative");
6181 }
6182 if (next_distance != curr_distance) {
6183 sched_domains_numa_distance[level++] = next_distance;
6184 sched_domains_numa_levels = level;
6185 curr_distance = next_distance;
6186 } else break;
6187 }
6188
6189 /*
6190 * In case of sched_debug() we verify the above assumption.
6191 */
6192 if (!sched_debug())
6193 break;
6194 }
6195 /*
6196 * 'level' contains the number of unique distances, excluding the
6197 * identity distance node_distance(i,i).
6198 *
6199 * The sched_domains_numa_distance[] array includes the actual distance
6200 * numbers.
6201 */
6202
6203 /*
6204 * Here we temporarily reset sched_domains_numa_levels to 0.
6205 * If the allocation of the sched_domains_numa_masks[][] array fails,
6206 * the array will contain fewer than 'level' members. This could be
6207 * dangerous when we use it to iterate over sched_domains_numa_masks[][]
6208 * in other functions.
6209 *
6210 * We reset it to 'level' at the end of this function.
6211 */
6212 sched_domains_numa_levels = 0;
6213
6214 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6215 if (!sched_domains_numa_masks)
6216 return;
6217
6218 /*
6219 * Now for each level, construct a mask per node which contains all
6220 * cpus of nodes that are that many hops away from us.
6221 */
6222 for (i = 0; i < level; i++) {
6223 sched_domains_numa_masks[i] =
6224 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6225 if (!sched_domains_numa_masks[i])
6226 return;
6227
6228 for (j = 0; j < nr_node_ids; j++) {
6229 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6230 if (!mask)
6231 return;
6232
6233 sched_domains_numa_masks[i][j] = mask;
6234
6235 for (k = 0; k < nr_node_ids; k++) {
6236 if (node_distance(j, k) > sched_domains_numa_distance[i])
6237 continue;
6238
6239 cpumask_or(mask, mask, cpumask_of_node(k));
6240 }
6241 }
6242 }
6243
6244 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6245 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6246 if (!tl)
6247 return;
6248
6249 /*
6250 * Copy the default topology bits..
6251 */
6252 for (i = 0; default_topology[i].init; i++)
6253 tl[i] = default_topology[i];
6254
6255 /*
6256 * .. and append 'j' levels of NUMA goodness.
6257 */
6258 for (j = 0; j < level; i++, j++) {
6259 tl[i] = (struct sched_domain_topology_level){
6260 .init = sd_numa_init,
6261 .mask = sd_numa_mask,
6262 .flags = SDTL_OVERLAP,
6263 .numa_level = j,
6264 };
6265 }
6266
6267 sched_domain_topology = tl;
6268
6269 sched_domains_numa_levels = level;
6270}
6271
6272static void sched_domains_numa_masks_set(int cpu)
6273{
6274 int i, j;
6275 int node = cpu_to_node(cpu);
6276
6277 for (i = 0; i < sched_domains_numa_levels; i++) {
6278 for (j = 0; j < nr_node_ids; j++) {
6279 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6280 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6281 }
6282 }
6283}
6284
6285static void sched_domains_numa_masks_clear(int cpu)
6286{
6287 int i, j;
6288 for (i = 0; i < sched_domains_numa_levels; i++) {
6289 for (j = 0; j < nr_node_ids; j++)
6290 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6291 }
6292}
6293
6294/*
6295 * Update sched_domains_numa_masks[level][node] array when new cpus
6296 * are onlined.
6297 */
6298static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6299 unsigned long action,
6300 void *hcpu)
6301{
6302 int cpu = (long)hcpu;
6303
6304 switch (action & ~CPU_TASKS_FROZEN) {
6305 case CPU_ONLINE:
6306 sched_domains_numa_masks_set(cpu);
6307 break;
6308
6309 case CPU_DEAD:
6310 sched_domains_numa_masks_clear(cpu);
6311 break;
6312
6313 default:
6314 return NOTIFY_DONE;
6315 }
6316
6317 return NOTIFY_OK;
6318}
6319#else
6320static inline void sched_init_numa(void)
6321{
6322}
6323
6324static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6325 unsigned long action,
6326 void *hcpu)
6327{
6328 return 0;
6329}
6330#endif /* CONFIG_NUMA */
6331
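/*
 * Allocate the per-cpu sched_domain, sched_group and sched_group_power
 * storage for every topology level; __sdt_free() releases it again.
 */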
6332static int __sdt_alloc(const struct cpumask *cpu_map)
6333{
6334 struct sched_domain_topology_level *tl;
6335 int j;
6336
6337 for (tl = sched_domain_topology; tl->init; tl++) {
6338 struct sd_data *sdd = &tl->data;
6339
6340 sdd->sd = alloc_percpu(struct sched_domain *);
6341 if (!sdd->sd)
6342 return -ENOMEM;
6343
6344 sdd->sg = alloc_percpu(struct sched_group *);
6345 if (!sdd->sg)
6346 return -ENOMEM;
6347
6348 sdd->sgp = alloc_percpu(struct sched_group_power *);
6349 if (!sdd->sgp)
6350 return -ENOMEM;
6351
6352 for_each_cpu(j, cpu_map) {
6353 struct sched_domain *sd;
6354 struct sched_group *sg;
6355 struct sched_group_power *sgp;
6356
6357 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6358 GFP_KERNEL, cpu_to_node(j));
6359 if (!sd)
6360 return -ENOMEM;
6361
6362 *per_cpu_ptr(sdd->sd, j) = sd;
6363
6364 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6365 GFP_KERNEL, cpu_to_node(j));
6366 if (!sg)
6367 return -ENOMEM;
6368
6369 sg->next = sg;
6370
6371 *per_cpu_ptr(sdd->sg, j) = sg;
6372
6373 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6374 GFP_KERNEL, cpu_to_node(j));
6375 if (!sgp)
6376 return -ENOMEM;
6377
6378 *per_cpu_ptr(sdd->sgp, j) = sgp;
6379 }
6380 }
6381
6382 return 0;
6383}
6384
6385static void __sdt_free(const struct cpumask *cpu_map)
6386{
6387 struct sched_domain_topology_level *tl;
6388 int j;
6389
6390 for (tl = sched_domain_topology; tl->init; tl++) {
6391 struct sd_data *sdd = &tl->data;
6392
6393 for_each_cpu(j, cpu_map) {
6394 struct sched_domain *sd;
6395
6396 if (sdd->sd) {
6397 sd = *per_cpu_ptr(sdd->sd, j);
6398 if (sd && (sd->flags & SD_OVERLAP))
6399 free_sched_groups(sd->groups, 0);
6400 kfree(*per_cpu_ptr(sdd->sd, j));
6401 }
6402
6403 if (sdd->sg)
6404 kfree(*per_cpu_ptr(sdd->sg, j));
6405 if (sdd->sgp)
6406 kfree(*per_cpu_ptr(sdd->sgp, j));
6407 }
6408 free_percpu(sdd->sd);
6409 sdd->sd = NULL;
6410 free_percpu(sdd->sg);
6411 sdd->sg = NULL;
6412 free_percpu(sdd->sgp);
6413 sdd->sgp = NULL;
6414 }
6415}
6416
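/*
 * Create the sched_domain for topology level @tl on @cpu, span it with
 * the cpus of @cpu_map that fall within tl->mask(cpu), and link it above
 * @child in the domain hierarchy.
 */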
6417struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6418 struct s_data *d, const struct cpumask *cpu_map,
6419 struct sched_domain_attr *attr, struct sched_domain *child,
6420 int cpu)
6421{
6422 struct sched_domain *sd = tl->init(tl, cpu);
6423 if (!sd)
6424 return child;
6425
6426 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6427 if (child) {
6428 sd->level = child->level + 1;
6429 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6430 child->parent = sd;
6431 }
6432 sd->child = child;
6433 set_domain_attribute(sd, attr);
6434
6435 return sd;
6436}
6437
6438/*
6439 * Build sched domains for a given set of cpus and attach the sched domains
6440 * to the individual cpus
6441 */
6442static int build_sched_domains(const struct cpumask *cpu_map,
6443 struct sched_domain_attr *attr)
6444{
6445 enum s_alloc alloc_state = sa_none;
6446 struct sched_domain *sd;
6447 struct s_data d;
6448 int i, ret = -ENOMEM;
6449
6450 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6451 if (alloc_state != sa_rootdomain)
6452 goto error;
6453
6454 /* Set up domains for cpus specified by the cpu_map. */
6455 for_each_cpu(i, cpu_map) {
6456 struct sched_domain_topology_level *tl;
6457
6458 sd = NULL;
6459 for (tl = sched_domain_topology; tl->init; tl++) {
6460 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6461 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6462 sd->flags |= SD_OVERLAP;
6463 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6464 break;
6465 }
6466
6467 while (sd->child)
6468 sd = sd->child;
6469
6470 *per_cpu_ptr(d.sd, i) = sd;
6471 }
6472
6473 /* Build the groups for the domains */
6474 for_each_cpu(i, cpu_map) {
6475 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6476 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6477 if (sd->flags & SD_OVERLAP) {
6478 if (build_overlap_sched_groups(sd, i))
6479 goto error;
6480 } else {
6481 if (build_sched_groups(sd, i))
6482 goto error;
6483 }
6484 }
6485 }
6486
6487 /* Calculate CPU power for physical packages and nodes */
6488 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6489 if (!cpumask_test_cpu(i, cpu_map))
6490 continue;
6491
6492 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6493 claim_allocations(i, sd);
6494 init_sched_groups_power(i, sd);
6495 }
6496 }
6497
6498 /* Attach the domains */
6499 rcu_read_lock();
6500 for_each_cpu(i, cpu_map) {
6501 sd = *per_cpu_ptr(d.sd, i);
6502 cpu_attach_domain(sd, d.rd, i);
6503 }
6504 rcu_read_unlock();
6505
6506 ret = 0;
6507error:
6508 __free_domain_allocs(&d, alloc_state, cpu_map);
6509 return ret;
6510}
6511
6512static cpumask_var_t *doms_cur; /* current sched domains */
6513static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6514static struct sched_domain_attr *dattr_cur;
6515 /* attributes of custom domains in 'doms_cur' */
6516
6517/*
6518 * Special case: If a kmalloc of a doms_cur partition (array of
6519 * cpumask) fails, then fall back to a single sched domain,
6520 * as determined by the single cpumask fallback_doms.
6521 */
6522static cpumask_var_t fallback_doms;
6523
6524/*
6525 * arch_update_cpu_topology lets virtualized architectures update the
6526 * cpu core maps. It is supposed to return 1 if the topology changed
6527 * or 0 if it stayed the same.
6528 */
6529int __attribute__((weak)) arch_update_cpu_topology(void)
6530{
6531 return 0;
6532}
6533
6534cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6535{
6536 int i;
6537 cpumask_var_t *doms;
6538
6539 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6540 if (!doms)
6541 return NULL;
6542 for (i = 0; i < ndoms; i++) {
6543 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6544 free_sched_domains(doms, i);
6545 return NULL;
6546 }
6547 }
6548 return doms;
6549}
6550
6551void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6552{
6553 unsigned int i;
6554 for (i = 0; i < ndoms; i++)
6555 free_cpumask_var(doms[i]);
6556 kfree(doms);
6557}
6558
6559/*
6560 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6561 * For now this just excludes isolated cpus, but could be used to
6562 * exclude other special cases in the future.
6563 */
6564static int init_sched_domains(const struct cpumask *cpu_map)
6565{
6566 int err;
6567
6568 arch_update_cpu_topology();
6569 ndoms_cur = 1;
6570 doms_cur = alloc_sched_domains(ndoms_cur);
6571 if (!doms_cur)
6572 doms_cur = &fallback_doms;
6573 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6574 err = build_sched_domains(doms_cur[0], NULL);
6575 register_sched_domain_sysctl();
6576
6577 return err;
6578}
6579
6580/*
6581 * Detach sched domains from a group of cpus specified in cpu_map.
6582 * These cpus will now be attached to the NULL domain.
6583 */
6584static void detach_destroy_domains(const struct cpumask *cpu_map)
6585{
6586 int i;
6587
6588 rcu_read_lock();
6589 for_each_cpu(i, cpu_map)
6590 cpu_attach_domain(NULL, &def_root_domain, i);
6591 rcu_read_unlock();
6592}
6593
6594/* handle null as "default" */
6595static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6596 struct sched_domain_attr *new, int idx_new)
6597{
6598 struct sched_domain_attr tmp;
6599
6600 /* fast path */
6601 if (!new && !cur)
6602 return 1;
6603
6604 tmp = SD_ATTR_INIT;
6605 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6606 new ? (new + idx_new) : &tmp,
6607 sizeof(struct sched_domain_attr));
6608}
6609
6610/*
6611 * Partition sched domains as specified by the 'ndoms_new'
6612 * cpumasks in the array doms_new[] of cpumasks. This compares
6613 * doms_new[] to the current sched domain partitioning, doms_cur[].
6614 * It destroys each deleted domain and builds each new domain.
6615 *
6616 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6617 * The masks don't intersect (don't overlap); we set up one
6618 * sched domain for each mask. CPUs not in any of the cpumasks will
6619 * not be load balanced. If the same cpumask appears both in the
6620 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6621 * it as it is.
6622 *
6623 * The passed-in 'doms_new' should be allocated using
6624 * alloc_sched_domains(). This routine takes ownership of it and will
6625 * free_sched_domains() it when done with it. If the caller failed the
6626 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6627 * and partition_sched_domains() will fall back to the single partition
6628 * 'fallback_doms'; this also forces the domains to be rebuilt.
6629 *
6630 * If doms_new == NULL it will be replaced with cpu_online_mask.
6631 * ndoms_new == 0 is a special case for destroying existing domains,
6632 * and it will not create the default domain.
6633 *
6634 * Call with hotplug lock held
6635 */
6636void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6637 struct sched_domain_attr *dattr_new)
6638{
6639 int i, j, n;
6640 int new_topology;
6641
6642 mutex_lock(&sched_domains_mutex);
6643
6644 /* always unregister in case we don't destroy any domains */
6645 unregister_sched_domain_sysctl();
6646
6647 /* Let architecture update cpu core mappings. */
6648 new_topology = arch_update_cpu_topology();
6649
6650 n = doms_new ? ndoms_new : 0;
6651
6652 /* Destroy deleted domains */
6653 for (i = 0; i < ndoms_cur; i++) {
6654 for (j = 0; j < n && !new_topology; j++) {
6655 if (cpumask_equal(doms_cur[i], doms_new[j])
6656 && dattrs_equal(dattr_cur, i, dattr_new, j))
6657 goto match1;
6658 }
6659 /* no match - a current sched domain not in new doms_new[] */
6660 detach_destroy_domains(doms_cur[i]);
6661match1:
6662 ;
6663 }
6664
6665 if (doms_new == NULL) {
6666 ndoms_cur = 0;
6667 doms_new = &fallback_doms;
6668 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6669 WARN_ON_ONCE(dattr_new);
6670 }
6671
6672 /* Build new domains */
6673 for (i = 0; i < ndoms_new; i++) {
6674 for (j = 0; j < ndoms_cur && !new_topology; j++) {
6675 if (cpumask_equal(doms_new[i], doms_cur[j])
6676 && dattrs_equal(dattr_new, i, dattr_cur, j))
6677 goto match2;
6678 }
6679 /* no match - add a new doms_new */
6680 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6681match2:
6682 ;
6683 }
6684
6685 /* Remember the new sched domains */
6686 if (doms_cur != &fallback_doms)
6687 free_sched_domains(doms_cur, ndoms_cur);
6688 kfree(dattr_cur); /* kfree(NULL) is safe */
6689 doms_cur = doms_new;
6690 dattr_cur = dattr_new;
6691 ndoms_cur = ndoms_new;
6692
6693 register_sched_domain_sysctl();
6694
6695 mutex_unlock(&sched_domains_mutex);
6696}
6697
6698static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
6699
6700/*
6701 * Update cpusets according to cpu_active mask. If cpusets are
6702 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6703 * around partition_sched_domains().
6704 *
6705 * If we come here as part of a suspend/resume, don't touch cpusets because we
6706 * want to restore them to their original state upon resume anyway.
6707 */
6708static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6709 void *hcpu)
6710{
6711 switch (action) {
6712 case CPU_ONLINE_FROZEN:
6713 case CPU_DOWN_FAILED_FROZEN:
6714
6715 /*
6716 * num_cpus_frozen tracks how many CPUs are involved in the
6717 * suspend/resume sequence. As long as this is not the last online
6718 * operation in the resume sequence, just build a single sched
6719 * domain, ignoring cpusets.
6720 */
6721 num_cpus_frozen--;
6722 if (likely(num_cpus_frozen)) {
6723 partition_sched_domains(1, NULL, NULL);
6724 break;
6725 }
6726
6727 /*
6728 * This is the last CPU online operation. So fall through and
6729 * restore the original sched domains by considering the
6730 * cpuset configurations.
6731 */
6732
6733 case CPU_ONLINE:
6734 case CPU_DOWN_FAILED:
6735 cpuset_update_active_cpus(true);
6736 break;
6737 default:
6738 return NOTIFY_DONE;
6739 }
6740 return NOTIFY_OK;
6741}
6742
6743static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6744 void *hcpu)
6745{
6746 switch (action) {
6747 case CPU_DOWN_PREPARE:
6748 cpuset_update_active_cpus(false);
6749 break;
6750 case CPU_DOWN_PREPARE_FROZEN:
6751 num_cpus_frozen++;
6752 partition_sched_domains(1, NULL, NULL);
6753 break;
6754 default:
6755 return NOTIFY_DONE;
6756 }
6757 return NOTIFY_OK;
6758}
6759
6760void __init sched_init_smp(void)
6761{
6762 cpumask_var_t non_isolated_cpus;
6763
6764 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6765 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6766
6767 sched_init_numa();
6768
6769 get_online_cpus();
6770 mutex_lock(&sched_domains_mutex);
6771 init_sched_domains(cpu_active_mask);
6772 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6773 if (cpumask_empty(non_isolated_cpus))
6774 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6775 mutex_unlock(&sched_domains_mutex);
6776 put_online_cpus();
6777
6778 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6779 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6780 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6781
6782 /* RT runtime code needs to handle some hotplug events */
6783 hotcpu_notifier(update_runtime, 0);
6784
6785 init_hrtick();
6786
6787 /* Move init over to a non-isolated CPU */
6788 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6789 BUG();
6790 sched_init_granularity();
6791 free_cpumask_var(non_isolated_cpus);
6792
6793 init_sched_rt_class();
6794}
6795#else
6796void __init sched_init_smp(void)
6797{
6798 sched_init_granularity();
6799}
6800#endif /* CONFIG_SMP */
6801
6802const_debug unsigned int sysctl_timer_migration = 1;
6803
6804int in_sched_functions(unsigned long addr)
6805{
6806 return in_lock_functions(addr) ||
6807 (addr >= (unsigned long)__sched_text_start
6808 && addr < (unsigned long)__sched_text_end);
6809}
6810
6811#ifdef CONFIG_CGROUP_SCHED
6812struct task_group root_task_group;
6813LIST_HEAD(task_groups);
6814#endif
6815
6816DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6817
6818void __init sched_init(void)
6819{
6820 int i, j;
6821 unsigned long alloc_size = 0, ptr;
6822
6823#ifdef CONFIG_FAIR_GROUP_SCHED
6824 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6825#endif
6826#ifdef CONFIG_RT_GROUP_SCHED
6827 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6828#endif
6829#ifdef CONFIG_CPUMASK_OFFSTACK
6830 alloc_size += num_possible_cpus() * cpumask_size();
6831#endif
6832 if (alloc_size) {
6833 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6834
6835#ifdef CONFIG_FAIR_GROUP_SCHED
6836 root_task_group.se = (struct sched_entity **)ptr;
6837 ptr += nr_cpu_ids * sizeof(void **);
6838
6839 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6840 ptr += nr_cpu_ids * sizeof(void **);
6841
6842#endif /* CONFIG_FAIR_GROUP_SCHED */
6843#ifdef CONFIG_RT_GROUP_SCHED
6844 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6845 ptr += nr_cpu_ids * sizeof(void **);
6846
6847 root_task_group.rt_rq = (struct rt_rq **)ptr;
6848 ptr += nr_cpu_ids * sizeof(void **);
6849
6850#endif /* CONFIG_RT_GROUP_SCHED */
6851#ifdef CONFIG_CPUMASK_OFFSTACK
6852 for_each_possible_cpu(i) {
6853 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6854 ptr += cpumask_size();
6855 }
6856#endif /* CONFIG_CPUMASK_OFFSTACK */
6857 }
6858
6859#ifdef CONFIG_SMP
6860 init_defrootdomain();
6861#endif
6862
6863 init_rt_bandwidth(&def_rt_bandwidth,
6864 global_rt_period(), global_rt_runtime());
6865
6866#ifdef CONFIG_RT_GROUP_SCHED
6867 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6868 global_rt_period(), global_rt_runtime());
6869#endif /* CONFIG_RT_GROUP_SCHED */
6870
6871#ifdef CONFIG_CGROUP_SCHED
6872 list_add(&root_task_group.list, &task_groups);
6873 INIT_LIST_HEAD(&root_task_group.children);
6874 INIT_LIST_HEAD(&root_task_group.siblings);
6875 autogroup_init(&init_task);
6876
6877#endif /* CONFIG_CGROUP_SCHED */
6878
6879#ifdef CONFIG_CGROUP_CPUACCT
6880 root_cpuacct.cpustat = &kernel_cpustat;
6881 root_cpuacct.cpuusage = alloc_percpu(u64);
6882 /* Too early, not expected to fail */
6883 BUG_ON(!root_cpuacct.cpuusage);
6884#endif
6885 for_each_possible_cpu(i) {
6886 struct rq *rq;
6887
6888 rq = cpu_rq(i);
6889 raw_spin_lock_init(&rq->lock);
6890 rq->nr_running = 0;
6891 rq->calc_load_active = 0;
6892 rq->calc_load_update = jiffies + LOAD_FREQ;
6893 init_cfs_rq(&rq->cfs);
6894 init_rt_rq(&rq->rt, rq);
6895#ifdef CONFIG_FAIR_GROUP_SCHED
6896 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6897 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6898 /*
6899 * How much cpu bandwidth does root_task_group get?
6900 *
6901 * In case of task-groups formed through the cgroup filesystem, it
6902 * gets 100% of the cpu resources in the system. This overall
6903 * system cpu resource is divided among the tasks of
6904 * root_task_group and its child task-groups in a fair manner,
6905 * based on each entity's (task or task-group's) weight
6906 * (se->load.weight).
6907 *
6908 * In other words, if root_task_group has 10 tasks of weight
6909 * 1024 and two child groups A0 and A1 (of weight 1024 each),
6910 * then A0's share of the cpu resource is:
6911 *
6912 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6913 *
6914 * We achieve this by letting root_task_group's tasks sit
6915 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6916 */
6917 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6918 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6919#endif /* CONFIG_FAIR_GROUP_SCHED */
6920
6921 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6922#ifdef CONFIG_RT_GROUP_SCHED
6923 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6924 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6925#endif
6926
6927 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6928 rq->cpu_load[j] = 0;
6929
6930 rq->last_load_update_tick = jiffies;
6931
6932#ifdef CONFIG_SMP
6933 rq->sd = NULL;
6934 rq->rd = NULL;
6935 rq->cpu_power = SCHED_POWER_SCALE;
6936 rq->post_schedule = 0;
6937 rq->active_balance = 0;
6938 rq->next_balance = jiffies;
6939 rq->push_cpu = 0;
6940 rq->cpu = i;
6941 rq->online = 0;
6942 rq->idle_stamp = 0;
6943 rq->avg_idle = 2*sysctl_sched_migration_cost;
6944
6945 INIT_LIST_HEAD(&rq->cfs_tasks);
6946
6947 rq_attach_root(rq, &def_root_domain);
6948#ifdef CONFIG_NO_HZ
6949 rq->nohz_flags = 0;
6950#endif
6951#endif
6952 init_rq_hrtick(rq);
6953 atomic_set(&rq->nr_iowait, 0);
6954 }
6955
6956 set_load_weight(&init_task);
6957
6958#ifdef CONFIG_PREEMPT_NOTIFIERS
6959 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6960#endif
6961
6962#ifdef CONFIG_RT_MUTEXES
6963 plist_head_init(&init_task.pi_waiters);
6964#endif
6965
6966 /*
6967 * The boot idle thread does lazy MMU switching as well:
6968 */
6969 atomic_inc(&init_mm.mm_count);
6970 enter_lazy_tlb(&init_mm, current);
6971
6972 /*
6973 * Make us the idle thread. Technically, schedule() should not be
6974 * called from this thread; however, somewhere below it might be.
6975 * Because we are the idle thread, we just pick up running again
6976 * when this runqueue becomes "idle".
6977 */
6978 init_idle(current, smp_processor_id());
6979
6980 calc_load_update = jiffies + LOAD_FREQ;
6981
6982 /*
6983 * During early bootup we pretend to be a normal task:
6984 */
6985 current->sched_class = &fair_sched_class;
6986
6987#ifdef CONFIG_SMP
6988 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6989 /* May be allocated at isolcpus cmdline parse time */
6990 if (cpu_isolated_map == NULL)
6991 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6992 idle_thread_set_boot_cpu();
6993#endif
6994 init_sched_fair_class();
6995
6996 scheduler_running = 1;
6997}
6998
6999#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7000static inline int preempt_count_equals(int preempt_offset)
7001{
7002 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7003
7004 return (nested == preempt_offset);
7005}
7006
7007void __might_sleep(const char *file, int line, int preempt_offset)
7008{
7009 static unsigned long prev_jiffy; /* ratelimiting */
7010
7011 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7012 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7013 system_state != SYSTEM_RUNNING || oops_in_progress)
7014 return;
7015 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7016 return;
7017 prev_jiffy = jiffies;
7018
7019 printk(KERN_ERR
7020 "BUG: sleeping function called from invalid context at %s:%d\n",
7021 file, line);
7022 printk(KERN_ERR
7023 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7024 in_atomic(), irqs_disabled(),
7025 current->pid, current->comm);
7026
7027 debug_show_held_locks(current);
7028 if (irqs_disabled())
7029 print_irqtrace_events(current);
7030 dump_stack();
7031}
7032EXPORT_SYMBOL(__might_sleep);
7033#endif
7034
7035#ifdef CONFIG_MAGIC_SYSRQ
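/*
 * Reset @p to SCHED_NORMAL, dequeueing and re-enqueueing it if it was on
 * a runqueue so the change takes effect immediately.
 */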
7036static void normalize_task(struct rq *rq, struct task_struct *p)
7037{
7038 const struct sched_class *prev_class = p->sched_class;
7039 int old_prio = p->prio;
7040 int on_rq;
7041
7042 on_rq = p->on_rq;
7043 if (on_rq)
7044 dequeue_task(rq, p, 0);
7045 __setscheduler(rq, p, SCHED_NORMAL, 0);
7046 if (on_rq) {
7047 enqueue_task(rq, p, 0);
7048 resched_task(rq->curr);
7049 }
7050
7051 check_class_changed(rq, p, prev_class, old_prio);
7052}
7053
7054void normalize_rt_tasks(void)
7055{
7056 struct task_struct *g, *p;
7057 unsigned long flags;
7058 struct rq *rq;
7059
7060 read_lock_irqsave(&tasklist_lock, flags);
7061 do_each_thread(g, p) {
7062 /*
7063 * Only normalize user tasks:
7064 */
7065 if (!p->mm)
7066 continue;
7067
7068 p->se.exec_start = 0;
7069#ifdef CONFIG_SCHEDSTATS
7070 p->se.statistics.wait_start = 0;
7071 p->se.statistics.sleep_start = 0;
7072 p->se.statistics.block_start = 0;
7073#endif
7074
7075 if (!rt_task(p)) {
7076 /*
7077 * Renice negative nice level userspace
7078 * tasks back to 0:
7079 */
7080 if (TASK_NICE(p) < 0 && p->mm)
7081 set_user_nice(p, 0);
7082 continue;
7083 }
7084
7085 raw_spin_lock(&p->pi_lock);
7086 rq = __task_rq_lock(p);
7087
7088 normalize_task(rq, p);
7089
7090 __task_rq_unlock(rq);
7091 raw_spin_unlock(&p->pi_lock);
7092 } while_each_thread(g, p);
7093
7094 read_unlock_irqrestore(&tasklist_lock, flags);
7095}
7096
7097#endif /* CONFIG_MAGIC_SYSRQ */
7098
7099#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7100/*
7101 * These functions are only useful for the IA64 MCA handling, or kdb.
7102 *
7103 * They can only be called when the whole system has been
7104 * stopped - every CPU needs to be quiescent, and no scheduling
7105 * activity can take place. Using them for anything else would
7106 * be a serious bug, and as a result, they aren't even visible
7107 * under any other configuration.
7108 */
7109
7110/**
7111 * curr_task - return the current task for a given cpu.
7112 * @cpu: the processor in question.
7113 *
7114 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7115 */
7116struct task_struct *curr_task(int cpu)
7117{
7118 return cpu_curr(cpu);
7119}
7120
7121#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7122
7123#ifdef CONFIG_IA64
7124/**
7125 * set_curr_task - set the current task for a given cpu.
7126 * @cpu: the processor in question.
7127 * @p: the task pointer to set.
7128 *
7129 * Description: This function must only be used when non-maskable interrupts
7130 * are serviced on a separate stack. It allows the architecture to switch the
7131 * notion of the current task on a cpu in a non-blocking manner. This function
7132 * must be called with all CPUs synchronized and interrupts disabled, and
7133 * the caller must save the original value of the current task (see
7134 * curr_task() above) and restore that value before reenabling interrupts and
7135 * re-starting the system.
7136 *
7137 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7138 */
7139void set_curr_task(int cpu, struct task_struct *p)
7140{
7141 cpu_curr(cpu) = p;
7142}
7143
7144#endif
7145
7146#ifdef CONFIG_CGROUP_SCHED
7147/* task_group_lock serializes the addition/removal of task groups */
7148static DEFINE_SPINLOCK(task_group_lock);
7149
7150static void free_sched_group(struct task_group *tg)
7151{
7152 free_fair_sched_group(tg);
7153 free_rt_sched_group(tg);
7154 autogroup_free(tg);
7155 kfree(tg);
7156}
7157
7158/* allocate runqueue etc for a new task group */
7159struct task_group *sched_create_group(struct task_group *parent)
7160{
7161 struct task_group *tg;
7162 unsigned long flags;
7163
7164 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7165 if (!tg)
7166 return ERR_PTR(-ENOMEM);
7167
7168 if (!alloc_fair_sched_group(tg, parent))
7169 goto err;
7170
7171 if (!alloc_rt_sched_group(tg, parent))
7172 goto err;
7173
7174 spin_lock_irqsave(&task_group_lock, flags);
7175 list_add_rcu(&tg->list, &task_groups);
7176
7177 WARN_ON(!parent); /* root should already exist */
7178
7179 tg->parent = parent;
7180 INIT_LIST_HEAD(&tg->children);
7181 list_add_rcu(&tg->siblings, &parent->children);
7182 spin_unlock_irqrestore(&task_group_lock, flags);
7183
7184 return tg;
7185
7186err:
7187 free_sched_group(tg);
7188 return ERR_PTR(-ENOMEM);
7189}
7190
7191/* rcu callback to free various structures associated with a task group */
7192static void free_sched_group_rcu(struct rcu_head *rhp)
7193{
7194 /* now it should be safe to free those cfs_rqs */
7195 free_sched_group(container_of(rhp, struct task_group, rcu));
7196}
7197
7198/* Destroy runqueue etc associated with a task group */
7199void sched_destroy_group(struct task_group *tg)
7200{
7201 unsigned long flags;
7202 int i;
7203
7204 /* end participation in shares distribution */
7205 for_each_possible_cpu(i)
7206 unregister_fair_sched_group(tg, i);
7207
7208 spin_lock_irqsave(&task_group_lock, flags);
7209 list_del_rcu(&tg->list);
7210 list_del_rcu(&tg->siblings);
7211 spin_unlock_irqrestore(&task_group_lock, flags);
7212
7213 /* wait for possible concurrent references to cfs_rqs to complete */
7214 call_rcu(&tg->rcu, free_sched_group_rcu);
7215}
7216
7217/* Change a task's runqueue when it moves between groups.
7218 * The caller of this function should have put the task in its new group
7219 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7220 * reflect its new group.
7221 */
7222void sched_move_task(struct task_struct *tsk)
7223{
7224 struct task_group *tg;
7225 int on_rq, running;
7226 unsigned long flags;
7227 struct rq *rq;
7228
7229 rq = task_rq_lock(tsk, &flags);
7230
7231 running = task_current(rq, tsk);
7232 on_rq = tsk->on_rq;
7233
7234 if (on_rq)
7235 dequeue_task(rq, tsk, 0);
7236 if (unlikely(running))
7237 tsk->sched_class->put_prev_task(rq, tsk);
7238
7239 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7240 lockdep_is_held(&tsk->sighand->siglock)),
7241 struct task_group, css);
7242 tg = autogroup_task_group(tsk, tg);
7243 tsk->sched_task_group = tg;
7244
7245#ifdef CONFIG_FAIR_GROUP_SCHED
7246 if (tsk->sched_class->task_move_group)
7247 tsk->sched_class->task_move_group(tsk, on_rq);
7248 else
7249#endif
7250 set_task_rq(tsk, task_cpu(tsk));
7251
7252 if (unlikely(running))
7253 tsk->sched_class->set_curr_task(rq);
7254 if (on_rq)
7255 enqueue_task(rq, tsk, 0);
7256
7257 task_rq_unlock(rq, tsk, &flags);
7258}
7259#endif /* CONFIG_CGROUP_SCHED */
7260
7261#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7262static unsigned long to_ratio(u64 period, u64 runtime)
7263{
7264 if (runtime == RUNTIME_INF)
7265 return 1ULL << 20;
7266
7267 return div64_u64(runtime << 20, period);
7268}
7269#endif
7270
7271#ifdef CONFIG_RT_GROUP_SCHED
7272/*
7273 * Ensure that the real time constraints are schedulable.
7274 */
7275static DEFINE_MUTEX(rt_constraints_mutex);
7276
7277/* Must be called with tasklist_lock held */
7278static inline int tg_has_rt_tasks(struct task_group *tg)
7279{
7280 struct task_struct *g, *p;
7281
7282 do_each_thread(g, p) {
7283 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7284 return 1;
7285 } while_each_thread(g, p);
7286
7287 return 0;
7288}
7289
7290struct rt_schedulable_data {
7291 struct task_group *tg;
7292 u64 rt_period;
7293 u64 rt_runtime;
7294};
7295
7296static int tg_rt_schedulable(struct task_group *tg, void *data)
7297{
7298 struct rt_schedulable_data *d = data;
7299 struct task_group *child;
7300 unsigned long total, sum = 0;
7301 u64 period, runtime;
7302
7303 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7304 runtime = tg->rt_bandwidth.rt_runtime;
7305
7306 if (tg == d->tg) {
7307 period = d->rt_period;
7308 runtime = d->rt_runtime;
7309 }
7310
7311 /*
7312 * Cannot have more runtime than the period.
7313 */
7314 if (runtime > period && runtime != RUNTIME_INF)
7315 return -EINVAL;
7316
7317 /*
7318 * Ensure we don't starve existing RT tasks.
7319 */
7320 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7321 return -EBUSY;
7322
7323 total = to_ratio(period, runtime);
7324
7325 /*
7326 * Nobody can have more than the global setting allows.
7327 */
7328 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7329 return -EINVAL;
7330
7331 /*
7332 * The sum of our children's runtime should not exceed our own.
7333 */
7334 list_for_each_entry_rcu(child, &tg->children, siblings) {
7335 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7336 runtime = child->rt_bandwidth.rt_runtime;
7337
7338 if (child == d->tg) {
7339 period = d->rt_period;
7340 runtime = d->rt_runtime;
7341 }
7342
7343 sum += to_ratio(period, runtime);
7344 }
7345
7346 if (sum > total)
7347 return -EINVAL;
7348
7349 return 0;
7350}
7351
7352static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7353{
7354 int ret;
7355
7356 struct rt_schedulable_data data = {
7357 .tg = tg,
7358 .rt_period = period,
7359 .rt_runtime = runtime,
7360 };
7361
7362 rcu_read_lock();
7363 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7364 rcu_read_unlock();
7365
7366 return ret;
7367}
7368
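/*
 * Validate the requested RT period/runtime against the group hierarchy
 * and, if schedulable, apply it to @tg and to every per-cpu rt_rq.
 */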
7369static int tg_set_rt_bandwidth(struct task_group *tg,
7370 u64 rt_period, u64 rt_runtime)
7371{
7372 int i, err = 0;
7373
7374 mutex_lock(&rt_constraints_mutex);
7375 read_lock(&tasklist_lock);
7376 err = __rt_schedulable(tg, rt_period, rt_runtime);
7377 if (err)
7378 goto unlock;
7379
7380 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7381 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7382 tg->rt_bandwidth.rt_runtime = rt_runtime;
7383
7384 for_each_possible_cpu(i) {
7385 struct rt_rq *rt_rq = tg->rt_rq[i];
7386
7387 raw_spin_lock(&rt_rq->rt_runtime_lock);
7388 rt_rq->rt_runtime = rt_runtime;
7389 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7390 }
7391 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7392unlock:
7393 read_unlock(&tasklist_lock);
7394 mutex_unlock(&rt_constraints_mutex);
7395
7396 return err;
7397}
7398
7399int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7400{
7401 u64 rt_runtime, rt_period;
7402
7403 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7404 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7405 if (rt_runtime_us < 0)
7406 rt_runtime = RUNTIME_INF;
7407
7408 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7409}
7410
7411long sched_group_rt_runtime(struct task_group *tg)
7412{
7413 u64 rt_runtime_us;
7414
7415 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7416 return -1;
7417
7418 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7419 do_div(rt_runtime_us, NSEC_PER_USEC);
7420 return rt_runtime_us;
7421}
7422
7423int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7424{
7425 u64 rt_runtime, rt_period;
7426
7427 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7428 rt_runtime = tg->rt_bandwidth.rt_runtime;
7429
7430 if (rt_period == 0)
7431 return -EINVAL;
7432
7433 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7434}
7435
7436long sched_group_rt_period(struct task_group *tg)
7437{
7438 u64 rt_period_us;
7439
7440 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7441 do_div(rt_period_us, NSEC_PER_USEC);
7442 return rt_period_us;
7443}
7444
7445static int sched_rt_global_constraints(void)
7446{
7447 u64 runtime, period;
7448 int ret = 0;
7449
7450 if (sysctl_sched_rt_period <= 0)
7451 return -EINVAL;
7452
7453 runtime = global_rt_runtime();
7454 period = global_rt_period();
7455
7456 /*
7457 * Sanity check on the sysctl variables.
7458 */
7459 if (runtime > period && runtime != RUNTIME_INF)
7460 return -EINVAL;
7461
7462 mutex_lock(&rt_constraints_mutex);
7463 read_lock(&tasklist_lock);
7464 ret = __rt_schedulable(NULL, 0, 0);
7465 read_unlock(&tasklist_lock);
7466 mutex_unlock(&rt_constraints_mutex);
7467
7468 return ret;
7469}
7470
7471int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7472{
7473 /* Don't accept realtime tasks when there is no way for them to run */
7474 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7475 return 0;
7476
7477 return 1;
7478}
7479
7480#else /* !CONFIG_RT_GROUP_SCHED */
7481static int sched_rt_global_constraints(void)
7482{
7483 unsigned long flags;
7484 int i;
7485
7486 if (sysctl_sched_rt_period <= 0)
7487 return -EINVAL;
7488
7489 /*
7490 * There are always some RT tasks in the root group
7491 * -- migration, kstopmachine, etc.
7492 */
7493 if (sysctl_sched_rt_runtime == 0)
7494 return -EBUSY;
7495
7496 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7497 for_each_possible_cpu(i) {
7498 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7499
7500 raw_spin_lock(&rt_rq->rt_runtime_lock);
7501 rt_rq->rt_runtime = global_rt_runtime();
7502 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7503 }
7504 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7505
7506 return 0;
7507}
7508#endif /* CONFIG_RT_GROUP_SCHED */
7509
7510int sched_rt_handler(struct ctl_table *table, int write,
7511 void __user *buffer, size_t *lenp,
7512 loff_t *ppos)
7513{
7514 int ret;
7515 int old_period, old_runtime;
7516 static DEFINE_MUTEX(mutex);
7517
7518 mutex_lock(&mutex);
7519 old_period = sysctl_sched_rt_period;
7520 old_runtime = sysctl_sched_rt_runtime;
7521
7522 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7523
7524 if (!ret && write) {
7525 ret = sched_rt_global_constraints();
7526 if (ret) {
7527 sysctl_sched_rt_period = old_period;
7528 sysctl_sched_rt_runtime = old_runtime;
7529 } else {
7530 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7531 def_rt_bandwidth.rt_period =
7532 ns_to_ktime(global_rt_period());
7533 }
7534 }
7535 mutex_unlock(&mutex);
7536
7537 return ret;
7538}
7539
7540#ifdef CONFIG_CGROUP_SCHED
7541
7542/* return corresponding task_group object of a cgroup */
7543static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7544{
7545 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7546 struct task_group, css);
7547}
7548
7549static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7550{
7551 struct task_group *tg, *parent;
7552
7553 if (!cgrp->parent) {
7554 /* This is early initialization for the top cgroup */
7555 return &root_task_group.css;
7556 }
7557
7558 parent = cgroup_tg(cgrp->parent);
7559 tg = sched_create_group(parent);
7560 if (IS_ERR(tg))
7561 return ERR_PTR(-ENOMEM);
7562
7563 return &tg->css;
7564}
7565
7566static void cpu_cgroup_css_free(struct cgroup *cgrp)
7567{
7568 struct task_group *tg = cgroup_tg(cgrp);
7569
7570 sched_destroy_group(tg);
7571}
7572
7573static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7574 struct cgroup_taskset *tset)
7575{
7576 struct task_struct *task;
7577
7578 cgroup_taskset_for_each(task, cgrp, tset) {
7579#ifdef CONFIG_RT_GROUP_SCHED
7580 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7581 return -EINVAL;
7582#else
7583 /* We don't support RT-tasks being in separate groups */
7584 if (task->sched_class != &fair_sched_class)
7585 return -EINVAL;
7586#endif
7587 }
7588 return 0;
7589}
7590
7591static void cpu_cgroup_attach(struct cgroup *cgrp,
7592 struct cgroup_taskset *tset)
7593{
7594 struct task_struct *task;
7595
7596 cgroup_taskset_for_each(task, cgrp, tset)
7597 sched_move_task(task);
7598}
7599
7600static void
7601cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7602 struct task_struct *task)
7603{
7604 /*
7605 * cgroup_exit() is called in the copy_process() failure path.
7606 * Ignore this case since the task hasn't run yet; this avoids
7607 * trying to poke a half-freed task state from generic code.
7608 */
7609 if (!(task->flags & PF_EXITING))
7610 return;
7611
7612 sched_move_task(task);
7613}
7614
7615#ifdef CONFIG_FAIR_GROUP_SCHED
7616static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7617 u64 shareval)
7618{
7619 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7620}
7621
7622static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7623{
7624 struct task_group *tg = cgroup_tg(cgrp);
7625
7626 return (u64) scale_load_down(tg->shares);
7627}
7628
7629#ifdef CONFIG_CFS_BANDWIDTH
7630static DEFINE_MUTEX(cfs_constraints_mutex);
7631
7632const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7633const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7634
7635static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7636
7637static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7638{
7639 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7640 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7641
7642 if (tg == &root_task_group)
7643 return -EINVAL;
7644
7645 /*
7646 * Ensure we have at least some amount of bandwidth every period. This is
7647 * to prevent reaching a state of large arrears when throttled via
7648 * entity_tick() resulting in prolonged exit starvation.
7649 */
7650 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7651 return -EINVAL;
7652
7653 /*
7654 * Likewise, bound things on the other side by preventing insane quota
7655 * periods. This also allows us to normalize in computing quota
7656 * feasibility.
7657 */
7658 if (period > max_cfs_quota_period)
7659 return -EINVAL;
7660
7661 mutex_lock(&cfs_constraints_mutex);
7662 ret = __cfs_schedulable(tg, period, quota);
7663 if (ret)
7664 goto out_unlock;
7665
7666 runtime_enabled = quota != RUNTIME_INF;
7667 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7668 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7669 raw_spin_lock_irq(&cfs_b->lock);
7670 cfs_b->period = ns_to_ktime(period);
7671 cfs_b->quota = quota;
7672
7673 __refill_cfs_bandwidth_runtime(cfs_b);
7674 /* restart the period timer (if active) to handle new period expiry */
7675 if (runtime_enabled && cfs_b->timer_active) {
7676 /* force a reprogram */
7677 cfs_b->timer_active = 0;
7678 __start_cfs_bandwidth(cfs_b);
7679 }
7680 raw_spin_unlock_irq(&cfs_b->lock);
7681
7682 for_each_possible_cpu(i) {
7683 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7684 struct rq *rq = cfs_rq->rq;
7685
7686 raw_spin_lock_irq(&rq->lock);
7687 cfs_rq->runtime_enabled = runtime_enabled;
7688 cfs_rq->runtime_remaining = 0;
7689
7690 if (cfs_rq->throttled)
7691 unthrottle_cfs_rq(cfs_rq);
7692 raw_spin_unlock_irq(&rq->lock);
7693 }
7694out_unlock:
7695 mutex_unlock(&cfs_constraints_mutex);
7696
7697 return ret;
7698}
7699
7700int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7701{
7702 u64 quota, period;
7703
7704 period = ktime_to_ns(tg->cfs_bandwidth.period);
7705 if (cfs_quota_us < 0)
7706 quota = RUNTIME_INF;
7707 else
7708 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7709
7710 return tg_set_cfs_bandwidth(tg, period, quota);
7711}
7712
7713long tg_get_cfs_quota(struct task_group *tg)
7714{
7715 u64 quota_us;
7716
7717 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7718 return -1;
7719
7720 quota_us = tg->cfs_bandwidth.quota;
7721 do_div(quota_us, NSEC_PER_USEC);
7722
7723 return quota_us;
7724}
7725
7726int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7727{
7728 u64 quota, period;
7729
7730 period = (u64)cfs_period_us * NSEC_PER_USEC;
7731 quota = tg->cfs_bandwidth.quota;
7732
7733 return tg_set_cfs_bandwidth(tg, period, quota);
7734}
7735
7736long tg_get_cfs_period(struct task_group *tg)
7737{
7738 u64 cfs_period_us;
7739
7740 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7741 do_div(cfs_period_us, NSEC_PER_USEC);
7742
7743 return cfs_period_us;
7744}
7745
7746static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7747{
7748 return tg_get_cfs_quota(cgroup_tg(cgrp));
7749}
7750
7751static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7752 s64 cfs_quota_us)
7753{
7754 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7755}
7756
7757static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7758{
7759 return tg_get_cfs_period(cgroup_tg(cgrp));
7760}
7761
7762static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7763 u64 cfs_period_us)
7764{
7765 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7766}
7767
7768struct cfs_schedulable_data {
7769 struct task_group *tg;
7770 u64 period, quota;
7771};
7772
7773/*
7774 * normalize group quota/period to be quota/max_period
7775 * note: units are usecs
7776 */
7777static u64 normalize_cfs_quota(struct task_group *tg,
7778 struct cfs_schedulable_data *d)
7779{
7780 u64 quota, period;
7781
7782 if (tg == d->tg) {
7783 period = d->period;
7784 quota = d->quota;
7785 } else {
7786 period = tg_get_cfs_period(tg);
7787 quota = tg_get_cfs_quota(tg);
7788 }
7789
7790 /* note: these should typically be equivalent */
7791 if (quota == RUNTIME_INF || quota == -1)
7792 return RUNTIME_INF;
7793
7794 return to_ratio(period, quota);
7795}
7796
7797static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7798{
7799 struct cfs_schedulable_data *d = data;
7800 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7801 s64 quota = 0, parent_quota = -1;
7802
7803 if (!tg->parent) {
7804 quota = RUNTIME_INF;
7805 } else {
7806 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7807
7808 quota = normalize_cfs_quota(tg, d);
7809 parent_quota = parent_b->hierarchal_quota;
7810
7811 /*
7812 * ensure max(child_quota) <= parent_quota, inherit when no
7813 * limit is set
7814 */
7815 if (quota == RUNTIME_INF)
7816 quota = parent_quota;
7817 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7818 return -EINVAL;
7819 }
7820 cfs_b->hierarchal_quota = quota;
7821
7822 return 0;
7823}
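/*
 * Worked example of the constraint enforced above (illustrative numbers):
 * a parent group with quota/period = 100000us/100000us normalizes to a
 * ratio of one full CPU; a child requesting 150000us/100000us (1.5 CPUs)
 * exceeds the parent's hierarchal_quota and the walk returns -EINVAL, while
 * a child left at the default quota of RUNTIME_INF simply inherits the
 * parent's ratio.
 */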
7824
7825static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7826{
7827 int ret;
7828 struct cfs_schedulable_data data = {
7829 .tg = tg,
7830 .period = period,
7831 .quota = quota,
7832 };
7833
7834 if (quota != RUNTIME_INF) {
7835 do_div(data.period, NSEC_PER_USEC);
7836 do_div(data.quota, NSEC_PER_USEC);
7837 }
7838
7839 rcu_read_lock();
7840 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7841 rcu_read_unlock();
7842
7843 return ret;
7844}
7845
7846static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7847 struct cgroup_map_cb *cb)
7848{
7849 struct task_group *tg = cgroup_tg(cgrp);
7850 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7851
7852 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7853 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7854 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7855
7856 return 0;
7857}
7858#endif /* CONFIG_CFS_BANDWIDTH */
7859#endif /* CONFIG_FAIR_GROUP_SCHED */
7860
7861#ifdef CONFIG_RT_GROUP_SCHED
7862static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7863 s64 val)
7864{
7865 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7866}
7867
7868static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7869{
7870 return sched_group_rt_runtime(cgroup_tg(cgrp));
7871}
7872
7873static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7874 u64 rt_period_us)
7875{
7876 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7877}
7878
7879static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7880{
7881 return sched_group_rt_period(cgroup_tg(cgrp));
7882}
7883#endif /* CONFIG_RT_GROUP_SCHED */
7884
7885static struct cftype cpu_files[] = {
7886#ifdef CONFIG_FAIR_GROUP_SCHED
7887 {
7888 .name = "shares",
7889 .read_u64 = cpu_shares_read_u64,
7890 .write_u64 = cpu_shares_write_u64,
7891 },
7892#endif
7893#ifdef CONFIG_CFS_BANDWIDTH
7894 {
7895 .name = "cfs_quota_us",
7896 .read_s64 = cpu_cfs_quota_read_s64,
7897 .write_s64 = cpu_cfs_quota_write_s64,
7898 },
7899 {
7900 .name = "cfs_period_us",
7901 .read_u64 = cpu_cfs_period_read_u64,
7902 .write_u64 = cpu_cfs_period_write_u64,
7903 },
7904 {
7905 .name = "stat",
7906 .read_map = cpu_stats_show,
7907 },
7908#endif
7909#ifdef CONFIG_RT_GROUP_SCHED
7910 {
7911 .name = "rt_runtime_us",
7912 .read_s64 = cpu_rt_runtime_read,
7913 .write_s64 = cpu_rt_runtime_write,
7914 },
7915 {
7916 .name = "rt_period_us",
7917 .read_u64 = cpu_rt_period_read_uint,
7918 .write_u64 = cpu_rt_period_write_uint,
7919 },
7920#endif
7921 { } /* terminate */
7922};
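/*
 * A minimal userspace sketch of how the cfs_quota_us/cfs_period_us files
 * above are typically driven; the cgroup v1 mount point and group name are
 * assumptions for illustration. Setting quota to half of the period caps the
 * group at 50% of one CPU, handled kernel-side by tg_set_cfs_quota() and
 * tg_set_cfs_period().
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int write_str(const char *path, const char *val)
 *	{
 *		int fd = open(path, O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, val, strlen(val)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 *
 *	int main(void)
 *	{
 *		write_str("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "100000");
 *		write_str("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "50000");
 *		return 0;
 *	}
 */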
7923
7924struct cgroup_subsys cpu_cgroup_subsys = {
7925 .name = "cpu",
7926 .css_alloc = cpu_cgroup_css_alloc,
7927 .css_free = cpu_cgroup_css_free,
7928 .can_attach = cpu_cgroup_can_attach,
7929 .attach = cpu_cgroup_attach,
7930 .exit = cpu_cgroup_exit,
7931 .subsys_id = cpu_cgroup_subsys_id,
7932 .base_cftypes = cpu_files,
7933 .early_init = 1,
7934};
7935
7936#endif /* CONFIG_CGROUP_SCHED */
7937
7938#ifdef CONFIG_CGROUP_CPUACCT
7939
7940/*
7941 * CPU accounting code for task groups.
7942 *
7943 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7944 * (balbir@in.ibm.com).
7945 */
7946
7947struct cpuacct root_cpuacct;
7948
7949/* create a new cpu accounting group */
7950static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7951{
7952 struct cpuacct *ca;
7953
7954 if (!cgrp->parent)
7955 return &root_cpuacct.css;
7956
7957 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7958 if (!ca)
7959 goto out;
7960
7961 ca->cpuusage = alloc_percpu(u64);
7962 if (!ca->cpuusage)
7963 goto out_free_ca;
7964
7965 ca->cpustat = alloc_percpu(struct kernel_cpustat);
7966 if (!ca->cpustat)
7967 goto out_free_cpuusage;
7968
7969 return &ca->css;
7970
7971out_free_cpuusage:
7972 free_percpu(ca->cpuusage);
7973out_free_ca:
7974 kfree(ca);
7975out:
7976 return ERR_PTR(-ENOMEM);
7977}
7978
7979/* destroy an existing cpu accounting group */
7980static void cpuacct_css_free(struct cgroup *cgrp)
7981{
7982 struct cpuacct *ca = cgroup_ca(cgrp);
7983
7984 free_percpu(ca->cpustat);
7985 free_percpu(ca->cpuusage);
7986 kfree(ca);
7987}
7988
7989static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
7990{
7991 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7992 u64 data;
7993
7994#ifndef CONFIG_64BIT
7995 /*
7996 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
7997 */
7998 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7999 data = *cpuusage;
8000 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8001#else
8002 data = *cpuusage;
8003#endif
8004
8005 return data;
8006}
8007
8008static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8009{
8010 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8011
8012#ifndef CONFIG_64BIT
8013 /*
8014 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8015 */
8016 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8017 *cpuusage = val;
8018 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8019#else
8020 *cpuusage = val;
8021#endif
8022}
8023
8024/* return total cpu usage (in nanoseconds) of a group */
8025static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8026{
8027 struct cpuacct *ca = cgroup_ca(cgrp);
8028 u64 totalcpuusage = 0;
8029 int i;
8030
8031 for_each_present_cpu(i)
8032 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8033
8034 return totalcpuusage;
8035}
8036
8037static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8038 u64 reset)
8039{
8040 struct cpuacct *ca = cgroup_ca(cgrp);
8041 int err = 0;
8042 int i;
8043
8044 if (reset) {
8045 err = -EINVAL;
8046 goto out;
8047 }
8048
8049 for_each_present_cpu(i)
8050 cpuacct_cpuusage_write(ca, i, 0);
8051
8052out:
8053 return err;
8054}
8055
8056static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8057 struct seq_file *m)
8058{
8059 struct cpuacct *ca = cgroup_ca(cgroup);
8060 u64 percpu;
8061 int i;
8062
8063 for_each_present_cpu(i) {
8064 percpu = cpuacct_cpuusage_read(ca, i);
8065 seq_printf(m, "%llu ", (unsigned long long) percpu);
8066 }
8067 seq_printf(m, "\n");
8068 return 0;
8069}
8070
8071static const char *cpuacct_stat_desc[] = {
8072 [CPUACCT_STAT_USER] = "user",
8073 [CPUACCT_STAT_SYSTEM] = "system",
8074};
8075
8076static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8077 struct cgroup_map_cb *cb)
8078{
8079 struct cpuacct *ca = cgroup_ca(cgrp);
8080 int cpu;
8081 s64 val = 0;
8082
8083 for_each_online_cpu(cpu) {
8084 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8085 val += kcpustat->cpustat[CPUTIME_USER];
8086 val += kcpustat->cpustat[CPUTIME_NICE];
8087 }
8088 val = cputime64_to_clock_t(val);
8089 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8090
8091 val = 0;
8092 for_each_online_cpu(cpu) {
8093 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8094 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8095 val += kcpustat->cpustat[CPUTIME_IRQ];
8096 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8097 }
8098
8099 val = cputime64_to_clock_t(val);
8100 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8101
8102 return 0;
8103}
8104
8105static struct cftype files[] = {
8106 {
8107 .name = "usage",
8108 .read_u64 = cpuusage_read,
8109 .write_u64 = cpuusage_write,
8110 },
8111 {
8112 .name = "usage_percpu",
8113 .read_seq_string = cpuacct_percpu_seq_read,
8114 },
8115 {
8116 .name = "stat",
8117 .read_map = cpuacct_stats_show,
8118 },
8119 { } /* terminate */
8120};
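/*
 * Seen from userspace these appear as cpuacct.usage, cpuacct.usage_percpu
 * and cpuacct.stat inside a cpuacct cgroup directory (mount point is
 * configuration-dependent, e.g. /sys/fs/cgroup/cpuacct). usage and
 * usage_percpu report nanoseconds accumulated by cpuacct_charge() below;
 * stat reports user/system time converted to clock ticks via
 * cputime64_to_clock_t() above.
 */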
8121
8122/*
8123 * charge this task's execution time to its accounting group.
8124 *
8125 * called with rq->lock held.
8126 */
8127void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8128{
8129 struct cpuacct *ca;
8130 int cpu;
8131
8132 if (unlikely(!cpuacct_subsys.active))
8133 return;
8134
8135 cpu = task_cpu(tsk);
8136
8137 rcu_read_lock();
8138
8139 ca = task_ca(tsk);
8140
8141 for (; ca; ca = parent_ca(ca)) {
8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8143 *cpuusage += cputime;
8144 }
8145
8146 rcu_read_unlock();
8147}
8148
8149struct cgroup_subsys cpuacct_subsys = {
8150 .name = "cpuacct",
8151 .css_alloc = cpuacct_css_alloc,
8152 .css_free = cpuacct_css_free,
8153 .subsys_id = cpuacct_subsys_id,
8154 .base_cftypes = files,
8155};
8156#endif /* CONFIG_CGROUP_CPUACCT */
8157
8158void dump_cpu_task(int cpu)
8159{
8160 pr_info("Task dump for CPU %d:\n", cpu);
8161 sched_show_task(cpu_curr(cpu));
8162}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
deleted file mode 100644
index 23aa789c53e..00000000000
--- a/kernel/sched/cpupri.c
+++ /dev/null
@@ -1,240 +0,0 @@
1/*
2 * kernel/sched/cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include <linux/gfp.h>
31#include "cpupri.h"
32
33/* Convert between a 140-based task->prio and our 102-based cpupri */
34static int convert_prio(int prio)
35{
36 int cpupri;
37
38 if (prio == CPUPRI_INVALID)
39 cpupri = CPUPRI_INVALID;
40 else if (prio == MAX_PRIO)
41 cpupri = CPUPRI_IDLE;
42 else if (prio >= MAX_RT_PRIO)
43 cpupri = CPUPRI_NORMAL;
44 else
45 cpupri = MAX_RT_PRIO - prio + 1;
46
47 return cpupri;
48}
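/*
 * Worked examples of the mapping above (using MAX_PRIO == 140 and
 * MAX_RT_PRIO == 100 as in this tree):
 *
 *	convert_prio(140) == CPUPRI_IDLE	(the idle task)
 *	convert_prio(120) == CPUPRI_NORMAL	(a nice-0 SCHED_NORMAL task)
 *	convert_prio(99)  == 2			(RT priority 0, index RT0)
 *	convert_prio(0)   == 101		(RT priority 99, index RT99)
 */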
49
50/**
51 * cpupri_find - find the best (lowest-pri) CPU in the system
52 * @cp: The cpupri context
53 * @p: The task
54 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
55 *
56 * Note: This function returns the recommended CPUs as calculated during the
57 * current invocation. By the time the call returns, the CPUs may have in
58 * fact changed priorities any number of times. While not ideal, it is not
59 * an issue of correctness since the normal rebalancer logic will correct
60 * any discrepancies created by racing against the uncertainty of the current
61 * priority configuration.
62 *
63 * Returns: (int)bool - CPUs were found
64 */
65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask)
67{
68 int idx = 0;
69 int task_pri = convert_prio(p->prio);
70
71 if (task_pri >= MAX_RT_PRIO)
72 return 0;
73
74 for (idx = 0; idx < task_pri; idx++) {
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
103
104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
105 continue;
106
107 if (lowest_mask) {
108 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
109
110 /*
111 * We have to ensure that we have at least one bit
112 * still set in the array, since the map could have
113 * been concurrently emptied between the first and
114 * second reads of vec->mask. If we hit this
115 * condition, simply act as though we never hit this
116 * priority level and continue on.
117 */
118 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
119 continue;
120 }
121
122 return 1;
123 }
124
125 return 0;
126}
127
128/**
129 * cpupri_set - update the cpu priority setting
130 * @cp: The cpupri context
131 * @cpu: The target cpu
132 * @newpri: The priority (INVALID-RT99) to assign to this CPU
133 *
134 * Note: Assumes cpu_rq(cpu)->lock is locked
135 *
136 * Returns: (void)
137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{
140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri;
142 int do_mb = 0;
143
144 newpri = convert_prio(newpri);
145
146 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
147
148 if (newpri == oldpri)
149 return;
150
151 /*
152 * If the cpu was currently mapped to a different value, we
153 * need to map it to the new value then remove the old value.
154 * Note, we must add the new value first, otherwise we risk the
155 * cpu being missed by the priority loop in cpupri_find.
156 */
157 if (likely(newpri != CPUPRI_INVALID)) {
158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
159
160 cpumask_set_cpu(cpu, vec->mask);
161 /*
162 * When adding a new vector, we update the mask first,
163 * do a write memory barrier, and then update the count, to
164 * make sure the vector is visible when count is set.
165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
169 }
170 if (likely(oldpri != CPUPRI_INVALID)) {
171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
172
173 /*
174 * Because the order of modification of the vec->count
175 * is important, we must make sure that the update
176 * of the new prio is seen before we decrement the
177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first,
190 * do a memory barrier, and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
194 cpumask_clear_cpu(cpu, vec->mask);
195 }
196
197 *currpri = newpri;
198}
199
200/**
201 * cpupri_init - initialize the cpupri structure
202 * @cp: The cpupri context
203 *
204 * Returns: -ENOMEM if memory allocation fails, 0 on success.
205 */
206int cpupri_init(struct cpupri *cp)
207{
208 int i;
209
210 memset(cp, 0, sizeof(*cp));
211
212 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
213 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
214
215 atomic_set(&vec->count, 0);
216 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
217 goto cleanup;
218 }
219
220 for_each_possible_cpu(i)
221 cp->cpu_to_pri[i] = CPUPRI_INVALID;
222 return 0;
223
224cleanup:
225 for (i--; i >= 0; i--)
226 free_cpumask_var(cp->pri_to_cpu[i].mask);
227 return -ENOMEM;
228}
229
230/**
231 * cpupri_cleanup - clean up the cpupri structure
232 * @cp: The cpupri context
233 */
234void cpupri_cleanup(struct cpupri *cp)
235{
236 int i;
237
238 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
239 free_cpumask_var(cp->pri_to_cpu[i].mask);
240}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
deleted file mode 100644
index f6d75617349..00000000000
--- a/kernel/sched/cpupri.h
+++ /dev/null
@@ -1,34 +0,0 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7
8#define CPUPRI_INVALID -1
9#define CPUPRI_IDLE 0
10#define CPUPRI_NORMAL 1
11/* values 2-101 are RT priorities 0-99 */
12
13struct cpupri_vec {
14 atomic_t count;
15 cpumask_var_t mask;
16};
17
18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS];
21};
22
23#ifdef CONFIG_SMP
24int cpupri_find(struct cpupri *cp,
25 struct task_struct *p, struct cpumask *lowest_mask);
26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif
33
34#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
deleted file mode 100644
index 293b202fcf7..00000000000
--- a/kernel/sched/cputime.c
+++ /dev/null
@@ -1,589 +0,0 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in another CPU reading this CPU's irq time and can
17 * race with irq/vtime_account on this CPU. We would either get the old
18 * or the new value, with a side effect of accounting a slice of irq time to the
19 * wrong task when an irq is in progress while we read rq->clock. That is a worthy
20 * compromise in place of having locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
28void enable_sched_clock_irqtime(void)
29{
30 sched_clock_irqtime = 1;
31}
32
33void disable_sched_clock_irqtime(void)
34{
35 sched_clock_irqtime = 0;
36}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
42/*
43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */
46void irqtime_account_irq(struct task_struct *curr)
47{
48 unsigned long flags;
49 s64 delta;
50 int cpu;
51
52 if (!sched_clock_irqtime)
53 return;
54
55 local_irq_save(flags);
56
57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta);
60
61 irq_time_write_begin();
62 /*
63 * We do not account for softirq time from ksoftirqd here.
64 * We want to continue accounting softirq time to the ksoftirqd thread
65 * in that case, so as not to confuse the scheduler with a special task
66 * that does not consume any time but still wants to run.
67 */
68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta);
72
73 irq_time_write_end();
74 local_irq_restore(flags);
75}
76EXPORT_SYMBOL_GPL(irqtime_account_irq);
77
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
114static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp)
116{
117#ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca;
120#endif
121 /*
122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary.
125 *
126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128
129#ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active))
131 return;
132
133 rcu_read_lock();
134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca);
139 }
140 rcu_read_unlock();
141#endif
142}
143
144/*
145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency
149 */
150void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled)
152{
153 int index;
154
155 /* Add user time to process. */
156 p->utime += cputime;
157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime);
159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161
162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime);
164
165 /* Account for user time used */
166 acct_update_integrals(p);
167}
168
169/*
170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency
174 */
175static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled)
177{
178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179
180 /* Add guest time to process. */
181 p->utime += cputime;
182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime);
184 p->gtime += cputime;
185
186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 }
194}
195
196/*
197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency
201 * @index: index of the cpustat field that has to be updated
202 */
203static inline
204void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index)
206{
207 /* Add system time to process. */
208 p->stime += cputime;
209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime);
211
212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime);
214
215 /* Account for system time used */
216 acct_update_integrals(p);
217}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
272static __always_inline bool steal_account_process_tick(void)
273{
274#ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0;
277
278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time;
280
281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC;
283
284 account_steal_time(st);
285 return st;
286 }
287#endif
288 return false;
289}
290
291/*
292 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
293 * tasks (sum on group iteration) belonging to @tsk's group.
294 */
295void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
296{
297 struct signal_struct *sig = tsk->signal;
298 struct task_struct *t;
299
300 times->utime = sig->utime;
301 times->stime = sig->stime;
302 times->sum_exec_runtime = sig->sum_sched_runtime;
303
304 rcu_read_lock();
305 /* make sure we can trust tsk->thread_group list */
306 if (!likely(pid_alive(tsk)))
307 goto out;
308
309 t = tsk;
310 do {
311 times->utime += t->utime;
312 times->stime += t->stime;
313 times->sum_exec_runtime += task_sched_runtime(t);
314 } while_each_thread(tsk, t);
315out:
316 rcu_read_unlock();
317}
318
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/*
323 * Account a tick to a process and cpustat
324 * @p: the process that the cpu time gets accounted to
325 * @user_tick: is the tick from userspace
326 * @rq: the pointer to rq
327 *
328 * Tick demultiplexing follows the order
329 * - pending hardirq update
330 * - pending softirq update
331 * - user_time
332 * - idle_time
333 * - system time
334 * - check for guest_time
335 * - else account as system_time
336 *
337 * The check for hardirq is done both for system and user time as there is
338 * no timer going off while we are on a hardirq and hence we may never get an
339 * opportunity to update it solely in system time.
340 * p->stime and friends are only updated on system time and not on irq/
341 * softirq as those no longer count in task exec_runtime.
342 */
343static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
344 struct rq *rq)
345{
346 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
347 u64 *cpustat = kcpustat_this_cpu->cpustat;
348
349 if (steal_account_process_tick())
350 return;
351
352 if (irqtime_account_hi_update()) {
353 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
354 } else if (irqtime_account_si_update()) {
355 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
356 } else if (this_cpu_ksoftirqd() == p) {
357 /*
358 * ksoftirqd time does not get accounted in cpu_softirq_time.
359 * So, we have to handle it separately here.
360 * Also, p->stime needs to be updated for ksoftirqd.
361 */
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
363 CPUTIME_SOFTIRQ);
364 } else if (user_tick) {
365 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
366 } else if (p == rq->idle) {
367 account_idle_time(cputime_one_jiffy);
368 } else if (p->flags & PF_VCPU) { /* System time or guest time */
369 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
370 } else {
371 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
372 CPUTIME_SYSTEM);
373 }
374}
375
376static void irqtime_account_idle_ticks(int ticks)
377{
378 int i;
379 struct rq *rq = this_rq();
380
381 for (i = 0; i < ticks; i++)
382 irqtime_account_process_tick(current, 0, rq);
383}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */
385static void irqtime_account_idle_ticks(int ticks) {}
386static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
387 struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389
390/*
391 * Account a single tick of cpu time.
392 * @p: the process that the cpu time gets accounted to
393 * @user_tick: indicates if the tick is a user or a system tick
394 */
395void account_process_tick(struct task_struct *p, int user_tick)
396{
397 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
398 struct rq *rq = this_rq();
399
400 if (sched_clock_irqtime) {
401 irqtime_account_process_tick(p, user_tick, rq);
402 return;
403 }
404
405 if (steal_account_process_tick())
406 return;
407
408 if (user_tick)
409 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
410 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
411 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
412 one_jiffy_scaled);
413 else
414 account_idle_time(cputime_one_jiffy);
415}
416
417/*
418 * Account multiple ticks of steal time.
419 * (steal time is charged to cpustat, not to any particular task)
420 * @ticks: number of stolen ticks
421 */
422void account_steal_ticks(unsigned long ticks)
423{
424 account_steal_time(jiffies_to_cputime(ticks));
425}
426
427/*
428 * Account multiple ticks of idle time.
429 * @ticks: number of idle ticks
430 */
431void account_idle_ticks(unsigned long ticks)
432{
433
434 if (sched_clock_irqtime) {
435 irqtime_account_idle_ticks(ticks);
436 return;
437 }
438
439 account_idle_time(jiffies_to_cputime(ticks));
440}
441
442#endif
443
444/*
445 * Use precise platform statistics if available:
446 */
447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
448void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449{
450 *ut = p->utime;
451 *st = p->stime;
452}
453
454void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
455{
456 struct task_cputime cputime;
457
458 thread_group_cputime(p, &cputime);
459
460 *ut = cputime.utime;
461 *st = cputime.stime;
462}
463
464void vtime_account_system_irqsafe(struct task_struct *tsk)
465{
466 unsigned long flags;
467
468 local_irq_save(flags);
469 vtime_account_system(tsk);
470 local_irq_restore(flags);
471}
472EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
475void vtime_task_switch(struct task_struct *prev)
476{
477 if (is_idle_task(prev))
478 vtime_account_idle(prev);
479 else
480 vtime_account_system(prev);
481
482 vtime_account_user(prev);
483 arch_vtime_task_switch(prev);
484}
485#endif
486
487/*
488 * Archs that account the whole time spent in the idle task
489 * (outside irq) as idle time can rely on this and just implement
490 * vtime_account_system() and vtime_account_idle(). Archs that
491 * define idle time differently (s390 only includes the
492 * time spent by the CPU when it's in low power mode) must override
493 * vtime_account().
494 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT
496void vtime_account(struct task_struct *tsk)
497{
498 if (in_interrupt() || !is_idle_task(tsk))
499 vtime_account_system(tsk);
500 else
501 vtime_account_idle(tsk);
502}
503EXPORT_SYMBOL_GPL(vtime_account);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505
506#else
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
513{
514 u64 temp = (__force u64) rtime;
515
516 temp *= (__force u64) utime;
517
518 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total);
520 else
521 temp = div64_u64(temp, (__force u64) total);
522
523 return (__force cputime_t) temp;
524}
525
526/*
527 * Adjust the imprecise tick-based cputime statistics against the
528 * scheduler's precise runtime accounting.
529 */
530static void cputime_adjust(struct task_cputime *curr,
531 struct cputime *prev,
532 cputime_t *ut, cputime_t *st)
533{
534 cputime_t rtime, utime, total;
535
536 utime = curr->utime;
537 total = utime + curr->stime;
538
539 /*
540 * Tick based cputime accounting depends on whether a task's random
541 * scheduling timeslices happen to be interrupted by the timer or not.
542 * Depending on these circumstances, the number of these interrupts
543 * may be over- or under-estimated, matching the real user and system
544 * cputime only with a variable precision.
545 *
546 * Fix this by scaling these tick based values against the total
547 * runtime accounted by the CFS scheduler.
548 */
549 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
550
551 if (total)
552 utime = scale_utime(utime, rtime, total);
553 else
554 utime = rtime;
555
556 /*
557 * If the tick based count grows faster than the scheduler one,
558 * the result of the scaling may go backward.
559 * Let's enforce monotonicity.
560 */
561 prev->utime = max(prev->utime, utime);
562 prev->stime = max(prev->stime, rtime - prev->utime);
563
564 *ut = prev->utime;
565 *st = prev->stime;
566}
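/*
 * Worked example of the adjustment above (illustrative numbers): with tick
 * based samples utime = 30 and stime = 10 (total = 40) but a CFS runtime of
 * rtime = 60, scale_utime() gives utime = 60 * 30 / 40 = 45 and stime becomes
 * rtime - utime = 15, preserving the sampled 3:1 user/system ratio while
 * matching the scheduler's precise total.
 */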
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577}
578
579/*
580 * Must be called with siglock held.
581 */
582void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
583{
584 struct task_cputime cputime;
585
586 thread_group_cputime(p, &cputime);
587 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
588}
589#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
deleted file mode 100644
index 2cd3c1b4e58..00000000000
--- a/kernel/sched/debug.c
+++ /dev/null
@@ -1,531 +0,0 @@
1/*
2 * kernel/sched/debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19#include "sched.h"
20
21static DEFINE_SPINLOCK(sched_debug_lock);
22
23/*
24 * This allows printing both to /proc/sched_debug and
25 * to the console
26 */
27#define SEQ_printf(m, x...) \
28 do { \
29 if (m) \
30 seq_printf(m, x); \
31 else \
32 printk(x); \
33 } while (0)
34
35/*
36 * Ease the printing of nsec fields:
37 */
38static long long nsec_high(unsigned long long nsec)
39{
40 if ((long long)nsec < 0) {
41 nsec = -nsec;
42 do_div(nsec, 1000000);
43 return -nsec;
44 }
45 do_div(nsec, 1000000);
46
47 return nsec;
48}
49
50static unsigned long nsec_low(unsigned long long nsec)
51{
52 if ((long long)nsec < 0)
53 nsec = -nsec;
54
55 return do_div(nsec, 1000000);
56}
57
58#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
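/*
 * Example: for nsec == 1234567890 (~1.23s), nsec_high() returns 1234 and
 * nsec_low() returns 567890, so a "%Ld.%06ld" format with SPLIT_NS() prints
 * "1234.567890", i.e. nanoseconds rendered as milliseconds with six decimal
 * places.
 */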
59
60#ifdef CONFIG_FAIR_GROUP_SCHED
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{
63 struct sched_entity *se = tg->se[cpu];
64
65#define P(F) \
66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
67#define PN(F) \
68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
78 PN(se->exec_start);
79 PN(se->vruntime);
80 PN(se->sum_exec_runtime);
81#ifdef CONFIG_SCHEDSTATS
82 PN(se->statistics.wait_start);
83 PN(se->statistics.sleep_start);
84 PN(se->statistics.block_start);
85 PN(se->statistics.sleep_max);
86 PN(se->statistics.block_max);
87 PN(se->statistics.exec_max);
88 PN(se->statistics.slice_max);
89 PN(se->statistics.wait_max);
90 PN(se->statistics.wait_sum);
91 P(se->statistics.wait_count);
92#endif
93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
100#undef PN
101#undef P
102}
103#endif
104
105#ifdef CONFIG_CGROUP_SCHED
106static char group_path[PATH_MAX];
107
108static char *task_group_path(struct task_group *tg)
109{
110 if (autogroup_path(tg, group_path, PATH_MAX))
111 return group_path;
112
113 /*
114 * May be NULL if the underlying cgroup isn't fully-created yet
115 */
116 if (!tg->css.cgroup) {
117 group_path[0] = '\0';
118 return group_path;
119 }
120 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
121 return group_path;
122}
123#endif
124
125static void
126print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
127{
128 if (rq->curr == p)
129 SEQ_printf(m, "R");
130 else
131 SEQ_printf(m, " ");
132
133 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
134 p->comm, p->pid,
135 SPLIT_NS(p->se.vruntime),
136 (long long)(p->nvcsw + p->nivcsw),
137 p->prio);
138#ifdef CONFIG_SCHEDSTATS
139 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
140 SPLIT_NS(p->se.vruntime),
141 SPLIT_NS(p->se.sum_exec_runtime),
142 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
143#else
144 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
145 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
146#endif
147#ifdef CONFIG_CGROUP_SCHED
148 SEQ_printf(m, " %s", task_group_path(task_group(p)));
149#endif
150
151 SEQ_printf(m, "\n");
152}
153
154static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
155{
156 struct task_struct *g, *p;
157 unsigned long flags;
158
159 SEQ_printf(m,
160 "\nrunnable tasks:\n"
161 " task PID tree-key switches prio"
162 " exec-runtime sum-exec sum-sleep\n"
163 "------------------------------------------------------"
164 "----------------------------------------------------\n");
165
166 read_lock_irqsave(&tasklist_lock, flags);
167
168 do_each_thread(g, p) {
169 if (!p->on_rq || task_cpu(p) != rq_cpu)
170 continue;
171
172 print_task(m, rq, p);
173 } while_each_thread(g, p);
174
175 read_unlock_irqrestore(&tasklist_lock, flags);
176}
177
178void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179{
180 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
181 spread, rq0_min_vruntime, spread0;
182 struct rq *rq = cpu_rq(cpu);
183 struct sched_entity *last;
184 unsigned long flags;
185
186#ifdef CONFIG_FAIR_GROUP_SCHED
187 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
188#else
189 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
190#endif
191 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
192 SPLIT_NS(cfs_rq->exec_clock));
193
194 raw_spin_lock_irqsave(&rq->lock, flags);
195 if (cfs_rq->rb_leftmost)
196 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
197 last = __pick_last_entity(cfs_rq);
198 if (last)
199 max_vruntime = last->vruntime;
200 min_vruntime = cfs_rq->min_vruntime;
201 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
202 raw_spin_unlock_irqrestore(&rq->lock, flags);
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
204 SPLIT_NS(MIN_vruntime));
205 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
206 SPLIT_NS(min_vruntime));
207 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
208 SPLIT_NS(max_vruntime));
209 spread = max_vruntime - MIN_vruntime;
210 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
211 SPLIT_NS(spread));
212 spread0 = min_vruntime - rq0_min_vruntime;
213 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
214 SPLIT_NS(spread0));
215 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
216 cfs_rq->nr_spread_over);
217 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
218 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
219#ifdef CONFIG_FAIR_GROUP_SCHED
220#ifdef CONFIG_SMP
221 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
222 cfs_rq->runnable_load_avg);
223 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
224 cfs_rq->blocked_load_avg);
225 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
226 atomic64_read(&cfs_rq->tg->load_avg));
227 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
228 cfs_rq->tg_load_contrib);
229 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
230 cfs_rq->tg_runnable_contrib);
231 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
232 atomic_read(&cfs_rq->tg->runnable_avg));
233#endif
234
235 print_cfs_group_stats(m, cpu, cfs_rq->tg);
236#endif
237}
238
239void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
240{
241#ifdef CONFIG_RT_GROUP_SCHED
242 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
243#else
244 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
245#endif
246
247#define P(x) \
248 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
249#define PN(x) \
250 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
251
252 P(rt_nr_running);
253 P(rt_throttled);
254 PN(rt_time);
255 PN(rt_runtime);
256
257#undef PN
258#undef P
259}
260
261extern __read_mostly int sched_clock_running;
262
263static void print_cpu(struct seq_file *m, int cpu)
264{
265 struct rq *rq = cpu_rq(cpu);
266 unsigned long flags;
267
268#ifdef CONFIG_X86
269 {
270 unsigned int freq = cpu_khz ? : 1;
271
272 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
273 cpu, freq / 1000, (freq % 1000));
274 }
275#else
276 SEQ_printf(m, "\ncpu#%d\n", cpu);
277#endif
278
279#define P(x) \
280do { \
281 if (sizeof(rq->x) == 4) \
282 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
283 else \
284 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
285} while (0)
286
287#define PN(x) \
288 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
289
290 P(nr_running);
291 SEQ_printf(m, " .%-30s: %lu\n", "load",
292 rq->load.weight);
293 P(nr_switches);
294 P(nr_load_updates);
295 P(nr_uninterruptible);
296 PN(next_balance);
297 P(curr->pid);
298 PN(clock);
299 P(cpu_load[0]);
300 P(cpu_load[1]);
301 P(cpu_load[2]);
302 P(cpu_load[3]);
303 P(cpu_load[4]);
304#undef P
305#undef PN
306
307#ifdef CONFIG_SCHEDSTATS
308#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
309#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
310
311 P(yld_count);
312
313 P(sched_count);
314 P(sched_goidle);
315#ifdef CONFIG_SMP
316 P64(avg_idle);
317#endif
318
319 P(ttwu_count);
320 P(ttwu_local);
321
322#undef P
323#undef P64
324#endif
325 spin_lock_irqsave(&sched_debug_lock, flags);
326 print_cfs_stats(m, cpu);
327 print_rt_stats(m, cpu);
328
329 rcu_read_lock();
330 print_rq(m, rq, cpu);
331 rcu_read_unlock();
332 spin_unlock_irqrestore(&sched_debug_lock, flags);
333}
334
335static const char *sched_tunable_scaling_names[] = {
336 "none",
337 "logarithmic",
338 "linear"
339};
340
341static int sched_debug_show(struct seq_file *m, void *v)
342{
343 u64 ktime, sched_clk, cpu_clk;
344 unsigned long flags;
345 int cpu;
346
347 local_irq_save(flags);
348 ktime = ktime_to_ns(ktime_get());
349 sched_clk = sched_clock();
350 cpu_clk = local_clock();
351 local_irq_restore(flags);
352
353 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
354 init_utsname()->release,
355 (int)strcspn(init_utsname()->version, " "),
356 init_utsname()->version);
357
358#define P(x) \
359 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
360#define PN(x) \
361 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
362 PN(ktime);
363 PN(sched_clk);
364 PN(cpu_clk);
365 P(jiffies);
366#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
367 P(sched_clock_stable);
368#endif
369#undef PN
370#undef P
371
372 SEQ_printf(m, "\n");
373 SEQ_printf(m, "sysctl_sched\n");
374
375#define P(x) \
376 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
377#define PN(x) \
378 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
379 PN(sysctl_sched_latency);
380 PN(sysctl_sched_min_granularity);
381 PN(sysctl_sched_wakeup_granularity);
382 P(sysctl_sched_child_runs_first);
383 P(sysctl_sched_features);
384#undef PN
385#undef P
386
387 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
388 sysctl_sched_tunable_scaling,
389 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
390
391 for_each_online_cpu(cpu)
392 print_cpu(m, cpu);
393
394 SEQ_printf(m, "\n");
395
396 return 0;
397}
398
399void sysrq_sched_debug_show(void)
400{
401 sched_debug_show(NULL, NULL);
402}
403
404static int sched_debug_open(struct inode *inode, struct file *filp)
405{
406 return single_open(filp, sched_debug_show, NULL);
407}
408
409static const struct file_operations sched_debug_fops = {
410 .open = sched_debug_open,
411 .read = seq_read,
412 .llseek = seq_lseek,
413 .release = single_release,
414};
415
416static int __init init_sched_debug_procfs(void)
417{
418 struct proc_dir_entry *pe;
419
420 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
421 if (!pe)
422 return -ENOMEM;
423 return 0;
424}
425
426__initcall(init_sched_debug_procfs);
427
428void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
429{
430 unsigned long nr_switches;
431
432 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
433 get_nr_threads(p));
434 SEQ_printf(m,
435 "---------------------------------------------------------\n");
436#define __P(F) \
437 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
438#define P(F) \
439 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
440#define __PN(F) \
441 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
442#define PN(F) \
443 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
444
445 PN(se.exec_start);
446 PN(se.vruntime);
447 PN(se.sum_exec_runtime);
448
449 nr_switches = p->nvcsw + p->nivcsw;
450
451#ifdef CONFIG_SCHEDSTATS
452 PN(se.statistics.wait_start);
453 PN(se.statistics.sleep_start);
454 PN(se.statistics.block_start);
455 PN(se.statistics.sleep_max);
456 PN(se.statistics.block_max);
457 PN(se.statistics.exec_max);
458 PN(se.statistics.slice_max);
459 PN(se.statistics.wait_max);
460 PN(se.statistics.wait_sum);
461 P(se.statistics.wait_count);
462 PN(se.statistics.iowait_sum);
463 P(se.statistics.iowait_count);
464 P(se.nr_migrations);
465 P(se.statistics.nr_migrations_cold);
466 P(se.statistics.nr_failed_migrations_affine);
467 P(se.statistics.nr_failed_migrations_running);
468 P(se.statistics.nr_failed_migrations_hot);
469 P(se.statistics.nr_forced_migrations);
470 P(se.statistics.nr_wakeups);
471 P(se.statistics.nr_wakeups_sync);
472 P(se.statistics.nr_wakeups_migrate);
473 P(se.statistics.nr_wakeups_local);
474 P(se.statistics.nr_wakeups_remote);
475 P(se.statistics.nr_wakeups_affine);
476 P(se.statistics.nr_wakeups_affine_attempts);
477 P(se.statistics.nr_wakeups_passive);
478 P(se.statistics.nr_wakeups_idle);
479
480 {
481 u64 avg_atom, avg_per_cpu;
482
483 avg_atom = p->se.sum_exec_runtime;
484 if (nr_switches)
485 do_div(avg_atom, nr_switches);
486 else
487 avg_atom = -1LL;
488
489 avg_per_cpu = p->se.sum_exec_runtime;
490 if (p->se.nr_migrations) {
491 avg_per_cpu = div64_u64(avg_per_cpu,
492 p->se.nr_migrations);
493 } else {
494 avg_per_cpu = -1LL;
495 }
496
497 __PN(avg_atom);
498 __PN(avg_per_cpu);
499 }
500#endif
501 __P(nr_switches);
502 SEQ_printf(m, "%-35s:%21Ld\n",
503 "nr_voluntary_switches", (long long)p->nvcsw);
504 SEQ_printf(m, "%-35s:%21Ld\n",
505 "nr_involuntary_switches", (long long)p->nivcsw);
506
507 P(se.load.weight);
508 P(policy);
509 P(prio);
510#undef PN
511#undef __PN
512#undef P
513#undef __P
514
515 {
516 unsigned int this_cpu = raw_smp_processor_id();
517 u64 t0, t1;
518
519 t0 = cpu_clock(this_cpu);
520 t1 = cpu_clock(this_cpu);
521 SEQ_printf(m, "%-35s:%21Ld\n",
522 "clock-delta", (long long)(t1-t0));
523 }
524}
525
526void proc_sched_set_task(struct task_struct *p)
527{
528#ifdef CONFIG_SCHEDSTATS
529 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
530#endif
531}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
deleted file mode 100644
index 5eea8707234..00000000000
--- a/kernel/sched/fair.c
+++ /dev/null
@@ -1,6174 +0,0 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */
22
23#include <linux/latencytop.h>
24#include <linux/sched.h>
25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
32
33#include <trace/events/sched.h>
34
35#include "sched.h"
36
37/*
38 * Targeted preemption latency for CPU-bound tasks:
39 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40 *
41 * NOTE: this latency value is not the same as the concept of
42 * 'timeslice length' - timeslices in CFS are of variable length
43 * and have no persistent notion like in traditional, time-slice
44 * based scheduling concepts.
45 *
46 * (to see the precise effective timeslice length of your workload,
47 * run vmstat and monitor the context-switches (cs) field)
48 */
49unsigned int sysctl_sched_latency = 6000000ULL;
50unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51
52/*
53 * The initial- and re-scaling of tunables is configurable
54 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55 *
56 * Options are:
57 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
59 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60 */
61enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 = SCHED_TUNABLESCALING_LOG;
63
64/*
65 * Minimal preemption granularity for CPU-bound tasks:
66 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 */
68unsigned int sysctl_sched_min_granularity = 750000ULL;
69unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70
71/*
72 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73 */
74static unsigned int sched_nr_latency = 8;
75
76/*
77 * After fork, child runs first. If set to 0 (default) then
78 * parent will (try to) run first.
79 */
80unsigned int sysctl_sched_child_runs_first __read_mostly;
81
82/*
83 * SCHED_OTHER wake-up granularity.
84 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85 *
86 * This option delays the preemption effects of decoupled workloads
87 * and reduces their over-scheduling. Synchronous workloads will still
88 * have immediate wakeup/sleep latencies.
89 */
90unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92
93const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94
95/*
96 * The exponential sliding window over which load is averaged for shares
97 * distribution.
98 * (default: 10msec)
99 */
100unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101
102#ifdef CONFIG_CFS_BANDWIDTH
103/*
104 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105 * each time a cfs_rq requests quota.
106 *
107 * Note: in the case that the slice exceeds the runtime remaining (either due
108 * to consumption or the quota being specified to be smaller than the slice)
109 * we will always only issue the remaining available time.
110 *
111 * default: 5 msec, units: microseconds
112 */
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif
115
116/*
117 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible
119 * to users decreases. But the relationship is not linear,
120 * so pick a second-best guess by going with the log2 of the
121 * number of CPUs.
122 *
123 * This idea comes from the SD scheduler of Con Kolivas:
124 */
125static int get_update_sysctl_factor(void)
126{
127 unsigned int cpus = min_t(int, num_online_cpus(), 8);
128 unsigned int factor;
129
130 switch (sysctl_sched_tunable_scaling) {
131 case SCHED_TUNABLESCALING_NONE:
132 factor = 1;
133 break;
134 case SCHED_TUNABLESCALING_LINEAR:
135 factor = cpus;
136 break;
137 case SCHED_TUNABLESCALING_LOG:
138 default:
139 factor = 1 + ilog2(cpus);
140 break;
141 }
142
143 return factor;
144}
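/*
 * Stand-alone sketch (assumed defaults, not part of this file): with the
 * default logarithmic scaling, an 8-CPU machine gets factor = 1 + ilog2(8)
 * = 4, so the 6ms base latency scales to 24ms and the 0.75ms minimum
 * granularity to 3ms.
 */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int cpus = 8;			/* assumed CPU count */
	unsigned int capped = cpus < 8 ? cpus : 8;
	unsigned int factor = 1 + ilog2_u(capped);

	printf("factor                : %u\n", factor);
	printf("sched_latency         : %u ns\n", factor * 6000000U);
	printf("sched_min_granularity : %u ns\n", factor * 750000U);
	return 0;
}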
145
146static void update_sysctl(void)
147{
148 unsigned int factor = get_update_sysctl_factor();
149
150#define SET_SYSCTL(name) \
151 (sysctl_##name = (factor) * normalized_sysctl_##name)
152 SET_SYSCTL(sched_min_granularity);
153 SET_SYSCTL(sched_latency);
154 SET_SYSCTL(sched_wakeup_granularity);
155#undef SET_SYSCTL
156}
157
158void sched_init_granularity(void)
159{
160 update_sysctl();
161}
162
163#if BITS_PER_LONG == 32
164# define WMULT_CONST (~0UL)
165#else
166# define WMULT_CONST (1UL << 32)
167#endif
168
169#define WMULT_SHIFT 32
170
171/*
172 * Shift right and round:
173 */
174#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
175
176/*
177 * delta *= weight / lw
178 */
179static unsigned long
180calc_delta_mine(unsigned long delta_exec, unsigned long weight,
181 struct load_weight *lw)
182{
183 u64 tmp;
184
185 /*
186 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
187 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
188 * 2^SCHED_LOAD_RESOLUTION.
189 */
190 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
191 tmp = (u64)delta_exec * scale_load_down(weight);
192 else
193 tmp = (u64)delta_exec;
194
195 if (!lw->inv_weight) {
196 unsigned long w = scale_load_down(lw->weight);
197
198 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199 lw->inv_weight = 1;
200 else if (unlikely(!w))
201 lw->inv_weight = WMULT_CONST;
202 else
203 lw->inv_weight = WMULT_CONST / w;
204 }
205
206 /*
207 * Check whether we'd overflow the 64-bit multiplication:
208 */
209 if (unlikely(tmp > WMULT_CONST))
210 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
211 WMULT_SHIFT/2);
212 else
213 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
214
215 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
216}
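/*
 * Stand-alone sketch of the reciprocal trick above (assumed example
 * values, not part of this file): rather than dividing by lw->weight on
 * every call, 2^32/weight is cached as inv_weight and the division
 * becomes a multiply followed by a rounded 32-bit shift (SRR). The
 * kernel version additionally splits the shift when the intermediate
 * product risks overflowing 64 bits.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec = 4000000;			/* 4ms of runtime, in ns */
	uint64_t weight = 1024;				/* nice-0 weight */
	uint64_t lw_weight = 3072;			/* assumed runqueue weight */
	uint64_t inv_weight = 0xffffffffULL / lw_weight;	/* ~2^32 / lw */
	uint64_t tmp = delta_exec * weight;
	uint64_t approx = (tmp * inv_weight + (1ULL << 31)) >> 32;

	printf("exact  : %llu\n", (unsigned long long)(tmp / lw_weight));
	printf("approx : %llu\n", (unsigned long long)approx);
	return 0;
}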
217
218
219const struct sched_class fair_sched_class;
220
221/**************************************************************
222 * CFS operations on generic schedulable entities:
223 */
224
225#ifdef CONFIG_FAIR_GROUP_SCHED
226
227/* cpu runqueue to which this cfs_rq is attached */
228static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
229{
230 return cfs_rq->rq;
231}
232
233/* An entity is a task if it doesn't "own" a runqueue */
234#define entity_is_task(se) (!se->my_q)
235
236static inline struct task_struct *task_of(struct sched_entity *se)
237{
238#ifdef CONFIG_SCHED_DEBUG
239 WARN_ON_ONCE(!entity_is_task(se));
240#endif
241 return container_of(se, struct task_struct, se);
242}
243
244/* Walk up scheduling entities hierarchy */
245#define for_each_sched_entity(se) \
246 for (; se; se = se->parent)
247
248static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
249{
250 return p->se.cfs_rq;
251}
252
253/* runqueue on which this entity is (to be) queued */
254static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
255{
256 return se->cfs_rq;
257}
258
259/* runqueue "owned" by this group */
260static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
261{
262 return grp->my_q;
263}
264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
269{
270 if (!cfs_rq->on_list) {
271 /*
272 * Ensure we either appear before our parent (if already
273 * enqueued) or force our parent to appear after us when it is
274 * enqueued. The fact that we always enqueue bottom-up
275 * reduces this to two cases.
276 */
277 if (cfs_rq->tg->parent &&
278 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
279 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
280 &rq_of(cfs_rq)->leaf_cfs_rq_list);
281 } else {
282 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
283 &rq_of(cfs_rq)->leaf_cfs_rq_list);
284 }
285
286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
289 }
290}
291
292static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
293{
294 if (cfs_rq->on_list) {
295 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
296 cfs_rq->on_list = 0;
297 }
298}
299
300/* Iterate through all leaf cfs_rq's on a runqueue */
301#define for_each_leaf_cfs_rq(rq, cfs_rq) \
302 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
303
304/* Do the two (enqueued) entities belong to the same group? */
305static inline int
306is_same_group(struct sched_entity *se, struct sched_entity *pse)
307{
308 if (se->cfs_rq == pse->cfs_rq)
309 return 1;
310
311 return 0;
312}
313
314static inline struct sched_entity *parent_entity(struct sched_entity *se)
315{
316 return se->parent;
317}
318
319/* return depth at which a sched entity is present in the hierarchy */
320static inline int depth_se(struct sched_entity *se)
321{
322 int depth = 0;
323
324 for_each_sched_entity(se)
325 depth++;
326
327 return depth;
328}
329
330static void
331find_matching_se(struct sched_entity **se, struct sched_entity **pse)
332{
333 int se_depth, pse_depth;
334
335 /*
336	 * A preemption test can be made between sibling entities that are in the
337	 * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
338	 * both tasks until we find their ancestors that are siblings, i.e.
339	 * children of a common parent.
340 */
341
342 /* First walk up until both entities are at same depth */
343 se_depth = depth_se(*se);
344 pse_depth = depth_se(*pse);
345
346 while (se_depth > pse_depth) {
347 se_depth--;
348 *se = parent_entity(*se);
349 }
350
351 while (pse_depth > se_depth) {
352 pse_depth--;
353 *pse = parent_entity(*pse);
354 }
355
356 while (!is_same_group(*se, *pse)) {
357 *se = parent_entity(*se);
358 *pse = parent_entity(*pse);
359 }
360}
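/*
 * Generic stand-alone sketch of the depth-matching walk above, using a
 * toy parent-pointer chain instead of sched entities: bring both nodes
 * to the same depth, then step both upwards until they are siblings.
 */
#include <stdio.h>
#include <stddef.h>

struct node { struct node *parent; int depth; };

static void match(struct node **a, struct node **b)
{
	while ((*a)->depth > (*b)->depth)
		*a = (*a)->parent;
	while ((*b)->depth > (*a)->depth)
		*b = (*b)->parent;
	while ((*a)->parent != (*b)->parent) {
		*a = (*a)->parent;
		*b = (*b)->parent;
	}
}

int main(void)
{
	struct node root = { NULL, 0 };
	struct node g1 = { &root, 1 }, g2 = { &root, 1 };
	struct node t1 = { &g1, 2 };
	struct node *a = &t1, *b = &g2;

	match(&a, &b);
	printf("matched depths: %d %d\n", a->depth, b->depth);	/* 1 1 */
	return 0;
}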
361
362#else /* !CONFIG_FAIR_GROUP_SCHED */
363
364static inline struct task_struct *task_of(struct sched_entity *se)
365{
366 return container_of(se, struct task_struct, se);
367}
368
369static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
370{
371 return container_of(cfs_rq, struct rq, cfs);
372}
373
374#define entity_is_task(se) 1
375
376#define for_each_sched_entity(se) \
377 for (; se; se = NULL)
378
379static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
380{
381 return &task_rq(p)->cfs;
382}
383
384static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
385{
386 struct task_struct *p = task_of(se);
387 struct rq *rq = task_rq(p);
388
389 return &rq->cfs;
390}
391
392/* runqueue "owned" by this group */
393static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
394{
395 return NULL;
396}
397
398static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
399{
400}
401
402static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
403{
404}
405
406#define for_each_leaf_cfs_rq(rq, cfs_rq) \
407 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
408
409static inline int
410is_same_group(struct sched_entity *se, struct sched_entity *pse)
411{
412 return 1;
413}
414
415static inline struct sched_entity *parent_entity(struct sched_entity *se)
416{
417 return NULL;
418}
419
420static inline void
421find_matching_se(struct sched_entity **se, struct sched_entity **pse)
422{
423}
424
425#endif /* CONFIG_FAIR_GROUP_SCHED */
426
427static __always_inline
428void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
429
430/**************************************************************
431 * Scheduling class tree data structure manipulation methods:
432 */
433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
435{
436 s64 delta = (s64)(vruntime - min_vruntime);
437 if (delta > 0)
438 min_vruntime = vruntime;
439
440 return min_vruntime;
441}
442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
444{
445 s64 delta = (s64)(vruntime - min_vruntime);
446 if (delta < 0)
447 min_vruntime = vruntime;
448
449 return min_vruntime;
450}
451
452static inline int entity_before(struct sched_entity *a,
453 struct sched_entity *b)
454{
455 return (s64)(a->vruntime - b->vruntime) < 0;
456}
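/*
 * Small stand-alone sketch (assumed values) of why the comparison above
 * uses a signed difference: vruntime is an unsigned 64-bit counter that
 * may wrap, and (s64)(a - b) < 0 keeps ordering correct across the wrap
 * as long as the two values stay within 2^63 of each other.
 */
#include <stdio.h>
#include <stdint.h>

static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t a = UINT64_MAX - 100;	/* just before the wrap */
	uint64_t b = 50;		/* just after the wrap */

	/* A naive a < b says 0 here; the signed difference says 1. */
	printf("naive a < b : %d\n", a < b);
	printf("before(a, b): %d\n", before(a, b));
	return 0;
}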
457
458static void update_min_vruntime(struct cfs_rq *cfs_rq)
459{
460 u64 vruntime = cfs_rq->min_vruntime;
461
462 if (cfs_rq->curr)
463 vruntime = cfs_rq->curr->vruntime;
464
465 if (cfs_rq->rb_leftmost) {
466 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
467 struct sched_entity,
468 run_node);
469
470 if (!cfs_rq->curr)
471 vruntime = se->vruntime;
472 else
473 vruntime = min_vruntime(vruntime, se->vruntime);
474 }
475
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT
478 smp_wmb();
479 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480#endif
481}
482
483/*
484 * Enqueue an entity into the rb-tree:
485 */
486static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487{
488 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 struct rb_node *parent = NULL;
490 struct sched_entity *entry;
491 int leftmost = 1;
492
493 /*
494 * Find the right place in the rbtree:
495 */
496 while (*link) {
497 parent = *link;
498 entry = rb_entry(parent, struct sched_entity, run_node);
499 /*
500		 * We don't care about collisions. Nodes with
501 * the same key stay together.
502 */
503 if (entity_before(se, entry)) {
504 link = &parent->rb_left;
505 } else {
506 link = &parent->rb_right;
507 leftmost = 0;
508 }
509 }
510
511 /*
512 * Maintain a cache of leftmost tree entries (it is frequently
513 * used):
514 */
515 if (leftmost)
516 cfs_rq->rb_leftmost = &se->run_node;
517
518 rb_link_node(&se->run_node, parent, link);
519 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520}
521
522static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{
524 if (cfs_rq->rb_leftmost == &se->run_node) {
525 struct rb_node *next_node;
526
527 next_node = rb_next(&se->run_node);
528 cfs_rq->rb_leftmost = next_node;
529 }
530
531 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532}
533
534struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535{
536 struct rb_node *left = cfs_rq->rb_leftmost;
537
538 if (!left)
539 return NULL;
540
541 return rb_entry(left, struct sched_entity, run_node);
542}
543
544static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545{
546 struct rb_node *next = rb_next(&se->run_node);
547
548 if (!next)
549 return NULL;
550
551 return rb_entry(next, struct sched_entity, run_node);
552}
553
554#ifdef CONFIG_SCHED_DEBUG
555struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556{
557 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558
559 if (!last)
560 return NULL;
561
562 return rb_entry(last, struct sched_entity, run_node);
563}
564
565/**************************************************************
566 * Scheduling class statistics methods:
567 */
568
569int sched_proc_update_handler(struct ctl_table *table, int write,
570 void __user *buffer, size_t *lenp,
571 loff_t *ppos)
572{
573 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 int factor = get_update_sysctl_factor();
575
576 if (ret || !write)
577 return ret;
578
579 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 sysctl_sched_min_granularity);
581
582#define WRT_SYSCTL(name) \
583 (normalized_sysctl_##name = sysctl_##name / (factor))
584 WRT_SYSCTL(sched_min_granularity);
585 WRT_SYSCTL(sched_latency);
586 WRT_SYSCTL(sched_wakeup_granularity);
587#undef WRT_SYSCTL
588
589 return 0;
590}
591#endif
592
593/*
594 * delta /= w
595 */
596static inline unsigned long
597calc_delta_fair(unsigned long delta, struct sched_entity *se)
598{
599 if (unlikely(se->load.weight != NICE_0_LOAD))
600 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
601
602 return delta;
603}
604
605/*
606 * The idea is to set a period in which each task runs once.
607 *
608 * When there are too many tasks (sched_nr_latency) we have to stretch
609 * this period because otherwise the slices get too small.
610 *
611 * p = (nr <= nl) ? l : l*nr/nl
612 */
613static u64 __sched_period(unsigned long nr_running)
614{
615 u64 period = sysctl_sched_latency;
616 unsigned long nr_latency = sched_nr_latency;
617
618 if (unlikely(nr_running > nr_latency)) {
619 period = sysctl_sched_min_granularity;
620 period *= nr_running;
621 }
622
623 return period;
624}
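/*
 * Worked example of the period formula above, using the default
 * unscaled tunables as assumptions (latency 6ms, min granularity
 * 0.75ms, nr_latency 8): up to 8 runnable tasks share the 6ms period,
 * beyond that the period stretches to nr_running * 0.75ms.
 */
#include <stdio.h>

static unsigned long long period_ns(unsigned long nr_running)
{
	unsigned long long latency = 6000000ULL;	/* ns */
	unsigned long long min_gran = 750000ULL;	/* ns */
	unsigned long nr_latency = 8;

	return nr_running <= nr_latency ? latency : min_gran * nr_running;
}

int main(void)
{
	printf("4 tasks : %llu ns\n", period_ns(4));	/* 6000000 */
	printf("12 tasks: %llu ns\n", period_ns(12));	/* 9000000 */
	return 0;
}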
625
626/*
627 * We calculate the wall-time slice from the period by taking a part
628 * proportional to the weight.
629 *
630 * s = p*P[w/rw]
631 */
632static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
633{
634 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
635
636 for_each_sched_entity(se) {
637 struct load_weight *load;
638 struct load_weight lw;
639
640 cfs_rq = cfs_rq_of(se);
641 load = &cfs_rq->load;
642
643 if (unlikely(!se->on_rq)) {
644 lw = cfs_rq->load;
645
646 update_load_add(&lw, se->load.weight);
647 load = &lw;
648 }
649 slice = calc_delta_mine(slice, se->load.weight, load);
650 }
651 return slice;
652}
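/*
 * Numeric sketch of the s = p * w/rw split above with assumed weights:
 * a nice-0 task (weight 1024) sharing a 6ms period with another nice-0
 * task and a nice-5 task (weight 335) gets a slice proportional to its
 * share of the total runqueue weight.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long period = 6000000ULL;		/* ns, from the period above */
	unsigned long w = 1024;				/* this task */
	unsigned long rw = 1024 + 1024 + 335;		/* whole runqueue */

	printf("slice = %llu ns\n", period * w / rw);	/* ~2.58 ms */
	return 0;
}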
653
654/*
655 * We calculate the vruntime slice of a to-be-inserted task.
656 *
657 * vs = s/w
658 */
659static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
660{
661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
662}
663
664/*
665 * Update the current task's runtime statistics. Skip current tasks that
666 * are not in our scheduling class.
667 */
668static inline void
669__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
670 unsigned long delta_exec)
671{
672 unsigned long delta_exec_weighted;
673
674 schedstat_set(curr->statistics.exec_max,
675 max((u64)delta_exec, curr->statistics.exec_max));
676
677 curr->sum_exec_runtime += delta_exec;
678 schedstat_add(cfs_rq, exec_clock, delta_exec);
679 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
680
681 curr->vruntime += delta_exec_weighted;
682 update_min_vruntime(cfs_rq);
683}
684
685static void update_curr(struct cfs_rq *cfs_rq)
686{
687 struct sched_entity *curr = cfs_rq->curr;
688 u64 now = rq_of(cfs_rq)->clock_task;
689 unsigned long delta_exec;
690
691 if (unlikely(!curr))
692 return;
693
694 /*
695 * Get the amount of time the current task was running
696 * since the last time we changed load (this cannot
697 * overflow on 32 bits):
698 */
699 delta_exec = (unsigned long)(now - curr->exec_start);
700 if (!delta_exec)
701 return;
702
703 __update_curr(cfs_rq, curr, delta_exec);
704 curr->exec_start = now;
705
706 if (entity_is_task(curr)) {
707 struct task_struct *curtask = task_of(curr);
708
709 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
710 cpuacct_charge(curtask, delta_exec);
711 account_group_exec_runtime(curtask, delta_exec);
712 }
713
714 account_cfs_rq_runtime(cfs_rq, delta_exec);
715}
716
717static inline void
718update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
719{
720 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
721}
722
723/*
724 * Task is being enqueued - update stats:
725 */
726static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
727{
728 /*
729 * Are we enqueueing a waiting task? (for current tasks
730 * a dequeue/enqueue event is a NOP)
731 */
732 if (se != cfs_rq->curr)
733 update_stats_wait_start(cfs_rq, se);
734}
735
736static void
737update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
738{
739 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
740 rq_of(cfs_rq)->clock - se->statistics.wait_start));
741 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
742 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
743 rq_of(cfs_rq)->clock - se->statistics.wait_start);
744#ifdef CONFIG_SCHEDSTATS
745 if (entity_is_task(se)) {
746 trace_sched_stat_wait(task_of(se),
747 rq_of(cfs_rq)->clock - se->statistics.wait_start);
748 }
749#endif
750 schedstat_set(se->statistics.wait_start, 0);
751}
752
753static inline void
754update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
755{
756 /*
757 * Mark the end of the wait period if dequeueing a
758 * waiting task:
759 */
760 if (se != cfs_rq->curr)
761 update_stats_wait_end(cfs_rq, se);
762}
763
764/*
765 * We are picking a new current task - update its stats:
766 */
767static inline void
768update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
769{
770 /*
771 * We are starting a new run period:
772 */
773 se->exec_start = rq_of(cfs_rq)->clock_task;
774}
775
776/**************************************************
777 * Scheduling class queueing methods:
778 */
779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
811void task_numa_fault(int node, int pages, bool migrated)
812{
813 struct task_struct *p = current;
814
815 if (!sched_feat_numa(NUMA))
816 return;
817
818 /* FIXME: Allocate task-specific structure for placement policy here */
819
820 /*
821 * If pages are properly placed (did not migrate) then scan slower.
822 * This is reset periodically in case of phase changes
823 */
824 if (!migrated)
825 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 p->numa_scan_period + jiffies_to_msecs(10));
827
828 task_numa_placement(p);
829}
830
831static void reset_ptenuma_scan(struct task_struct *p)
832{
833 ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 p->mm->numa_scan_offset = 0;
835}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
841void task_numa_work(struct callback_head *work)
842{
843 unsigned long migrate, next_scan, now = jiffies;
844 struct task_struct *p = current;
845 struct mm_struct *mm = p->mm;
846 struct vm_area_struct *vma;
847 unsigned long start, end;
848 long pages;
849
850 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851
852 work->next = work; /* protect against double add */
853 /*
854 * Who cares about NUMA placement when they're dying.
855 *
856 * NOTE: make sure not to dereference p->mm before this check,
857 * exit_task_work() happens _after_ exit_mm() so we could be called
858 * without p->mm even though we still had it when we enqueued this
859 * work.
860 */
861 if (p->flags & PF_EXITING)
862 return;
863
864 /*
865 * We do not care about task placement until a task runs on a node
866 * other than the first one used by the address space. This is
867 * largely because migrations are driven by what CPU the task
868 * is running on. If it's never scheduled on another node, it'll
869 * not migrate so why bother trapping the fault.
870 */
871 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 mm->first_nid = numa_node_id();
873 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 /* Are we running on a new node yet? */
875 if (numa_node_id() == mm->first_nid &&
876 !sched_feat_numa(NUMA_FORCE))
877 return;
878
879 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 }
881
882 /*
883 * Reset the scan period if enough time has gone by. Objective is that
884 * scanning will be reduced if pages are properly placed. As tasks
885 * can enter different phases this needs to be re-examined. Lacking
886 * proper tracking of reference behaviour, this blunt hammer is used.
887 */
888 migrate = mm->numa_next_reset;
889 if (time_after(now, migrate)) {
890 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 xchg(&mm->numa_next_reset, next_scan);
893 }
894
895 /*
896 * Enforce maximal scan/migration frequency..
897 */
898 migrate = mm->numa_next_scan;
899 if (time_before(now, migrate))
900 return;
901
902 if (p->numa_scan_period == 0)
903 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904
905 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 return;
908
909 /*
910 * Do not set pte_numa if the current running node is rate-limited.
911 * This loses statistics on the fault but if we are unwilling to
912	 * migrate to this node, it is less likely we can do useful work.
913 */
914 if (migrate_ratelimited(numa_node_id()))
915 return;
916
917 start = mm->numa_scan_offset;
918 pages = sysctl_numa_balancing_scan_size;
919 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 if (!pages)
921 return;
922
923 down_read(&mm->mmap_sem);
924 vma = find_vma(mm, start);
925 if (!vma) {
926 reset_ptenuma_scan(p);
927 start = 0;
928 vma = mm->mmap;
929 }
930 for (; vma; vma = vma->vm_next) {
931 if (!vma_migratable(vma))
932 continue;
933
934 /* Skip small VMAs. They are not likely to be of relevance */
935 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 continue;
937
938 do {
939 start = max(start, vma->vm_start);
940 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 end = min(end, vma->vm_end);
942 pages -= change_prot_numa(vma, start, end);
943
944 start = end;
945 if (pages <= 0)
946 goto out;
947 } while (end != vma->vm_end);
948 }
949
950out:
951 /*
952 * It is possible to reach the end of the VMA list but the last few VMAs are
953	 * not guaranteed to be vma_migratable. If they are not, we would find the
954 * !migratable VMA on the next scan but not reset the scanner to the start
955 * so check it now.
956 */
957 if (vma)
958 mm->numa_scan_offset = start;
959 else
960 reset_ptenuma_scan(p);
961 up_read(&mm->mmap_sem);
962}
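/*
 * Quick sketch of the scan-window arithmetic above, with assumed
 * defaults: the 256MB sysctl is converted to base pages via the
 * "<< (20 - PAGE_SHIFT)" step, and those pages are then walked in
 * HPAGE_SIZE-aligned chunks across the migratable VMAs.
 */
#include <stdio.h>

int main(void)
{
	unsigned long scan_size_mb = 256;	/* sysctl_numa_balancing_scan_size */
	unsigned long page_shift = 12;		/* 4K pages, assumed */
	unsigned long pages = scan_size_mb << (20 - page_shift);

	printf("pages per scan window: %lu\n", pages);		/* 65536 */
	printf("bytes per scan window: %lu MB\n",
	       (pages << page_shift) >> 20);			/* 256 */
	return 0;
}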
963
964/*
965 * Drive the periodic memory faults..
966 */
967void task_tick_numa(struct rq *rq, struct task_struct *curr)
968{
969 struct callback_head *work = &curr->numa_work;
970 u64 period, now;
971
972 /*
973 * We don't care about NUMA placement if we don't have memory.
974 */
975 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 return;
977
978 /*
979 * Using runtime rather than walltime has the dual advantage that
980 * we (mostly) drive the selection from busy threads and that the
981 * task needs to have done some actual work before we bother with
982 * NUMA placement.
983 */
984 now = curr->se.sum_exec_runtime;
985 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986
987 if (now - curr->node_stamp > period) {
988 if (!curr->node_stamp)
989 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 curr->node_stamp = now;
991
992 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 task_work_add(curr, work, true);
995 }
996 }
997}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
1003
1004static void
1005account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1006{
1007 update_load_add(&cfs_rq->load, se->load.weight);
1008 if (!parent_entity(se))
1009 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1010#ifdef CONFIG_SMP
1011 if (entity_is_task(se))
1012 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1013#endif
1014 cfs_rq->nr_running++;
1015}
1016
1017static void
1018account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1019{
1020 update_load_sub(&cfs_rq->load, se->load.weight);
1021 if (!parent_entity(se))
1022 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1023 if (entity_is_task(se))
1024 list_del_init(&se->group_node);
1025 cfs_rq->nr_running--;
1026}
1027
1028#ifdef CONFIG_FAIR_GROUP_SCHED
1029# ifdef CONFIG_SMP
1030static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1031{
1032 long tg_weight;
1033
1034 /*
1035 * Use this CPU's actual weight instead of the last load_contribution
1036 * to gain a more accurate current total weight. See
1037 * update_cfs_rq_load_contribution().
1038 */
1039 tg_weight = atomic64_read(&tg->load_avg);
1040 tg_weight -= cfs_rq->tg_load_contrib;
1041 tg_weight += cfs_rq->load.weight;
1042
1043 return tg_weight;
1044}
1045
1046static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1047{
1048 long tg_weight, load, shares;
1049
1050 tg_weight = calc_tg_weight(tg, cfs_rq);
1051 load = cfs_rq->load.weight;
1052
1053 shares = (tg->shares * load);
1054 if (tg_weight)
1055 shares /= tg_weight;
1056
1057 if (shares < MIN_SHARES)
1058 shares = MIN_SHARES;
1059 if (shares > tg->shares)
1060 shares = tg->shares;
1061
1062 return shares;
1063}
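/*
 * Stand-alone sketch of the share calculation above with assumed
 * numbers: a group with tg->shares = 1024 whose load is split 3:1
 * between two CPUs hands roughly three quarters of its shares to the
 * busier cfs_rq, clamped to [MIN_SHARES, tg->shares].
 */
#include <stdio.h>

int main(void)
{
	long tg_shares = 1024;
	long this_load = 3072;			/* this cfs_rq's load.weight */
	long tg_weight = 3072 + 1024;		/* assumed total across CPUs */
	long min_shares = 2;
	long shares = tg_shares * this_load / tg_weight;

	if (shares < min_shares)
		shares = min_shares;
	if (shares > tg_shares)
		shares = tg_shares;

	printf("shares on this cpu: %ld\n", shares);	/* 768 */
	return 0;
}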
1064# else /* CONFIG_SMP */
1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1066{
1067 return tg->shares;
1068}
1069# endif /* CONFIG_SMP */
1070static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1071 unsigned long weight)
1072{
1073 if (se->on_rq) {
1074 /* commit outstanding execution time */
1075 if (cfs_rq->curr == se)
1076 update_curr(cfs_rq);
1077 account_entity_dequeue(cfs_rq, se);
1078 }
1079
1080 update_load_set(&se->load, weight);
1081
1082 if (se->on_rq)
1083 account_entity_enqueue(cfs_rq, se);
1084}
1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
1088static void update_cfs_shares(struct cfs_rq *cfs_rq)
1089{
1090 struct task_group *tg;
1091 struct sched_entity *se;
1092 long shares;
1093
1094 tg = cfs_rq->tg;
1095 se = tg->se[cpu_of(rq_of(cfs_rq))];
1096 if (!se || throttled_hierarchy(cfs_rq))
1097 return;
1098#ifndef CONFIG_SMP
1099 if (likely(se->load.weight == tg->shares))
1100 return;
1101#endif
1102 shares = calc_cfs_shares(cfs_rq, tg);
1103
1104 reweight_entity(cfs_rq_of(se), se, shares);
1105}
1106#else /* CONFIG_FAIR_GROUP_SCHED */
1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1108{
1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
1111
1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
1173}
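/*
 * Sketch of the decay above with assumed inputs: y is chosen so that
 * y^32 = 1/2, so a contribution of 1024 decayed by 32 periods halves
 * to 512, while 16 periods multiply by runnable_avg_yN_inv[16]
 * (0xb504f333, i.e. ~2^-0.5) and shift down by 32.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t val = 1024;
	uint32_t y_pow_16_inv = 0xb504f333u;	/* from the table above */

	printf("decayed by 32 periods: %llu\n",
	       (unsigned long long)(val >> 1));				/* 512 */
	printf("decayed by 16 periods: %llu\n",
	       (unsigned long long)((val * y_pow_16_inv) >> 32));	/* 724 */
	return 0;
}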
1174
1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
1201}
1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
1215 * We then designate the fractions u_i as our co-efficients, yielding the
1216 * following representation of historical load:
1217 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218 *
1219 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
1231static __always_inline int __update_entity_runnable_avg(u64 now,
1232 struct sched_avg *sa,
1233 int runnable)
1234{
1235 u64 delta, periods;
1236 u32 runnable_contrib;
1237 int delta_w, decayed = 0;
1238
1239 delta = now - sa->last_runnable_update;
1240 /*
1241 * This should only happen when time goes backwards, which it
1242 * unfortunately does during sched clock init when we swap over to TSC.
1243 */
1244 if ((s64)delta < 0) {
1245 sa->last_runnable_update = now;
1246 return 0;
1247 }
1248
1249 /*
1250 * Use 1024ns as the unit of measurement since it's a reasonable
1251 * approximation of 1us and fast to compute.
1252 */
1253 delta >>= 10;
1254 if (!delta)
1255 return 0;
1256 sa->last_runnable_update = now;
1257
1258 /* delta_w is the amount already accumulated against our next period */
1259 delta_w = sa->runnable_avg_period % 1024;
1260 if (delta + delta_w >= 1024) {
1261 /* period roll-over */
1262 decayed = 1;
1263
1264 /*
1265 * Now that we know we're crossing a period boundary, figure
1266 * out how much from delta we need to complete the current
1267 * period and accrue it.
1268 */
1269 delta_w = 1024 - delta_w;
1270 if (runnable)
1271 sa->runnable_avg_sum += delta_w;
1272 sa->runnable_avg_period += delta_w;
1273
1274 delta -= delta_w;
1275
1276 /* Figure out how many additional periods this update spans */
1277 periods = delta / 1024;
1278 delta %= 1024;
1279
1280 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 periods + 1);
1282 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 periods + 1);
1284
1285 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 runnable_contrib = __compute_runnable_contrib(periods);
1287 if (runnable)
1288 sa->runnable_avg_sum += runnable_contrib;
1289 sa->runnable_avg_period += runnable_contrib;
1290 }
1291
1292 /* Remainder of delta accrued against u_0` */
1293 if (runnable)
1294 sa->runnable_avg_sum += delta;
1295 sa->runnable_avg_period += delta;
1296
1297 return decayed;
1298}
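/*
 * Floating-point sketch (not the fixed-point kernel code) of the
 * recurrence implemented above: each 1024us period multiplies the
 * previous sum by y (with y^32 = 0.5) and adds the new period's
 * runnable time, so an always-runnable entity converges towards
 * LOAD_AVG_MAX (47742 up to fixed-point truncation).
 */
#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	double sum = 0.0;
	int period;

	for (period = 0; period < 345; period++)	/* LOAD_AVG_MAX_N periods */
		sum = sum * y + 1024.0;			/* fully runnable */

	printf("converged runnable sum: %.0f\n", sum);
	return 0;
}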
1299
1300/* Synchronize an entity's decay with its parenting cfs_rq.*/
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
1314}
1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
1353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
1357 int runnable_avg;
1358
1359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
1364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
1371	 * fortunately the sum of each cpu's contribution forms a usable
1372 * lower-bound on the true value.
1373 *
1374 * Consider the aggregate of 2 contributions. Either they are disjoint
1375	 * (and the sum represents the true value) or they overlap and we are
1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
1379	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
1382	 * On a small machine, the first term is well-bounded, which bounds the
1383	 * total error since w_i is a subset of the period. Whereas on a
1384	 * larger machine, while this first term can be larger, if w_i is of
1385	 * consequential size it is guaranteed to see n_i*w_i quickly converge to
1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
1393}
1394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
1397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
1399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
1417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
1420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 __update_group_entity_contrib(se);
1422 }
1423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
1471 * their contribution may be appropriately discounted when they wake up.
1472 */
1473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474{
1475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
1479 if (!decays && !force_update)
1480 return;
1481
1482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
1486
1487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
1493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495}
1496
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509	 * We track migrations using entity decay_count <= 0; on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
1545
1546/*
1547 * Remove se's load from this cfs_rq's child load-average. If the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
1551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 struct sched_entity *se,
1553 int sleep)
1554{
1555 update_entity_load_avg(se, 1);
1556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558
1559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564}
1565#else
1566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
1568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int wakeup) {}
1572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 struct sched_entity *se,
1574 int sleep) {}
1575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
1577#endif
1578
1579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1580{
1581#ifdef CONFIG_SCHEDSTATS
1582 struct task_struct *tsk = NULL;
1583
1584 if (entity_is_task(se))
1585 tsk = task_of(se);
1586
1587 if (se->statistics.sleep_start) {
1588 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1589
1590 if ((s64)delta < 0)
1591 delta = 0;
1592
1593 if (unlikely(delta > se->statistics.sleep_max))
1594 se->statistics.sleep_max = delta;
1595
1596 se->statistics.sleep_start = 0;
1597 se->statistics.sum_sleep_runtime += delta;
1598
1599 if (tsk) {
1600 account_scheduler_latency(tsk, delta >> 10, 1);
1601 trace_sched_stat_sleep(tsk, delta);
1602 }
1603 }
1604 if (se->statistics.block_start) {
1605 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1606
1607 if ((s64)delta < 0)
1608 delta = 0;
1609
1610 if (unlikely(delta > se->statistics.block_max))
1611 se->statistics.block_max = delta;
1612
1613 se->statistics.block_start = 0;
1614 se->statistics.sum_sleep_runtime += delta;
1615
1616 if (tsk) {
1617 if (tsk->in_iowait) {
1618 se->statistics.iowait_sum += delta;
1619 se->statistics.iowait_count++;
1620 trace_sched_stat_iowait(tsk, delta);
1621 }
1622
1623 trace_sched_stat_blocked(tsk, delta);
1624
1625 /*
1626 * Blocking time is in units of nanosecs, so shift by
1627 * 20 to get a milliseconds-range estimation of the
1628 * amount of time that the task spent sleeping:
1629 */
1630 if (unlikely(prof_on == SLEEP_PROFILING)) {
1631 profile_hits(SLEEP_PROFILING,
1632 (void *)get_wchan(tsk),
1633 delta >> 20);
1634 }
1635 account_scheduler_latency(tsk, delta >> 10, 0);
1636 }
1637 }
1638#endif
1639}
1640
1641static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1642{
1643#ifdef CONFIG_SCHED_DEBUG
1644 s64 d = se->vruntime - cfs_rq->min_vruntime;
1645
1646 if (d < 0)
1647 d = -d;
1648
1649 if (d > 3*sysctl_sched_latency)
1650 schedstat_inc(cfs_rq, nr_spread_over);
1651#endif
1652}
1653
1654static void
1655place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1656{
1657 u64 vruntime = cfs_rq->min_vruntime;
1658
1659 /*
1660	 * The 'current' period is already promised to the current tasks;
1661	 * however, the extra weight of the new task will slow them down a
1662	 * little, so place the new task so that it fits in the slot that
1663	 * stays open at the end.
1664 */
1665 if (initial && sched_feat(START_DEBIT))
1666 vruntime += sched_vslice(cfs_rq, se);
1667
1668 /* sleeps up to a single latency don't count. */
1669 if (!initial) {
1670 unsigned long thresh = sysctl_sched_latency;
1671
1672 /*
1673 * Halve their sleep time's effect, to allow
1674 * for a gentler effect of sleepers:
1675 */
1676 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1677 thresh >>= 1;
1678
1679 vruntime -= thresh;
1680 }
1681
1682 /* ensure we never gain time by being placed backwards. */
1683 vruntime = max_vruntime(se->vruntime, vruntime);
1684
1685 se->vruntime = vruntime;
1686}
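/*
 * Numeric sketch of the placement rules above, with assumed defaults:
 * a newly forked task is debited one vslice past min_vruntime
 * (START_DEBIT), while a waking sleeper is credited up to half a
 * latency before it (GENTLE_FAIR_SLEEPERS); max_vruntime() then makes
 * sure nobody is placed behind its old vruntime.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 100000000ULL;	/* assumed cfs_rq value */
	unsigned long long vslice = 3000000ULL;		/* assumed sched_vslice() */
	unsigned long long latency = 6000000ULL;	/* sysctl_sched_latency */

	printf("forked task  : %llu\n", min_vruntime + vslice);
	printf("woken sleeper: %llu\n", min_vruntime - latency / 2);
	return 0;
}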
1687
1688static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1689
1690static void
1691enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1692{
1693 /*
1694 * Update the normalized vruntime before updating min_vruntime
1695	 * through calling update_curr().
1696 */
1697 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1698 se->vruntime += cfs_rq->min_vruntime;
1699
1700 /*
1701 * Update run-time statistics of the 'current'.
1702 */
1703 update_curr(cfs_rq);
1704 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1705 account_entity_enqueue(cfs_rq, se);
1706 update_cfs_shares(cfs_rq);
1707
1708 if (flags & ENQUEUE_WAKEUP) {
1709 place_entity(cfs_rq, se, 0);
1710 enqueue_sleeper(cfs_rq, se);
1711 }
1712
1713 update_stats_enqueue(cfs_rq, se);
1714 check_spread(cfs_rq, se);
1715 if (se != cfs_rq->curr)
1716 __enqueue_entity(cfs_rq, se);
1717 se->on_rq = 1;
1718
1719 if (cfs_rq->nr_running == 1) {
1720 list_add_leaf_cfs_rq(cfs_rq);
1721 check_enqueue_throttle(cfs_rq);
1722 }
1723}
1724
1725static void __clear_buddies_last(struct sched_entity *se)
1726{
1727 for_each_sched_entity(se) {
1728 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1729 if (cfs_rq->last == se)
1730 cfs_rq->last = NULL;
1731 else
1732 break;
1733 }
1734}
1735
1736static void __clear_buddies_next(struct sched_entity *se)
1737{
1738 for_each_sched_entity(se) {
1739 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1740 if (cfs_rq->next == se)
1741 cfs_rq->next = NULL;
1742 else
1743 break;
1744 }
1745}
1746
1747static void __clear_buddies_skip(struct sched_entity *se)
1748{
1749 for_each_sched_entity(se) {
1750 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1751 if (cfs_rq->skip == se)
1752 cfs_rq->skip = NULL;
1753 else
1754 break;
1755 }
1756}
1757
1758static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1759{
1760 if (cfs_rq->last == se)
1761 __clear_buddies_last(se);
1762
1763 if (cfs_rq->next == se)
1764 __clear_buddies_next(se);
1765
1766 if (cfs_rq->skip == se)
1767 __clear_buddies_skip(se);
1768}
1769
1770static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1771
1772static void
1773dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1774{
1775 /*
1776 * Update run-time statistics of the 'current'.
1777 */
1778 update_curr(cfs_rq);
1779 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1780
1781 update_stats_dequeue(cfs_rq, se);
1782 if (flags & DEQUEUE_SLEEP) {
1783#ifdef CONFIG_SCHEDSTATS
1784 if (entity_is_task(se)) {
1785 struct task_struct *tsk = task_of(se);
1786
1787 if (tsk->state & TASK_INTERRUPTIBLE)
1788 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1789 if (tsk->state & TASK_UNINTERRUPTIBLE)
1790 se->statistics.block_start = rq_of(cfs_rq)->clock;
1791 }
1792#endif
1793 }
1794
1795 clear_buddies(cfs_rq, se);
1796
1797 if (se != cfs_rq->curr)
1798 __dequeue_entity(cfs_rq, se);
1799 se->on_rq = 0;
1800 account_entity_dequeue(cfs_rq, se);
1801
1802 /*
1803 * Normalize the entity after updating the min_vruntime because the
1804 * update can refer to the ->curr item and we need to reflect this
1805 * movement in our normalized position.
1806 */
1807 if (!(flags & DEQUEUE_SLEEP))
1808 se->vruntime -= cfs_rq->min_vruntime;
1809
1810 /* return excess runtime on last dequeue */
1811 return_cfs_rq_runtime(cfs_rq);
1812
1813 update_min_vruntime(cfs_rq);
1814 update_cfs_shares(cfs_rq);
1815}
1816
1817/*
1818 * Preempt the current task with a newly woken task if needed:
1819 */
1820static void
1821check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1822{
1823 unsigned long ideal_runtime, delta_exec;
1824 struct sched_entity *se;
1825 s64 delta;
1826
1827 ideal_runtime = sched_slice(cfs_rq, curr);
1828 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1829 if (delta_exec > ideal_runtime) {
1830 resched_task(rq_of(cfs_rq)->curr);
1831 /*
1832		 * The current task ran long enough; ensure it doesn't get
1833 * re-elected due to buddy favours.
1834 */
1835 clear_buddies(cfs_rq, curr);
1836 return;
1837 }
1838
1839 /*
1840 * Ensure that a task that missed wakeup preemption by a
1841 * narrow margin doesn't have to wait for a full slice.
1842 * This also mitigates buddy induced latencies under load.
1843 */
1844 if (delta_exec < sysctl_sched_min_granularity)
1845 return;
1846
1847 se = __pick_first_entity(cfs_rq);
1848 delta = curr->vruntime - se->vruntime;
1849
1850 if (delta < 0)
1851 return;
1852
1853 if (delta > ideal_runtime)
1854 resched_task(rq_of(cfs_rq)->curr);
1855}
1856
1857static void
1858set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1859{
1860 /* 'current' is not kept within the tree. */
1861 if (se->on_rq) {
1862 /*
1863		 * Any task has to be enqueued before it gets to execute on
1864 * a CPU. So account for the time it spent waiting on the
1865 * runqueue.
1866 */
1867 update_stats_wait_end(cfs_rq, se);
1868 __dequeue_entity(cfs_rq, se);
1869 }
1870
1871 update_stats_curr_start(cfs_rq, se);
1872 cfs_rq->curr = se;
1873#ifdef CONFIG_SCHEDSTATS
1874 /*
1875 * Track our maximum slice length, if the CPU's load is at
1876	 * least twice that of our own weight (i.e. don't track it
1877 * when there are only lesser-weight tasks around):
1878 */
1879 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1880 se->statistics.slice_max = max(se->statistics.slice_max,
1881 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1882 }
1883#endif
1884 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1885}
1886
1887static int
1888wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1889
1890/*
1891 * Pick the next process, keeping these things in mind, in this order:
1892 * 1) keep things fair between processes/task groups
1893 * 2) pick the "next" process, since someone really wants that to run
1894 * 3) pick the "last" process, for cache locality
1895 * 4) do not run the "skip" process, if something else is available
1896 */
1897static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1898{
1899 struct sched_entity *se = __pick_first_entity(cfs_rq);
1900 struct sched_entity *left = se;
1901
1902 /*
1903 * Avoid running the skip buddy, if running something else can
1904 * be done without getting too unfair.
1905 */
1906 if (cfs_rq->skip == se) {
1907 struct sched_entity *second = __pick_next_entity(se);
1908 if (second && wakeup_preempt_entity(second, left) < 1)
1909 se = second;
1910 }
1911
1912 /*
1913 * Prefer last buddy, try to return the CPU to a preempted task.
1914 */
1915 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1916 se = cfs_rq->last;
1917
1918 /*
1919 * Someone really wants this to run. If it's not unfair, run it.
1920 */
1921 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1922 se = cfs_rq->next;
1923
1924 clear_buddies(cfs_rq, se);
1925
1926 return se;
1927}
1928
1929static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1930
1931static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1932{
1933 /*
1934 * If still on the runqueue then deactivate_task()
1935 * was not called and update_curr() has to be done:
1936 */
1937 if (prev->on_rq)
1938 update_curr(cfs_rq);
1939
1940 /* throttle cfs_rqs exceeding runtime */
1941 check_cfs_rq_runtime(cfs_rq);
1942
1943 check_spread(cfs_rq, prev);
1944 if (prev->on_rq) {
1945 update_stats_wait_start(cfs_rq, prev);
1946 /* Put 'current' back into the tree. */
1947 __enqueue_entity(cfs_rq, prev);
1948 /* in !on_rq case, update occurred at dequeue */
1949 update_entity_load_avg(prev, 1);
1950 }
1951 cfs_rq->curr = NULL;
1952}
1953
1954static void
1955entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1956{
1957 /*
1958 * Update run-time statistics of the 'current'.
1959 */
1960 update_curr(cfs_rq);
1961
1962 /*
1963 * Ensure that runnable average is periodically updated.
1964 */
1965 update_entity_load_avg(curr, 1);
1966 update_cfs_rq_blocked_load(cfs_rq, 1);
1967
1968#ifdef CONFIG_SCHED_HRTICK
1969 /*
1970 * queued ticks are scheduled to match the slice, so don't bother
1971 * validating it and just reschedule.
1972 */
1973 if (queued) {
1974 resched_task(rq_of(cfs_rq)->curr);
1975 return;
1976 }
1977 /*
1978 * don't let the period tick interfere with the hrtick preemption
1979 */
1980 if (!sched_feat(DOUBLE_TICK) &&
1981 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1982 return;
1983#endif
1984
1985 if (cfs_rq->nr_running > 1)
1986 check_preempt_tick(cfs_rq, curr);
1987}
1988
1989
1990/**************************************************
1991 * CFS bandwidth control machinery
1992 */
1993
1994#ifdef CONFIG_CFS_BANDWIDTH
1995
1996#ifdef HAVE_JUMP_LABEL
1997static struct static_key __cfs_bandwidth_used;
1998
1999static inline bool cfs_bandwidth_used(void)
2000{
2001 return static_key_false(&__cfs_bandwidth_used);
2002}
2003
2004void account_cfs_bandwidth_used(int enabled, int was_enabled)
2005{
2006 /* only need to count groups transitioning between enabled/!enabled */
2007 if (enabled && !was_enabled)
2008 static_key_slow_inc(&__cfs_bandwidth_used);
2009 else if (!enabled && was_enabled)
2010 static_key_slow_dec(&__cfs_bandwidth_used);
2011}
2012#else /* HAVE_JUMP_LABEL */
2013static bool cfs_bandwidth_used(void)
2014{
2015 return true;
2016}
2017
2018void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2019#endif /* HAVE_JUMP_LABEL */
2020
2021/*
2022 * default period for cfs group bandwidth.
2023 * default: 0.1s, units: nanoseconds
2024 */
2025static inline u64 default_cfs_period(void)
2026{
2027 return 100000000ULL;
2028}
2029
2030static inline u64 sched_cfs_bandwidth_slice(void)
2031{
2032 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2033}
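
/*
 * Worked example (illustrative; the interface names below belong to the
 * cgroup cpu controller, not to this file): quota and period are exposed
 * as cpu.cfs_quota_us and cpu.cfs_period_us. With the 100ms default period
 * above and e.g. a 50ms quota, the group is capped at roughly half a CPU
 * per period. Runtime is then handed to individual cfs_rqs in slices of
 * sched_cfs_bandwidth_slice(), typically 5ms with the default sysctl.
 */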
2034
2035/*
2036 * Replenish runtime according to assigned quota and update expiration time.
2037 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2038 * additional synchronization around rq->lock.
2039 *
2040 * requires cfs_b->lock
2041 */
2042void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2043{
2044 u64 now;
2045
2046 if (cfs_b->quota == RUNTIME_INF)
2047 return;
2048
2049 now = sched_clock_cpu(smp_processor_id());
2050 cfs_b->runtime = cfs_b->quota;
2051 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2052}
2053
2054static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2055{
2056 return &tg->cfs_bandwidth;
2057}
2058
2059 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2060static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2061{
2062 if (unlikely(cfs_rq->throttle_count))
2063 return cfs_rq->throttled_clock_task;
2064
2065 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2066}
2067
2068/* returns 0 on failure to allocate runtime */
2069static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2070{
2071 struct task_group *tg = cfs_rq->tg;
2072 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2073 u64 amount = 0, min_amount, expires;
2074
2075 /* note: this is a positive sum as runtime_remaining <= 0 */
2076 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2077
2078 raw_spin_lock(&cfs_b->lock);
2079 if (cfs_b->quota == RUNTIME_INF)
2080 amount = min_amount;
2081 else {
2082 /*
2083 * If the bandwidth pool has become inactive, then at least one
2084 * period must have elapsed since the last consumption.
2085 * Refresh the global state and ensure bandwidth timer becomes
2086 * active.
2087 */
2088 if (!cfs_b->timer_active) {
2089 __refill_cfs_bandwidth_runtime(cfs_b);
2090 __start_cfs_bandwidth(cfs_b);
2091 }
2092
2093 if (cfs_b->runtime > 0) {
2094 amount = min(cfs_b->runtime, min_amount);
2095 cfs_b->runtime -= amount;
2096 cfs_b->idle = 0;
2097 }
2098 }
2099 expires = cfs_b->runtime_expires;
2100 raw_spin_unlock(&cfs_b->lock);
2101
2102 cfs_rq->runtime_remaining += amount;
2103 /*
2104 * we may have advanced our local expiration to account for allowed
2105 * spread between our sched_clock and the one on which runtime was
2106 * issued.
2107 */
2108 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2109 cfs_rq->runtime_expires = expires;
2110
2111 return cfs_rq->runtime_remaining > 0;
2112}
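
/*
 * Worked example for min_amount above (illustrative values): if this
 * cfs_rq has overdrawn its local pool by 2ms (runtime_remaining == -2ms)
 * and the slice is 5ms, we request 5ms - (-2ms) = 7ms from the global
 * pool, enough to clear the local debt and still leave one full slice.
 */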
2113
2114/*
2115 * Note: This depends on the synchronization provided by sched_clock and the
2116 * fact that rq->clock snapshots this value.
2117 */
2118static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2119{
2120 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2121 struct rq *rq = rq_of(cfs_rq);
2122
2123 /* if the deadline is ahead of our clock, nothing to do */
2124 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2125 return;
2126
2127 if (cfs_rq->runtime_remaining < 0)
2128 return;
2129
2130 /*
2131 * If the local deadline has passed we have to consider the
2132 * possibility that our sched_clock is 'fast' and the global deadline
2133 * has not truly expired.
2134 *
2135 * Fortunately we can determine whether this is the case by checking
2136 * whether the global deadline has advanced.
2137 */
2138
2139 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2140 /* extend local deadline, drift is bounded above by 2 ticks */
2141 cfs_rq->runtime_expires += TICK_NSEC;
2142 } else {
2143 /* global deadline is ahead, expiration has passed */
2144 cfs_rq->runtime_remaining = 0;
2145 }
2146}
2147
2148static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2149 unsigned long delta_exec)
2150{
2151 /* dock delta_exec before expiring quota (as it could span periods) */
2152 cfs_rq->runtime_remaining -= delta_exec;
2153 expire_cfs_rq_runtime(cfs_rq);
2154
2155 if (likely(cfs_rq->runtime_remaining > 0))
2156 return;
2157
2158 /*
2159 * if we're unable to extend our runtime we resched so that the active
2160 * hierarchy can be throttled
2161 */
2162 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2163 resched_task(rq_of(cfs_rq)->curr);
2164}
2165
2166static __always_inline
2167void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2168{
2169 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2170 return;
2171
2172 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2173}
2174
2175static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2176{
2177 return cfs_bandwidth_used() && cfs_rq->throttled;
2178}
2179
2180/* check whether cfs_rq, or any parent, is throttled */
2181static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2182{
2183 return cfs_bandwidth_used() && cfs_rq->throttle_count;
2184}
2185
2186/*
2187 * Ensure that neither of the group entities corresponding to src_cpu or
2188 * dest_cpu are members of a throttled hierarchy when performing group
2189 * load-balance operations.
2190 */
2191static inline int throttled_lb_pair(struct task_group *tg,
2192 int src_cpu, int dest_cpu)
2193{
2194 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2195
2196 src_cfs_rq = tg->cfs_rq[src_cpu];
2197 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2198
2199 return throttled_hierarchy(src_cfs_rq) ||
2200 throttled_hierarchy(dest_cfs_rq);
2201}
2202
2203/* updated child weight may affect parent so we have to do this bottom up */
2204static int tg_unthrottle_up(struct task_group *tg, void *data)
2205{
2206 struct rq *rq = data;
2207 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2208
2209 cfs_rq->throttle_count--;
2210#ifdef CONFIG_SMP
2211 if (!cfs_rq->throttle_count) {
2212 /* adjust cfs_rq_clock_task() */
2213 cfs_rq->throttled_clock_task_time += rq->clock_task -
2214 cfs_rq->throttled_clock_task;
2215 }
2216#endif
2217
2218 return 0;
2219}
2220
2221static int tg_throttle_down(struct task_group *tg, void *data)
2222{
2223 struct rq *rq = data;
2224 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2225
2226 /* group is entering throttled state, stop time */
2227 if (!cfs_rq->throttle_count)
2228 cfs_rq->throttled_clock_task = rq->clock_task;
2229 cfs_rq->throttle_count++;
2230
2231 return 0;
2232}
2233
2234static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2235{
2236 struct rq *rq = rq_of(cfs_rq);
2237 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2238 struct sched_entity *se;
2239 long task_delta, dequeue = 1;
2240
2241 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2242
2243 /* freeze hierarchy runnable averages while throttled */
2244 rcu_read_lock();
2245 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2246 rcu_read_unlock();
2247
2248 task_delta = cfs_rq->h_nr_running;
2249 for_each_sched_entity(se) {
2250 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2251 /* throttled entity or throttle-on-deactivate */
2252 if (!se->on_rq)
2253 break;
2254
2255 if (dequeue)
2256 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2257 qcfs_rq->h_nr_running -= task_delta;
2258
2259 if (qcfs_rq->load.weight)
2260 dequeue = 0;
2261 }
2262
2263 if (!se)
2264 rq->nr_running -= task_delta;
2265
2266 cfs_rq->throttled = 1;
2267 cfs_rq->throttled_clock = rq->clock;
2268 raw_spin_lock(&cfs_b->lock);
2269 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2270 raw_spin_unlock(&cfs_b->lock);
2271}
2272
2273void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2274{
2275 struct rq *rq = rq_of(cfs_rq);
2276 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2277 struct sched_entity *se;
2278 int enqueue = 1;
2279 long task_delta;
2280
2281 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2282
2283 cfs_rq->throttled = 0;
2284 raw_spin_lock(&cfs_b->lock);
2285 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2286 list_del_rcu(&cfs_rq->throttled_list);
2287 raw_spin_unlock(&cfs_b->lock);
2288
2289 update_rq_clock(rq);
2290 /* update hierarchical throttle state */
2291 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2292
2293 if (!cfs_rq->load.weight)
2294 return;
2295
2296 task_delta = cfs_rq->h_nr_running;
2297 for_each_sched_entity(se) {
2298 if (se->on_rq)
2299 enqueue = 0;
2300
2301 cfs_rq = cfs_rq_of(se);
2302 if (enqueue)
2303 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2304 cfs_rq->h_nr_running += task_delta;
2305
2306 if (cfs_rq_throttled(cfs_rq))
2307 break;
2308 }
2309
2310 if (!se)
2311 rq->nr_running += task_delta;
2312
2313 /* determine whether we need to wake up potentially idle cpu */
2314 if (rq->curr == rq->idle && rq->cfs.nr_running)
2315 resched_task(rq->curr);
2316}
2317
2318static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2319 u64 remaining, u64 expires)
2320{
2321 struct cfs_rq *cfs_rq;
2322 u64 runtime = remaining;
2323
2324 rcu_read_lock();
2325 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2326 throttled_list) {
2327 struct rq *rq = rq_of(cfs_rq);
2328
2329 raw_spin_lock(&rq->lock);
2330 if (!cfs_rq_throttled(cfs_rq))
2331 goto next;
2332
2333 runtime = -cfs_rq->runtime_remaining + 1;
2334 if (runtime > remaining)
2335 runtime = remaining;
2336 remaining -= runtime;
2337
2338 cfs_rq->runtime_remaining += runtime;
2339 cfs_rq->runtime_expires = expires;
2340
2341 /* we check whether we're throttled above */
2342 if (cfs_rq->runtime_remaining > 0)
2343 unthrottle_cfs_rq(cfs_rq);
2344
2345next:
2346 raw_spin_unlock(&rq->lock);
2347
2348 if (!remaining)
2349 break;
2350 }
2351 rcu_read_unlock();
2352
2353 return remaining;
2354}
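
/*
 * Illustration of the top-up above (example values): a throttled cfs_rq
 * sitting at runtime_remaining == -3ms receives 3ms + 1ns (capped by what
 * is left in 'remaining'), just enough to make runtime_remaining positive
 * so that the unthrottle check succeeds; whatever is not handed out stays
 * in 'remaining' for the next throttled cfs_rq on the list.
 */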
2355
2356/*
2357 * Responsible for refilling a task_group's bandwidth and unthrottling its
2358 * cfs_rqs as appropriate. If there has been no activity within the last
2359 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2360 * used to track this state.
2361 */
2362static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2363{
2364 u64 runtime, runtime_expires;
2365 int idle = 1, throttled;
2366
2367 raw_spin_lock(&cfs_b->lock);
2368 /* no need to continue the timer with no bandwidth constraint */
2369 if (cfs_b->quota == RUNTIME_INF)
2370 goto out_unlock;
2371
2372 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2373 /* idle depends on !throttled (for the case of a large deficit) */
2374 idle = cfs_b->idle && !throttled;
2375 cfs_b->nr_periods += overrun;
2376
2377 /* if we're going inactive then everything else can be deferred */
2378 if (idle)
2379 goto out_unlock;
2380
2381 __refill_cfs_bandwidth_runtime(cfs_b);
2382
2383 if (!throttled) {
2384 /* mark as potentially idle for the upcoming period */
2385 cfs_b->idle = 1;
2386 goto out_unlock;
2387 }
2388
2389 /* account preceding periods in which throttling occurred */
2390 cfs_b->nr_throttled += overrun;
2391
2392 /*
2393 * There are throttled entities so we must first use the new bandwidth
2394 * to unthrottle them before making it generally available. This
2395 * ensures that all existing debts will be paid before a new cfs_rq is
2396 * allowed to run.
2397 */
2398 runtime = cfs_b->runtime;
2399 runtime_expires = cfs_b->runtime_expires;
2400 cfs_b->runtime = 0;
2401
2402 /*
2403 * This check is repeated as we are holding onto the new bandwidth
2404 * while we unthrottle. This can potentially race with an unthrottled
2405 * group trying to acquire new bandwidth from the global pool.
2406 */
2407 while (throttled && runtime > 0) {
2408 raw_spin_unlock(&cfs_b->lock);
2409 /* we can't nest cfs_b->lock while distributing bandwidth */
2410 runtime = distribute_cfs_runtime(cfs_b, runtime,
2411 runtime_expires);
2412 raw_spin_lock(&cfs_b->lock);
2413
2414 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2415 }
2416
2417 /* return (any) remaining runtime */
2418 cfs_b->runtime = runtime;
2419 /*
2420 * While we are ensured activity in the period following an
2421 * unthrottle, this also covers the case in which the new bandwidth is
2422 * insufficient to cover the existing bandwidth deficit. (Forcing the
2423 * timer to remain active while there are any throttled entities.)
2424 */
2425 cfs_b->idle = 0;
2426out_unlock:
2427 if (idle)
2428 cfs_b->timer_active = 0;
2429 raw_spin_unlock(&cfs_b->lock);
2430
2431 return idle;
2432}
2433
2434/* a cfs_rq won't donate quota below this amount */
2435static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2436/* minimum remaining period time to redistribute slack quota */
2437static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2438/* how long we wait to gather additional slack before distributing */
2439static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2440
2441/* are we near the end of the current quota period? */
2442static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2443{
2444 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2445 u64 remaining;
2446
2447 /* if the call-back is running, a quota refresh is already occurring */
2448 if (hrtimer_callback_running(refresh_timer))
2449 return 1;
2450
2451 /* is a quota refresh about to occur? */
2452 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2453 if (remaining < min_expire)
2454 return 1;
2455
2456 return 0;
2457}
2458
2459static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2460{
2461 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2462
2463 /* if there's a quota refresh soon don't bother with slack */
2464 if (runtime_refresh_within(cfs_b, min_left))
2465 return;
2466
2467 start_bandwidth_timer(&cfs_b->slack_timer,
2468 ns_to_ktime(cfs_bandwidth_slack_period));
2469}
2470
2471/* we know any runtime found here is valid as update_curr() precedes return */
2472static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2473{
2474 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2475 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2476
2477 if (slack_runtime <= 0)
2478 return;
2479
2480 raw_spin_lock(&cfs_b->lock);
2481 if (cfs_b->quota != RUNTIME_INF &&
2482 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2483 cfs_b->runtime += slack_runtime;
2484
2485 /* we are under rq->lock, defer unthrottling using a timer */
2486 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2487 !list_empty(&cfs_b->throttled_cfs_rq))
2488 start_cfs_slack_bandwidth(cfs_b);
2489 }
2490 raw_spin_unlock(&cfs_b->lock);
2491
2492 /* even if it's not valid for return we don't want to try again */
2493 cfs_rq->runtime_remaining -= slack_runtime;
2494}
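
/*
 * Example of the slack path above (illustrative values): a cfs_rq that
 * goes idle holding 4ms of local runtime keeps min_cfs_rq_runtime (1ms)
 * and returns the other 3ms to the global pool; if that leaves the pool
 * above one bandwidth slice while something is still throttled, the slack
 * timer is armed to redistribute it after cfs_bandwidth_slack_period (5ms),
 * unless a regular quota refresh is due sooner.
 */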
2495
2496static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2497{
2498 if (!cfs_bandwidth_used())
2499 return;
2500
2501 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2502 return;
2503
2504 __return_cfs_rq_runtime(cfs_rq);
2505}
2506
2507/*
2508 * This is done with a timer (instead of inline with bandwidth return) since
2509 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2510 */
2511static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2512{
2513 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2514 u64 expires;
2515
2516 /* confirm we're still not at a refresh boundary */
2517 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2518 return;
2519
2520 raw_spin_lock(&cfs_b->lock);
2521 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2522 runtime = cfs_b->runtime;
2523 cfs_b->runtime = 0;
2524 }
2525 expires = cfs_b->runtime_expires;
2526 raw_spin_unlock(&cfs_b->lock);
2527
2528 if (!runtime)
2529 return;
2530
2531 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2532
2533 raw_spin_lock(&cfs_b->lock);
2534 if (expires == cfs_b->runtime_expires)
2535 cfs_b->runtime = runtime;
2536 raw_spin_unlock(&cfs_b->lock);
2537}
2538
2539/*
2540 * When a group wakes up we want to make sure that its quota is not already
2541 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2542 * runtime, as update_curr() throttling can not trigger until it's on-rq.
2543 */
2544static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2545{
2546 if (!cfs_bandwidth_used())
2547 return;
2548
2549 /* an active group must be handled by the update_curr()->put() path */
2550 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2551 return;
2552
2553 /* ensure the group is not already throttled */
2554 if (cfs_rq_throttled(cfs_rq))
2555 return;
2556
2557 /* update runtime allocation */
2558 account_cfs_rq_runtime(cfs_rq, 0);
2559 if (cfs_rq->runtime_remaining <= 0)
2560 throttle_cfs_rq(cfs_rq);
2561}
2562
2563/* conditionally throttle active cfs_rq's from put_prev_entity() */
2564static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2565{
2566 if (!cfs_bandwidth_used())
2567 return;
2568
2569 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2570 return;
2571
2572 /*
2573 * it's possible for a throttled entity to be forced into a running
2574 * state (e.g. set_curr_task); in this case we're finished.
2575 */
2576 if (cfs_rq_throttled(cfs_rq))
2577 return;
2578
2579 throttle_cfs_rq(cfs_rq);
2580}
2581
2582static inline u64 default_cfs_period(void);
2583static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2584static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2585
2586static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2587{
2588 struct cfs_bandwidth *cfs_b =
2589 container_of(timer, struct cfs_bandwidth, slack_timer);
2590 do_sched_cfs_slack_timer(cfs_b);
2591
2592 return HRTIMER_NORESTART;
2593}
2594
2595static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2596{
2597 struct cfs_bandwidth *cfs_b =
2598 container_of(timer, struct cfs_bandwidth, period_timer);
2599 ktime_t now;
2600 int overrun;
2601 int idle = 0;
2602
2603 for (;;) {
2604 now = hrtimer_cb_get_time(timer);
2605 overrun = hrtimer_forward(timer, now, cfs_b->period);
2606
2607 if (!overrun)
2608 break;
2609
2610 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2611 }
2612
2613 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2614}
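
/*
 * Note on the loop above: hrtimer_forward() advances the timer by whole
 * periods past 'now' and returns how many periods were skipped, so
 * 'overrun' can exceed 1 after a long idle stretch.
 * do_sched_cfs_period_timer() uses that count to account every elapsed
 * period (nr_periods, nr_throttled) rather than just the most recent one.
 */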
2615
2616void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2617{
2618 raw_spin_lock_init(&cfs_b->lock);
2619 cfs_b->runtime = 0;
2620 cfs_b->quota = RUNTIME_INF;
2621 cfs_b->period = ns_to_ktime(default_cfs_period());
2622
2623 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2624 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2625 cfs_b->period_timer.function = sched_cfs_period_timer;
2626 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2627 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2628}
2629
2630static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2631{
2632 cfs_rq->runtime_enabled = 0;
2633 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2634}
2635
2636/* requires cfs_b->lock, may release to reprogram timer */
2637void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2638{
2639 /*
2640 * The timer may be active because we're trying to set a new bandwidth
2641 * period or because we're racing with the tear-down path
2642 * (timer_active==0 becomes visible before the hrtimer call-back
2643 * terminates). In either case we ensure that it's re-programmed
2644 */
2645 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2646 raw_spin_unlock(&cfs_b->lock);
2647 /* ensure cfs_b->lock is available while we wait */
2648 hrtimer_cancel(&cfs_b->period_timer);
2649
2650 raw_spin_lock(&cfs_b->lock);
2651 /* if someone else restarted the timer then we're done */
2652 if (cfs_b->timer_active)
2653 return;
2654 }
2655
2656 cfs_b->timer_active = 1;
2657 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2658}
2659
2660static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2661{
2662 hrtimer_cancel(&cfs_b->period_timer);
2663 hrtimer_cancel(&cfs_b->slack_timer);
2664}
2665
2666static void unthrottle_offline_cfs_rqs(struct rq *rq)
2667{
2668 struct cfs_rq *cfs_rq;
2669
2670 for_each_leaf_cfs_rq(rq, cfs_rq) {
2671 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2672
2673 if (!cfs_rq->runtime_enabled)
2674 continue;
2675
2676 /*
2677 * clock_task is not advancing so we just need to make sure
2678 * there's some valid quota amount
2679 */
2680 cfs_rq->runtime_remaining = cfs_b->quota;
2681 if (cfs_rq_throttled(cfs_rq))
2682 unthrottle_cfs_rq(cfs_rq);
2683 }
2684}
2685
2686#else /* CONFIG_CFS_BANDWIDTH */
2687static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2688{
2689 return rq_of(cfs_rq)->clock_task;
2690}
2691
2692static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2693 unsigned long delta_exec) {}
2694static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2695static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2696static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2697
2698static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2699{
2700 return 0;
2701}
2702
2703static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2704{
2705 return 0;
2706}
2707
2708static inline int throttled_lb_pair(struct task_group *tg,
2709 int src_cpu, int dest_cpu)
2710{
2711 return 0;
2712}
2713
2714void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2715
2716#ifdef CONFIG_FAIR_GROUP_SCHED
2717static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2718#endif
2719
2720static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2721{
2722 return NULL;
2723}
2724static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2725static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2726
2727#endif /* CONFIG_CFS_BANDWIDTH */
2728
2729/**************************************************
2730 * CFS operations on tasks:
2731 */
2732
2733#ifdef CONFIG_SCHED_HRTICK
2734static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2735{
2736 struct sched_entity *se = &p->se;
2737 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2738
2739 WARN_ON(task_rq(p) != rq);
2740
2741 if (cfs_rq->nr_running > 1) {
2742 u64 slice = sched_slice(cfs_rq, se);
2743 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2744 s64 delta = slice - ran;
2745
2746 if (delta < 0) {
2747 if (rq->curr == p)
2748 resched_task(p);
2749 return;
2750 }
2751
2752 /*
2753 * Don't schedule slices shorter than 10000ns, that just
2754 * doesn't make sense. Rely on vruntime for fairness.
2755 */
2756 if (rq->curr != p)
2757 delta = max_t(s64, 10000LL, delta);
2758
2759 hrtick_start(rq, delta);
2760 }
2761}
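
/*
 * Example of the arming logic above (made-up numbers): a task with a 6ms
 * slice that has run 2ms since it was last scheduled gets the hrtick
 * programmed 4ms out. If the currently running task has already overrun
 * its slice (delta < 0) it is rescheduled immediately, and for a task that
 * is not currently running the delay is clamped to at least 10us.
 */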
2762
2763/*
2764 * called from enqueue/dequeue and updates the hrtick when the
2765 * current task is from our class and nr_running is low enough
2766 * to matter.
2767 */
2768static void hrtick_update(struct rq *rq)
2769{
2770 struct task_struct *curr = rq->curr;
2771
2772 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2773 return;
2774
2775 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2776 hrtick_start_fair(rq, curr);
2777}
2778#else /* !CONFIG_SCHED_HRTICK */
2779static inline void
2780hrtick_start_fair(struct rq *rq, struct task_struct *p)
2781{
2782}
2783
2784static inline void hrtick_update(struct rq *rq)
2785{
2786}
2787#endif
2788
2789/*
2790 * The enqueue_task method is called before nr_running is
2791 * increased. Here we update the fair scheduling stats and
2792 * then put the task into the rbtree:
2793 */
2794static void
2795enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2796{
2797 struct cfs_rq *cfs_rq;
2798 struct sched_entity *se = &p->se;
2799
2800 for_each_sched_entity(se) {
2801 if (se->on_rq)
2802 break;
2803 cfs_rq = cfs_rq_of(se);
2804 enqueue_entity(cfs_rq, se, flags);
2805
2806 /*
2807 * end evaluation on encountering a throttled cfs_rq
2808 *
2809 * note: in the case of encountering a throttled cfs_rq we will
2810 * post the final h_nr_running increment below.
2811 */
2812 if (cfs_rq_throttled(cfs_rq))
2813 break;
2814 cfs_rq->h_nr_running++;
2815
2816 flags = ENQUEUE_WAKEUP;
2817 }
2818
2819 for_each_sched_entity(se) {
2820 cfs_rq = cfs_rq_of(se);
2821 cfs_rq->h_nr_running++;
2822
2823 if (cfs_rq_throttled(cfs_rq))
2824 break;
2825
2826 update_cfs_shares(cfs_rq);
2827 update_entity_load_avg(se, 1);
2828 }
2829
2830 if (!se) {
2831 update_rq_runnable_avg(rq, rq->nr_running);
2832 inc_nr_running(rq);
2833 }
2834 hrtick_update(rq);
2835}
2836
2837static void set_next_buddy(struct sched_entity *se);
2838
2839/*
2840 * The dequeue_task method is called before nr_running is
2841 * decreased. We remove the task from the rbtree and
2842 * update the fair scheduling stats:
2843 */
2844static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2845{
2846 struct cfs_rq *cfs_rq;
2847 struct sched_entity *se = &p->se;
2848 int task_sleep = flags & DEQUEUE_SLEEP;
2849
2850 for_each_sched_entity(se) {
2851 cfs_rq = cfs_rq_of(se);
2852 dequeue_entity(cfs_rq, se, flags);
2853
2854 /*
2855 * end evaluation on encountering a throttled cfs_rq
2856 *
2857 * note: in the case of encountering a throttled cfs_rq we will
2858 * post the final h_nr_running decrement below.
2859 */
2860 if (cfs_rq_throttled(cfs_rq))
2861 break;
2862 cfs_rq->h_nr_running--;
2863
2864 /* Don't dequeue parent if it has other entities besides us */
2865 if (cfs_rq->load.weight) {
2866 /*
2867 * Bias pick_next to pick a task from this cfs_rq, as
2868 * p is sleeping when it is within its sched_slice.
2869 */
2870 if (task_sleep && parent_entity(se))
2871 set_next_buddy(parent_entity(se));
2872
2873 /* avoid re-evaluating load for this entity */
2874 se = parent_entity(se);
2875 break;
2876 }
2877 flags |= DEQUEUE_SLEEP;
2878 }
2879
2880 for_each_sched_entity(se) {
2881 cfs_rq = cfs_rq_of(se);
2882 cfs_rq->h_nr_running--;
2883
2884 if (cfs_rq_throttled(cfs_rq))
2885 break;
2886
2887 update_cfs_shares(cfs_rq);
2888 update_entity_load_avg(se, 1);
2889 }
2890
2891 if (!se) {
2892 dec_nr_running(rq);
2893 update_rq_runnable_avg(rq, 1);
2894 }
2895 hrtick_update(rq);
2896}
2897
2898#ifdef CONFIG_SMP
2899/* Used instead of source_load when we know the type == 0 */
2900static unsigned long weighted_cpuload(const int cpu)
2901{
2902 return cpu_rq(cpu)->load.weight;
2903}
2904
2905/*
2906 * Return a low guess at the load of a migration-source cpu weighted
2907 * according to the scheduling class and "nice" value.
2908 *
2909 * We want to under-estimate the load of migration sources, to
2910 * balance conservatively.
2911 */
2912static unsigned long source_load(int cpu, int type)
2913{
2914 struct rq *rq = cpu_rq(cpu);
2915 unsigned long total = weighted_cpuload(cpu);
2916
2917 if (type == 0 || !sched_feat(LB_BIAS))
2918 return total;
2919
2920 return min(rq->cpu_load[type-1], total);
2921}
2922
2923/*
2924 * Return a high guess at the load of a migration-target cpu weighted
2925 * according to the scheduling class and "nice" value.
2926 */
2927static unsigned long target_load(int cpu, int type)
2928{
2929 struct rq *rq = cpu_rq(cpu);
2930 unsigned long total = weighted_cpuload(cpu);
2931
2932 if (type == 0 || !sched_feat(LB_BIAS))
2933 return total;
2934
2935 return max(rq->cpu_load[type-1], total);
2936}
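
/*
 * Example of the LB_BIAS effect above (hypothetical numbers): a cpu whose
 * decayed cpu_load[type-1] is 2048 but whose instantaneous weighted load
 * is 1024 reports 1024 as a migration source (min) and 2048 as a migration
 * target (max), so the balancer under-estimates what it can take from a
 * cpu and over-estimates what it would add, erring against moving load.
 */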
2937
2938static unsigned long power_of(int cpu)
2939{
2940 return cpu_rq(cpu)->cpu_power;
2941}
2942
2943static unsigned long cpu_avg_load_per_task(int cpu)
2944{
2945 struct rq *rq = cpu_rq(cpu);
2946 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2947
2948 if (nr_running)
2949 return rq->load.weight / nr_running;
2950
2951 return 0;
2952}
2953
2954
2955static void task_waking_fair(struct task_struct *p)
2956{
2957 struct sched_entity *se = &p->se;
2958 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2959 u64 min_vruntime;
2960
2961#ifndef CONFIG_64BIT
2962 u64 min_vruntime_copy;
2963
2964 do {
2965 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2966 smp_rmb();
2967 min_vruntime = cfs_rq->min_vruntime;
2968 } while (min_vruntime != min_vruntime_copy);
2969#else
2970 min_vruntime = cfs_rq->min_vruntime;
2971#endif
2972
2973 se->vruntime -= min_vruntime;
2974}
2975
2976#ifdef CONFIG_FAIR_GROUP_SCHED
2977/*
2978 * effective_load() calculates the load change as seen from the root_task_group
2979 *
2980 * Adding load to a group doesn't make a group heavier, but can cause movement
2981 * of group shares between cpus. Assuming the shares were perfectly aligned one
2982 * can calculate the shift in shares.
2983 *
2984 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2985 * on this @cpu and results in a total addition (subtraction) of @wg to the
2986 * total group weight.
2987 *
2988 * Given a runqueue weight distribution (rw_i) we can compute a shares
2989 * distribution (s_i) using:
2990 *
2991 * s_i = rw_i / \Sum rw_j (1)
2992 *
2993 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2994 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2995 * shares distribution (s_i):
2996 *
2997 * rw_i = { 2, 4, 1, 0 }
2998 * s_i = { 2/7, 4/7, 1/7, 0 }
2999 *
3000 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3001 * task used to run on and the CPU the waker is running on), so we need to
3002 * compute the effect of waking a task on either CPU and, in case of a sync
3003 * wakeup, compute the effect of the current task going to sleep.
3004 *
3005 * So for a change of @wl to the local @cpu with an overall group weight change
3006 * of @wl we can compute the new shares distribution (s'_i) using:
3007 *
3008 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3009 *
3010 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3011 * differences in waking a task to CPU 0. The additional task changes the
3012 * weight and shares distributions like:
3013 *
3014 * rw'_i = { 3, 4, 1, 0 }
3015 * s'_i = { 3/8, 4/8, 1/8, 0 }
3016 *
3017 * We can then compute the difference in effective weight by using:
3018 *
3019 * dw_i = S * (s'_i - s_i) (3)
3020 *
3021 * Where 'S' is the group weight as seen by its parent.
3022 *
3023 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3024 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3025 * 4/7) times the weight of the group.
3026 */
3027static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3028{
3029 struct sched_entity *se = tg->se[cpu];
3030
3031 if (!tg->parent) /* the trivial, non-cgroup case */
3032 return wl;
3033
3034 for_each_sched_entity(se) {
3035 long w, W;
3036
3037 tg = se->my_q->tg;
3038
3039 /*
3040 * W = @wg + \Sum rw_j
3041 */
3042 W = wg + calc_tg_weight(tg, se->my_q);
3043
3044 /*
3045 * w = rw_i + @wl
3046 */
3047 w = se->my_q->load.weight + wl;
3048
3049 /*
3050 * wl = S * s'_i; see (2)
3051 */
3052 if (W > 0 && w < W)
3053 wl = (w * tg->shares) / W;
3054 else
3055 wl = tg->shares;
3056
3057 /*
3058 * Per the above, wl is the new se->load.weight value; since
3059 * those are clipped to [MIN_SHARES, ...) do so now. See
3060 * calc_cfs_shares().
3061 */
3062 if (wl < MIN_SHARES)
3063 wl = MIN_SHARES;
3064
3065 /*
3066 * wl = dw_i = S * (s'_i - s_i); see (3)
3067 */
3068 wl -= se->load.weight;
3069
3070 /*
3071 * Recursively apply this logic to all parent groups to compute
3072 * the final effective load change on the root group. Since
3073 * only the @tg group gets extra weight, all parent groups can
3074 * only redistribute existing shares. @wl is the shift in shares
3075 * resulting from this level per the above.
3076 */
3077 wg = 0;
3078 }
3079
3080 return wl;
3081}
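
/*
 * Putting the doc example above into absolute units (illustrative): if the
 * parent sees the group with weight S = 1024, the 5/56 shift on CPU 0 is
 * roughly +91 weight and the -4/56 shift on CPU 1 roughly -73 weight;
 * these are the deltas wake_affine() below feeds into its comparison.
 */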
3082#else
3083
3084static inline unsigned long effective_load(struct task_group *tg, int cpu,
3085 unsigned long wl, unsigned long wg)
3086{
3087 return wl;
3088}
3089
3090#endif
3091
3092static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3093{
3094 s64 this_load, load;
3095 int idx, this_cpu, prev_cpu;
3096 unsigned long tl_per_task;
3097 struct task_group *tg;
3098 unsigned long weight;
3099 int balanced;
3100
3101 idx = sd->wake_idx;
3102 this_cpu = smp_processor_id();
3103 prev_cpu = task_cpu(p);
3104 load = source_load(prev_cpu, idx);
3105 this_load = target_load(this_cpu, idx);
3106
3107 /*
3108 * If sync wakeup then subtract the (maximum possible)
3109 * effect of the currently running task from the load
3110 * of the current CPU:
3111 */
3112 if (sync) {
3113 tg = task_group(current);
3114 weight = current->se.load.weight;
3115
3116 this_load += effective_load(tg, this_cpu, -weight, -weight);
3117 load += effective_load(tg, prev_cpu, 0, -weight);
3118 }
3119
3120 tg = task_group(p);
3121 weight = p->se.load.weight;
3122
3123 /*
3124 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3125 * due to the sync cause above having dropped this_load to 0, we'll
3126 * always have an imbalance, but there's really nothing you can do
3127 * about that, so that's good too.
3128 *
3129 * Otherwise check if either cpus are near enough in load to allow this
3130 * task to be woken on this_cpu.
3131 */
3132 if (this_load > 0) {
3133 s64 this_eff_load, prev_eff_load;
3134
3135 this_eff_load = 100;
3136 this_eff_load *= power_of(prev_cpu);
3137 this_eff_load *= this_load +
3138 effective_load(tg, this_cpu, weight, weight);
3139
3140 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3141 prev_eff_load *= power_of(this_cpu);
3142 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3143
3144 balanced = this_eff_load <= prev_eff_load;
3145 } else
3146 balanced = true;
3147
3148 /*
3149 * If the currently running task will sleep within
3150 * a reasonable amount of time then attract this newly
3151 * woken task:
3152 */
3153 if (sync && balanced)
3154 return 1;
3155
3156 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3157 tl_per_task = cpu_avg_load_per_task(this_cpu);
3158
3159 if (balanced ||
3160 (this_load <= load &&
3161 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3162 /*
3163 * This domain has SD_WAKE_AFFINE and
3164 * p is cache cold in this domain, and
3165 * there is no bad imbalance.
3166 */
3167 schedstat_inc(sd, ttwu_move_affine);
3168 schedstat_inc(p, se.statistics.nr_wakeups_affine);
3169
3170 return 1;
3171 }
3172 return 0;
3173}
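
/*
 * Rough numeric sketch of the balance test above (made-up values): with
 * imbalance_pct == 125 the previous cpu's side is weighted by
 * 100 + (125 - 100) / 2 == 112 against 100 for this cpu, so with equal
 * cpu power the affine wakeup is accepted only while this cpu's projected
 * load is no more than about 12% above what the previous cpu would carry.
 */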
3174
3175/*
3176 * find_idlest_group finds and returns the least busy CPU group within the
3177 * domain.
3178 */
3179static struct sched_group *
3180find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3181 int this_cpu, int load_idx)
3182{
3183 struct sched_group *idlest = NULL, *group = sd->groups;
3184 unsigned long min_load = ULONG_MAX, this_load = 0;
3185 int imbalance = 100 + (sd->imbalance_pct-100)/2;
3186
3187 do {
3188 unsigned long load, avg_load;
3189 int local_group;
3190 int i;
3191
3192 /* Skip over this group if it has no CPUs allowed */
3193 if (!cpumask_intersects(sched_group_cpus(group),
3194 tsk_cpus_allowed(p)))
3195 continue;
3196
3197 local_group = cpumask_test_cpu(this_cpu,
3198 sched_group_cpus(group));
3199
3200 /* Tally up the load of all CPUs in the group */
3201 avg_load = 0;
3202
3203 for_each_cpu(i, sched_group_cpus(group)) {
3204 /* Bias balancing toward cpus of our domain */
3205 if (local_group)
3206 load = source_load(i, load_idx);
3207 else
3208 load = target_load(i, load_idx);
3209
3210 avg_load += load;
3211 }
3212
3213 /* Adjust by relative CPU power of the group */
3214 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3215
3216 if (local_group) {
3217 this_load = avg_load;
3218 } else if (avg_load < min_load) {
3219 min_load = avg_load;
3220 idlest = group;
3221 }
3222 } while (group = group->next, group != sd->groups);
3223
3224 if (!idlest || 100*this_load < imbalance*min_load)
3225 return NULL;
3226 return idlest;
3227}
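
/*
 * Example of the final cut-off above (assuming sd->imbalance_pct == 125):
 * imbalance == 112, so a remote group is returned only when
 * 100 * this_load >= 112 * min_load, i.e. when it is lighter than the
 * local group by a little more than 10%; otherwise we stay local.
 */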
3228
3229/*
3230 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3231 */
3232static int
3233find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3234{
3235 unsigned long load, min_load = ULONG_MAX;
3236 int idlest = -1;
3237 int i;
3238
3239 /* Traverse only the allowed CPUs */
3240 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3241 load = weighted_cpuload(i);
3242
3243 if (load < min_load || (load == min_load && i == this_cpu)) {
3244 min_load = load;
3245 idlest = i;
3246 }
3247 }
3248
3249 return idlest;
3250}
3251
3252/*
3253 * Try and locate an idle CPU in the sched_domain.
3254 */
3255static int select_idle_sibling(struct task_struct *p, int target)
3256{
3257 int cpu = smp_processor_id();
3258 int prev_cpu = task_cpu(p);
3259 struct sched_domain *sd;
3260 struct sched_group *sg;
3261 int i;
3262
3263 /*
3264 * If the task is going to be woken-up on this cpu and if it is
3265 * already idle, then it is the right target.
3266 */
3267 if (target == cpu && idle_cpu(cpu))
3268 return cpu;
3269
3270 /*
3271 * If the task is going to be woken-up on the cpu where it previously
2272 * ran and if it is currently idle, then it is the right target.
3273 */
3274 if (target == prev_cpu && idle_cpu(prev_cpu))
3275 return prev_cpu;
3276
3277 /*
2278 * Otherwise, iterate the domains and find an eligible idle cpu.
3279 */
3280 sd = rcu_dereference(per_cpu(sd_llc, target));
3281 for_each_lower_domain(sd) {
3282 sg = sd->groups;
3283 do {
3284 if (!cpumask_intersects(sched_group_cpus(sg),
3285 tsk_cpus_allowed(p)))
3286 goto next;
3287
3288 for_each_cpu(i, sched_group_cpus(sg)) {
3289 if (!idle_cpu(i))
3290 goto next;
3291 }
3292
3293 target = cpumask_first_and(sched_group_cpus(sg),
3294 tsk_cpus_allowed(p));
3295 goto done;
3296next:
3297 sg = sg->next;
3298 } while (sg != sd->groups);
3299 }
3300done:
3301 return target;
3302}
3303
3304/*
3305 * select_task_rq_fair: balance the current task (running on cpu) in domains
3306 * that have the relevant sd_flag set. In practice, this is SD_BALANCE_WAKE,
3307 * SD_BALANCE_FORK and SD_BALANCE_EXEC.
3308 *
3309 * Balance, ie. select the least loaded group.
3310 *
3311 * Returns the target CPU number, or the same CPU if no balancing is needed.
3312 *
3313 * preempt must be disabled.
3314 */
3315static int
3316select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3317{
3318 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3319 int cpu = smp_processor_id();
3320 int prev_cpu = task_cpu(p);
3321 int new_cpu = cpu;
3322 int want_affine = 0;
3323 int sync = wake_flags & WF_SYNC;
3324
3325 if (p->nr_cpus_allowed == 1)
3326 return prev_cpu;
3327
3328 if (sd_flag & SD_BALANCE_WAKE) {
3329 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3330 want_affine = 1;
3331 new_cpu = prev_cpu;
3332 }
3333
3334 rcu_read_lock();
3335 for_each_domain(cpu, tmp) {
3336 if (!(tmp->flags & SD_LOAD_BALANCE))
3337 continue;
3338
3339 /*
3340 * If both cpu and prev_cpu are part of this domain,
3341 * cpu is a valid SD_WAKE_AFFINE target.
3342 */
3343 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3344 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3345 affine_sd = tmp;
3346 break;
3347 }
3348
3349 if (tmp->flags & sd_flag)
3350 sd = tmp;
3351 }
3352
3353 if (affine_sd) {
3354 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3355 prev_cpu = cpu;
3356
3357 new_cpu = select_idle_sibling(p, prev_cpu);
3358 goto unlock;
3359 }
3360
3361 while (sd) {
3362 int load_idx = sd->forkexec_idx;
3363 struct sched_group *group;
3364 int weight;
3365
3366 if (!(sd->flags & sd_flag)) {
3367 sd = sd->child;
3368 continue;
3369 }
3370
3371 if (sd_flag & SD_BALANCE_WAKE)
3372 load_idx = sd->wake_idx;
3373
3374 group = find_idlest_group(sd, p, cpu, load_idx);
3375 if (!group) {
3376 sd = sd->child;
3377 continue;
3378 }
3379
3380 new_cpu = find_idlest_cpu(group, p, cpu);
3381 if (new_cpu == -1 || new_cpu == cpu) {
3382 /* Now try balancing at a lower domain level of cpu */
3383 sd = sd->child;
3384 continue;
3385 }
3386
3387 /* Now try balancing at a lower domain level of new_cpu */
3388 cpu = new_cpu;
3389 weight = sd->span_weight;
3390 sd = NULL;
3391 for_each_domain(cpu, tmp) {
3392 if (weight <= tmp->span_weight)
3393 break;
3394 if (tmp->flags & sd_flag)
3395 sd = tmp;
3396 }
3397 /* while loop will break here if sd == NULL */
3398 }
3399unlock:
3400 rcu_read_unlock();
3401
3402 return new_cpu;
3403}
3404
3405/*
3406 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may
3407 * be removed once load-tracking is useful for applications beyond shares
3408 * distribution (e.g. load-balance).
3409 */
3410#ifdef CONFIG_FAIR_GROUP_SCHED
3411/*
3412 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3413 * cfs_rq_of(p) references at time of call are still valid and identify the
3414 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3415 * other assumptions, including the state of rq->lock, should be made.
3416 */
3417static void
3418migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3419{
3420 struct sched_entity *se = &p->se;
3421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3422
3423 /*
3424 * Load tracking: accumulate removed load so that it can be processed
3425 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3426 * to blocked load iff they have a positive decay-count. It can never
3427 * be negative here since on-rq tasks have decay-count == 0.
3428 */
3429 if (se->avg.decay_count) {
3430 se->avg.decay_count = -__synchronize_entity_decay(se);
3431 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3432 }
3433}
3434#endif
3435#endif /* CONFIG_SMP */
3436
3437static unsigned long
3438wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3439{
3440 unsigned long gran = sysctl_sched_wakeup_granularity;
3441
3442 /*
3443 * Since it is curr that is running now, convert the gran from real-time
3444 * to virtual-time in its units.
3445 *
3446 * By using 'se' instead of 'curr' we penalize light tasks, so
3447 * they get preempted easier. That is, if 'se' < 'curr' then
3448 * the resulting gran will be larger, therefore penalizing the
3449 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3450 * be smaller, again penalizing the lighter task.
3451 *
3452 * This is especially important for buddies when the leftmost
3453 * task is higher priority than the buddy.
3454 */
3455 return calc_delta_fair(gran, se);
3456}
3457
3458/*
3459 * Should 'se' preempt 'curr'.
3460 *
3461 * |s1
3462 * |s2
3463 * |s3
3464 * g
3465 * |<--->|c
3466 *
3467 * w(c, s1) = -1
3468 * w(c, s2) = 0
3469 * w(c, s3) = 1
3470 *
3471 */
3472static int
3473wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3474{
3475 s64 gran, vdiff = curr->vruntime - se->vruntime;
3476
3477 if (vdiff <= 0)
3478 return -1;
3479
3480 gran = wakeup_gran(curr, se);
3481 if (vdiff > gran)
3482 return 1;
3483
3484 return 0;
3485}
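
/*
 * Reading the diagram above with numbers (illustrative): if curr's
 * vruntime leads se's by 3ms of virtual time and the scaled granularity
 * works out to 1ms, vdiff (3ms) > gran (1ms) and we return 1 (preempt
 * curr); a 0.5ms lead would return 0 (leave curr alone), and a zero or
 * negative lead returns -1.
 */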
3486
3487static void set_last_buddy(struct sched_entity *se)
3488{
3489 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3490 return;
3491
3492 for_each_sched_entity(se)
3493 cfs_rq_of(se)->last = se;
3494}
3495
3496static void set_next_buddy(struct sched_entity *se)
3497{
3498 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3499 return;
3500
3501 for_each_sched_entity(se)
3502 cfs_rq_of(se)->next = se;
3503}
3504
3505static void set_skip_buddy(struct sched_entity *se)
3506{
3507 for_each_sched_entity(se)
3508 cfs_rq_of(se)->skip = se;
3509}
3510
3511/*
3512 * Preempt the current task with a newly woken task if needed:
3513 */
3514static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3515{
3516 struct task_struct *curr = rq->curr;
3517 struct sched_entity *se = &curr->se, *pse = &p->se;
3518 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3519 int scale = cfs_rq->nr_running >= sched_nr_latency;
3520 int next_buddy_marked = 0;
3521
3522 if (unlikely(se == pse))
3523 return;
3524
3525 /*
3526 * This is possible from callers such as move_task(), in which we
3527 * unconditionally call check_preempt_curr() after an enqueue (which may have
3528 * led to a throttle). This both saves work and prevents false
3529 * next-buddy nomination below.
3530 */
3531 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3532 return;
3533
3534 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3535 set_next_buddy(pse);
3536 next_buddy_marked = 1;
3537 }
3538
3539 /*
3540 * We can come here with TIF_NEED_RESCHED already set from the new task
3541 * wakeup path.
3542 *
3543 * Note: this also catches the edge-case of curr being in a throttled
3544 * group (e.g. via set_curr_task), since update_curr() (in the
3545 * enqueue of curr) will have resulted in resched being set. This
3546 * prevents us from potentially nominating it as a false LAST_BUDDY
3547 * below.
3548 */
3549 if (test_tsk_need_resched(curr))
3550 return;
3551
3552 /* Idle tasks are by definition preempted by non-idle tasks. */
3553 if (unlikely(curr->policy == SCHED_IDLE) &&
3554 likely(p->policy != SCHED_IDLE))
3555 goto preempt;
3556
3557 /*
3558 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3559 * is driven by the tick):
3560 */
3561 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3562 return;
3563
3564 find_matching_se(&se, &pse);
3565 update_curr(cfs_rq_of(se));
3566 BUG_ON(!pse);
3567 if (wakeup_preempt_entity(se, pse) == 1) {
3568 /*
3569 * Bias pick_next to pick the sched entity that is
3570 * triggering this preemption.
3571 */
3572 if (!next_buddy_marked)
3573 set_next_buddy(pse);
3574 goto preempt;
3575 }
3576
3577 return;
3578
3579preempt:
3580 resched_task(curr);
3581 /*
3582 * Only set the backward buddy when the current task is still
3583 * on the rq. This can happen when a wakeup gets interleaved
3584 * with schedule on the ->pre_schedule() or idle_balance()
3585 * point, either of which can drop the rq lock.
3586 *
3587 * Also, during early boot the idle thread is in the fair class,
3588 * for obvious reasons it's a bad idea to schedule back to it.
3589 */
3590 if (unlikely(!se->on_rq || curr == rq->idle))
3591 return;
3592
3593 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3594 set_last_buddy(se);
3595}
3596
3597static struct task_struct *pick_next_task_fair(struct rq *rq)
3598{
3599 struct task_struct *p;
3600 struct cfs_rq *cfs_rq = &rq->cfs;
3601 struct sched_entity *se;
3602
3603 if (!cfs_rq->nr_running)
3604 return NULL;
3605
3606 do {
3607 se = pick_next_entity(cfs_rq);
3608 set_next_entity(cfs_rq, se);
3609 cfs_rq = group_cfs_rq(se);
3610 } while (cfs_rq);
3611
3612 p = task_of(se);
3613 if (hrtick_enabled(rq))
3614 hrtick_start_fair(rq, p);
3615
3616 return p;
3617}
3618
3619/*
3620 * Account for a descheduled task:
3621 */
3622static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3623{
3624 struct sched_entity *se = &prev->se;
3625 struct cfs_rq *cfs_rq;
3626
3627 for_each_sched_entity(se) {
3628 cfs_rq = cfs_rq_of(se);
3629 put_prev_entity(cfs_rq, se);
3630 }
3631}
3632
3633/*
3634 * sched_yield() is very simple
3635 *
3636 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3637 */
3638static void yield_task_fair(struct rq *rq)
3639{
3640 struct task_struct *curr = rq->curr;
3641 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3642 struct sched_entity *se = &curr->se;
3643
3644 /*
3645 * Are we the only task in the tree?
3646 */
3647 if (unlikely(rq->nr_running == 1))
3648 return;
3649
3650 clear_buddies(cfs_rq, se);
3651
3652 if (curr->policy != SCHED_BATCH) {
3653 update_rq_clock(rq);
3654 /*
3655 * Update run-time statistics of the 'current'.
3656 */
3657 update_curr(cfs_rq);
3658 /*
3659 * Tell update_rq_clock() that we've just updated,
3660 * so we don't do microscopic update in schedule()
3661 * and double the fastpath cost.
3662 */
3663 rq->skip_clock_update = 1;
3664 }
3665
3666 set_skip_buddy(se);
3667}
3668
3669static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3670{
3671 struct sched_entity *se = &p->se;
3672
3673 /* throttled hierarchies are not runnable */
3674 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3675 return false;
3676
3677 /* Tell the scheduler that we'd really like pse to run next. */
3678 set_next_buddy(se);
3679
3680 yield_task_fair(rq);
3681
3682 return true;
3683}
3684
3685#ifdef CONFIG_SMP
3686/**************************************************
3687 * Fair scheduling class load-balancing methods.
3688 *
3689 * BASICS
3690 *
3691 * The purpose of load-balancing is to achieve the same basic fairness the
3692 * per-cpu scheduler provides, namely provide a proportional amount of compute
3693 * time to each task. This is expressed in the following equation:
3694 *
3695 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3696 *
3697 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3698 * W_i,0 is defined as:
3699 *
3700 * W_i,0 = \Sum_j w_i,j (2)
3701 *
3702 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3703 * is derived from the nice value as per prio_to_weight[].
3704 *
3705 * The weight average is an exponential decay average of the instantaneous
3706 * weight:
3707 *
3708 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3709 *
3710 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3711 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3712 * can also include other factors [XXX].
3713 *
3714 * To achieve this balance we define a measure of imbalance which follows
3715 * directly from (1):
3716 *
3717 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3718 *
3719 * We then move tasks around to minimize the imbalance. In the continuous
3720 * function space it is obvious this converges, in the discrete case we get
3721 * a few fun cases generally called infeasible weight scenarios.
3722 *
3723 * [XXX expand on:
3724 * - infeasible weights;
3725 * - local vs global optima in the discrete case. ]
3726 *
3727 *
3728 * SCHED DOMAINS
3729 *
3730 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3731 * for all i,j solution, we create a tree of cpus that follows the hardware
3732 * topology where each level pairs two lower groups (or better). This results
3733 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3734 * tree to only the first of the previous level and we decrease the frequency
3735 * of load-balance at each level inv. proportional to the number of cpus in
3736 * the groups.
3737 *
3738 * This yields:
3739 *
3740 * log_2 n 1 n
3741 * \Sum { --- * --- * 2^i } = O(n) (5)
3742 * i = 0 2^i 2^i
3743 * `- size of each group
3744 * | | `- number of cpus doing load-balance
3745 * | `- freq
3746 * `- sum over all levels
3747 *
3748 * Coupled with a limit on how many tasks we can migrate every balance pass,
3749 * this makes (5) the runtime complexity of the balancer.
3750 *
3751 * An important property here is that each CPU is still (indirectly) connected
3752 * to every other cpu in at most O(log n) steps:
3753 *
3754 * The adjacency matrix of the resulting graph is given by:
3755 *
3756 * log_2 n
3757 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3758 * k = 0
3759 *
3760 * And you'll find that:
3761 *
3762 * A^(log_2 n)_i,j != 0 for all i,j (7)
3763 *
3764 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3765 * The task movement gives a factor of O(m), giving a convergence complexity
3766 * of:
3767 *
3768 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3769 *
3770 *
3771 * WORK CONSERVING
3772 *
3773 * In order to avoid CPUs going idle while there's still work to do, new idle
3774 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3775 * tree itself instead of relying on other CPUs to bring it work.
3776 *
3777 * This adds some complexity to both (5) and (8) but it reduces the total idle
3778 * time.
3779 *
3780 * [XXX more?]
3781 *
3782 *
3783 * CGROUPS
3784 *
3785 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3786 *
3787 * s_k,i
3788 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3789 * S_k
3790 *
3791 * Where
3792 *
3793 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3794 *
3795 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3796 *
3797 * The big problem is S_k, its a global sum needed to compute a local (W_i)
3798 * property.
3799 *
3800 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3801 * rewrite all of this once again.]
3802 */
3803
3804static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3805
3806#define LBF_ALL_PINNED 0x01
3807#define LBF_NEED_BREAK 0x02
3808#define LBF_SOME_PINNED 0x04
3809
3810struct lb_env {
3811 struct sched_domain *sd;
3812
3813 struct rq *src_rq;
3814 int src_cpu;
3815
3816 int dst_cpu;
3817 struct rq *dst_rq;
3818
3819 struct cpumask *dst_grpmask;
3820 int new_dst_cpu;
3821 enum cpu_idle_type idle;
3822 long imbalance;
3823 /* The set of CPUs under consideration for load-balancing */
3824 struct cpumask *cpus;
3825
3826 unsigned int flags;
3827
3828 unsigned int loop;
3829 unsigned int loop_break;
3830 unsigned int loop_max;
3831};
3832
3833/*
3834 * move_task - move a task from one runqueue to another runqueue.
3835 * Both runqueues must be locked.
3836 */
3837static void move_task(struct task_struct *p, struct lb_env *env)
3838{
3839 deactivate_task(env->src_rq, p, 0);
3840 set_task_cpu(p, env->dst_cpu);
3841 activate_task(env->dst_rq, p, 0);
3842 check_preempt_curr(env->dst_rq, p, 0);
3843}
3844
3845/*
3846 * Is this task likely cache-hot:
3847 */
3848static int
3849task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3850{
3851 s64 delta;
3852
3853 if (p->sched_class != &fair_sched_class)
3854 return 0;
3855
3856 if (unlikely(p->policy == SCHED_IDLE))
3857 return 0;
3858
3859 /*
3860 * Buddy candidates are cache hot:
3861 */
3862 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3863 (&p->se == cfs_rq_of(&p->se)->next ||
3864 &p->se == cfs_rq_of(&p->se)->last))
3865 return 1;
3866
3867 if (sysctl_sched_migration_cost == -1)
3868 return 1;
3869 if (sysctl_sched_migration_cost == 0)
3870 return 0;
3871
3872 delta = now - p->se.exec_start;
3873
3874 return delta < (s64)sysctl_sched_migration_cost;
3875}
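
/*
 * Example (assuming the usual sysctl_sched_migration_cost of 0.5ms): a
 * task whose exec_start lies within the last 0.5ms of 'now' is treated as
 * cache-hot and, unless the domain has already failed several balance
 * attempts, is left where it is; setting the sysctl to -1 marks every
 * task hot, setting it to 0 marks none.
 */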
3876
3877/*
3878 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3879 */
3880static
3881int can_migrate_task(struct task_struct *p, struct lb_env *env)
3882{
3883 int tsk_cache_hot = 0;
3884 /*
3885 * We do not migrate tasks that are:
3886 * 1) running (obviously), or
3887 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3888 * 3) are cache-hot on their current CPU.
3889 */
3890 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3891 int new_dst_cpu;
3892
3893 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3894
3895 /*
3896 * Remember if this task can be migrated to any other cpu in
3897 * our sched_group. We may want to revisit it if we couldn't
3898 * meet load balance goals by pulling other tasks on src_cpu.
3899 *
3900 * Also avoid computing new_dst_cpu if we have already computed
3901 * one in current iteration.
3902 */
3903 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3904 return 0;
3905
3906 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3907 tsk_cpus_allowed(p));
3908 if (new_dst_cpu < nr_cpu_ids) {
3909 env->flags |= LBF_SOME_PINNED;
3910 env->new_dst_cpu = new_dst_cpu;
3911 }
3912 return 0;
3913 }
3914
3915 /* Record that we found at least one task that could run on dst_cpu */
3916 env->flags &= ~LBF_ALL_PINNED;
3917
3918 if (task_running(env->src_rq, p)) {
3919 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3920 return 0;
3921 }
3922
3923 /*
3924 * Aggressive migration if:
3925 * 1) task is cache cold, or
3926 * 2) too many balance attempts have failed.
3927 */
3928
3929 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3930 if (!tsk_cache_hot ||
3931 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3932#ifdef CONFIG_SCHEDSTATS
3933 if (tsk_cache_hot) {
3934 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3935 schedstat_inc(p, se.statistics.nr_forced_migrations);
3936 }
3937#endif
3938 return 1;
3939 }
3940
3941 if (tsk_cache_hot) {
3942 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3943 return 0;
3944 }
3945 return 1;
3946}
3947
3948/*
3949 * move_one_task tries to move exactly one task from busiest to this_rq, as
3950 * part of active balancing operations within "domain".
3951 * Returns 1 if successful and 0 otherwise.
3952 *
3953 * Called with both runqueues locked.
3954 */
3955static int move_one_task(struct lb_env *env)
3956{
3957 struct task_struct *p, *n;
3958
3959 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3960 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3961 continue;
3962
3963 if (!can_migrate_task(p, env))
3964 continue;
3965
3966 move_task(p, env);
3967 /*
3968 * Right now, this is only the second place move_task()
3969 * is called, so we can safely collect move_task()
3970 * stats here rather than inside move_task().
3971 */
3972 schedstat_inc(env->sd, lb_gained[env->idle]);
3973 return 1;
3974 }
3975 return 0;
3976}
3977
3978static unsigned long task_h_load(struct task_struct *p);
3979
3980static const unsigned int sched_nr_migrate_break = 32;
3981
3982/*
3983 * move_tasks tries to move up to imbalance weighted load from busiest to
3984 * this_rq, as part of a balancing operation within domain "sd".
3985 * Returns 1 if successful and 0 otherwise.
3986 *
3987 * Called with both runqueues locked.
3988 */
3989static int move_tasks(struct lb_env *env)
3990{
3991 struct list_head *tasks = &env->src_rq->cfs_tasks;
3992 struct task_struct *p;
3993 unsigned long load;
3994 int pulled = 0;
3995
3996 if (env->imbalance <= 0)
3997 return 0;
3998
3999 while (!list_empty(tasks)) {
4000 p = list_first_entry(tasks, struct task_struct, se.group_node);
4001
4002 env->loop++;
4003 /* We've more or less seen every task there is, call it quits */
4004 if (env->loop > env->loop_max)
4005 break;
4006
4007 /* take a breather every nr_migrate tasks */
4008 if (env->loop > env->loop_break) {
4009 env->loop_break += sched_nr_migrate_break;
4010 env->flags |= LBF_NEED_BREAK;
4011 break;
4012 }
4013
4014 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4015 goto next;
4016
4017 load = task_h_load(p);
4018
4019 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4020 goto next;
4021
4022 if ((load / 2) > env->imbalance)
4023 goto next;
4024
4025 if (!can_migrate_task(p, env))
4026 goto next;
4027
4028 move_task(p, env);
4029 pulled++;
4030 env->imbalance -= load;
4031
4032#ifdef CONFIG_PREEMPT
4033 /*
4034 * NEWIDLE balancing is a source of latency, so preemptible
4035 * kernels will stop after the first task is pulled to minimize
4036 * the critical section.
4037 */
4038 if (env->idle == CPU_NEWLY_IDLE)
4039 break;
4040#endif
4041
4042 /*
4043 * We only want to steal up to the prescribed amount of
4044 * weighted load.
4045 */
4046 if (env->imbalance <= 0)
4047 break;
4048
4049 continue;
4050next:
4051 list_move_tail(&p->se.group_node, tasks);
4052 }
4053
4054 /*
4055 * Right now, this is one of only two places move_task() is called,
4056 * so we can safely collect move_task() stats here rather than
4057 * inside move_task().
4058 */
4059 schedstat_add(env->sd, lb_gained[env->idle], pulled);
4060
4061 return pulled;
4062}
4063
4064#ifdef CONFIG_FAIR_GROUP_SCHED
4065/*
4066 * update tg->load_weight by folding this cpu's load_avg
4067 */
4068static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4069{
4070 struct sched_entity *se = tg->se[cpu];
4071 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4072
4073 /* throttled entities do not contribute to load */
4074 if (throttled_hierarchy(cfs_rq))
4075 return;
4076
4077 update_cfs_rq_blocked_load(cfs_rq, 1);
4078
4079 if (se) {
4080 update_entity_load_avg(se, 1);
4081 /*
4082 * We pivot on our runnable average having decayed to zero for
4083 * list removal. This generally implies that all our children
4084 * have also been removed (modulo rounding error or bandwidth
4085 * control); however, such cases are rare and we can fix these
4086 * at enqueue.
4087 *
4088 * TODO: fix up out-of-order children on enqueue.
4089 */
4090 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4091 list_del_leaf_cfs_rq(cfs_rq);
4092 } else {
4093 struct rq *rq = rq_of(cfs_rq);
4094 update_rq_runnable_avg(rq, rq->nr_running);
4095 }
4096}
4097
4098static void update_blocked_averages(int cpu)
4099{
4100 struct rq *rq = cpu_rq(cpu);
4101 struct cfs_rq *cfs_rq;
4102 unsigned long flags;
4103
4104 raw_spin_lock_irqsave(&rq->lock, flags);
4105 update_rq_clock(rq);
4106 /*
4107 * Iterates the task_group tree in a bottom up fashion, see
4108 * list_add_leaf_cfs_rq() for details.
4109 */
4110 for_each_leaf_cfs_rq(rq, cfs_rq) {
4111 /*
4112 * Note: We may want to consider periodically releasing
4113 * rq->lock around these updates so that creating many task
4114 * groups does not result in continually extending hold time.
4115 */
4116 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4117 }
4118
4119 raw_spin_unlock_irqrestore(&rq->lock, flags);
4120}
4121
4122/*
4123 * Compute the cpu's hierarchical load factor for each task group.
4124 * This needs to be done in a top-down fashion because the load of a child
4125 * group is a fraction of its parent's load.
4126 */
4127static int tg_load_down(struct task_group *tg, void *data)
4128{
4129 unsigned long load;
4130 long cpu = (long)data;
4131
4132 if (!tg->parent) {
4133 load = cpu_rq(cpu)->load.weight;
4134 } else {
4135 load = tg->parent->cfs_rq[cpu]->h_load;
4136 load *= tg->se[cpu]->load.weight;
4137 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4138 }
4139
4140 tg->cfs_rq[cpu]->h_load = load;
4141
4142 return 0;
4143}
4144
4145static void update_h_load(long cpu)
4146{
4147 struct rq *rq = cpu_rq(cpu);
4148 unsigned long now = jiffies;
4149
4150 if (rq->h_load_throttle == now)
4151 return;
4152
4153 rq->h_load_throttle = now;
4154
4155 rcu_read_lock();
4156 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4157 rcu_read_unlock();
4158}
4159
4160static unsigned long task_h_load(struct task_struct *p)
4161{
4162 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4163 unsigned long load;
4164
4165 load = p->se.load.weight;
4166 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4167
4168 return load;
4169}
4170#else
4171static inline void update_blocked_averages(int cpu)
4172{
4173}
4174
4175static inline void update_h_load(long cpu)
4176{
4177}
4178
4179static unsigned long task_h_load(struct task_struct *p)
4180{
4181 return p->se.load.weight;
4182}
4183#endif
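/*
 * A worked example of the hierarchical load computed above, as a
 * user-space sketch (figures and names are illustrative, not kernel
 * API). A root runqueue carries two plain tasks and one group entity,
 * each of weight 1024; the group's own cfs_rq holds two tasks of
 * weight 1024.
 */
#include <stdio.h>

int main(void)
{
        unsigned long root_weight = 3072;       /* cpu_rq()->load.weight (also the root h_load) */
        unsigned long grp_se_weight = 1024;     /* the group entity's weight on the root rq     */
        unsigned long grp_cfs_weight = 2048;    /* weight queued on the group's cfs_rq          */
        unsigned long task_weight = 1024;       /* p->se.load.weight                            */

        /* tg_load_down(): the group's share of its parent's hierarchical load. */
        unsigned long h_load = root_weight * grp_se_weight / (root_weight + 1);

        /* task_h_load(): the task's share of the group's hierarchical load. */
        unsigned long task_h = task_weight * h_load / (grp_cfs_weight + 1);

        printf("group h_load=%lu task_h_load=%lu\n", h_load, task_h);   /* 1023 511 */
        return 0;
}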
4184
4185/********** Helpers for find_busiest_group ************************/
4186/*
4187 * sd_lb_stats - Structure to store the statistics of a sched_domain
4188 * during load balancing.
4189 */
4190struct sd_lb_stats {
4191 struct sched_group *busiest; /* Busiest group in this sd */
4192 struct sched_group *this; /* Local group in this sd */
4193 unsigned long total_load; /* Total load of all groups in sd */
4194 unsigned long total_pwr; /* Total power of all groups in sd */
4195 unsigned long avg_load; /* Average load across all groups in sd */
4196
4197 /** Statistics of this group */
4198 unsigned long this_load;
4199 unsigned long this_load_per_task;
4200 unsigned long this_nr_running;
4201 unsigned long this_has_capacity;
4202 unsigned int this_idle_cpus;
4203
4204 /* Statistics of the busiest group */
4205 unsigned int busiest_idle_cpus;
4206 unsigned long max_load;
4207 unsigned long busiest_load_per_task;
4208 unsigned long busiest_nr_running;
4209 unsigned long busiest_group_capacity;
4210 unsigned long busiest_has_capacity;
4211 unsigned int busiest_group_weight;
4212
4213 int group_imb; /* Is there imbalance in this sd */
4214};
4215
4216/*
4217 * sg_lb_stats - stats of a sched_group required for load_balancing
4218 */
4219struct sg_lb_stats {
4220 unsigned long avg_load; /*Avg load across the CPUs of the group */
4221 unsigned long group_load; /* Total load over the CPUs of the group */
4222 unsigned long sum_nr_running; /* Nr tasks running in the group */
4223 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4224 unsigned long group_capacity;
4225 unsigned long idle_cpus;
4226 unsigned long group_weight;
4227 int group_imb; /* Is there an imbalance in the group ? */
4228 int group_has_capacity; /* Is there extra capacity in the group? */
4229};
4230
4231/**
4232 * get_sd_load_idx - Obtain the load index for a given sched domain.
4233 * @sd: The sched_domain whose load_idx is to be obtained.
4234 * @idle: The idle status of the CPU whose sd load_idx is obtained.
4235 */
4236static inline int get_sd_load_idx(struct sched_domain *sd,
4237 enum cpu_idle_type idle)
4238{
4239 int load_idx;
4240
4241 switch (idle) {
4242 case CPU_NOT_IDLE:
4243 load_idx = sd->busy_idx;
4244 break;
4245
4246 case CPU_NEWLY_IDLE:
4247 load_idx = sd->newidle_idx;
4248 break;
4249 default:
4250 load_idx = sd->idle_idx;
4251 break;
4252 }
4253
4254 return load_idx;
4255}
4256
4257unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4258{
4259 return SCHED_POWER_SCALE;
4260}
4261
4262unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4263{
4264 return default_scale_freq_power(sd, cpu);
4265}
4266
4267unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4268{
4269 unsigned long weight = sd->span_weight;
4270 unsigned long smt_gain = sd->smt_gain;
4271
4272 smt_gain /= weight;
4273
4274 return smt_gain;
4275}
4276
4277unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4278{
4279 return default_scale_smt_power(sd, cpu);
4280}
4281
4282unsigned long scale_rt_power(int cpu)
4283{
4284 struct rq *rq = cpu_rq(cpu);
4285 u64 total, available, age_stamp, avg;
4286
4287 /*
4288 * Since we're reading these variables without serialization make sure
4289 * we read them once before doing sanity checks on them.
4290 */
4291 age_stamp = ACCESS_ONCE(rq->age_stamp);
4292 avg = ACCESS_ONCE(rq->rt_avg);
4293
4294 total = sched_avg_period() + (rq->clock - age_stamp);
4295
4296 if (unlikely(total < avg)) {
4297 /* Ensures that power won't end up being negative */
4298 available = 0;
4299 } else {
4300 available = total - avg;
4301 }
4302
4303 if (unlikely((s64)total < SCHED_POWER_SCALE))
4304 total = SCHED_POWER_SCALE;
4305
4306 total >>= SCHED_POWER_SHIFT;
4307
4308 return div_u64(available, total);
4309}
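/*
 * A user-space sketch of the rt scaling above, assuming
 * SCHED_POWER_SCALE == 1024 (SCHED_POWER_SHIFT == 10); the names and
 * figures are illustrative, not kernel API. With a 1ms averaging
 * period and 250us of rt activity, roughly three quarters of the
 * nominal power is left for CFS.
 */
#include <stdio.h>

#define POWER_SCALE 1024ULL
#define POWER_SHIFT 10

static unsigned long scale_rt(unsigned long long total_ns,
                              unsigned long long rt_avg_ns)
{
        unsigned long long available = total_ns > rt_avg_ns ?
                                       total_ns - rt_avg_ns : 0;

        if (total_ns < POWER_SCALE)
                total_ns = POWER_SCALE;

        return (unsigned long)(available / (total_ns >> POWER_SHIFT));
}

int main(void)
{
        printf("%lu\n", scale_rt(1000000, 250000));     /* 768 of 1024 */
        return 0;
}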
4310
4311static void update_cpu_power(struct sched_domain *sd, int cpu)
4312{
4313 unsigned long weight = sd->span_weight;
4314 unsigned long power = SCHED_POWER_SCALE;
4315 struct sched_group *sdg = sd->groups;
4316
4317 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4318 if (sched_feat(ARCH_POWER))
4319 power *= arch_scale_smt_power(sd, cpu);
4320 else
4321 power *= default_scale_smt_power(sd, cpu);
4322
4323 power >>= SCHED_POWER_SHIFT;
4324 }
4325
4326 sdg->sgp->power_orig = power;
4327
4328 if (sched_feat(ARCH_POWER))
4329 power *= arch_scale_freq_power(sd, cpu);
4330 else
4331 power *= default_scale_freq_power(sd, cpu);
4332
4333 power >>= SCHED_POWER_SHIFT;
4334
4335 power *= scale_rt_power(cpu);
4336 power >>= SCHED_POWER_SHIFT;
4337
4338 if (!power)
4339 power = 1;
4340
4341 cpu_rq(cpu)->cpu_power = power;
4342 sdg->sgp->power = power;
4343}
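/*
 * Worked composition of cpu_power for the code above, as a user-space
 * sketch. It assumes SCHED_POWER_SCALE == 1024 and an smt_gain of 1178
 * for two SMT siblings; the default frequency scaling is the identity
 * and is omitted. The rt factor of 768/1024 matches the
 * scale_rt_power() example earlier. All figures are illustrative.
 */
#include <stdio.h>

int main(void)
{
        unsigned long power = 1024;             /* SCHED_POWER_SCALE         */
        unsigned long smt_gain = 1178, weight = 2;
        unsigned long rt_factor = 768;          /* from scale_rt_power()     */

        power = power * (smt_gain / weight) >> 10;      /* siblings share a core */
        power = power * rt_factor >> 10;                /* rt/irq pressure       */

        printf("cpu_power = %lu\n", power);     /* 441 of a nominal 1024 */
        return 0;
}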
4344
4345void update_group_power(struct sched_domain *sd, int cpu)
4346{
4347 struct sched_domain *child = sd->child;
4348 struct sched_group *group, *sdg = sd->groups;
4349 unsigned long power;
4350 unsigned long interval;
4351
4352 interval = msecs_to_jiffies(sd->balance_interval);
4353 interval = clamp(interval, 1UL, max_load_balance_interval);
4354 sdg->sgp->next_update = jiffies + interval;
4355
4356 if (!child) {
4357 update_cpu_power(sd, cpu);
4358 return;
4359 }
4360
4361 power = 0;
4362
4363 if (child->flags & SD_OVERLAP) {
4364 /*
4365 * SD_OVERLAP domains cannot assume that child groups
4366 * span the current group.
4367 */
4368
4369 for_each_cpu(cpu, sched_group_cpus(sdg))
4370 power += power_of(cpu);
4371 } else {
4372 /*
4373 * !SD_OVERLAP domains can assume that child groups
4374 * span the current group.
4375 */
4376
4377 group = child->groups;
4378 do {
4379 power += group->sgp->power;
4380 group = group->next;
4381 } while (group != child->groups);
4382 }
4383
4384 sdg->sgp->power_orig = sdg->sgp->power = power;
4385}
4386
4387/*
4388 * Try to fix up capacity for tiny siblings; this is needed when
4389 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4390 * which on its own isn't powerful enough.
4391 *
4392 * See update_sd_pick_busiest() and check_asym_packing().
4393 */
4394static inline int
4395fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4396{
4397 /*
4398 * Only siblings can have significantly less than SCHED_POWER_SCALE
4399 */
4400 if (!(sd->flags & SD_SHARE_CPUPOWER))
4401 return 0;
4402
4403 /*
4404 * If ~90% of the cpu_power is still there, we're good.
4405 */
4406 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4407 return 1;
4408
4409 return 0;
4410}
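/*
 * Quick check of the ~90% threshold above (user-space sketch with
 * illustrative values): power * 32 > power_orig * 29 holds exactly
 * when power exceeds roughly 90.6% of power_orig.
 */
#include <stdio.h>

int main(void)
{
        unsigned long power_orig = 589;         /* an SMT sibling's nominal power */

        printf("%d\n", 540UL * 32 > power_orig * 29);   /* 1: ~91.7%, capacity bumped to 1 */
        printf("%d\n", 500UL * 32 > power_orig * 29);   /* 0: ~84.9%, capacity stays 0     */
        return 0;
}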
4411
4412/**
4413 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4414 * @env: The load balancing environment.
4415 * @group: sched_group whose statistics are to be updated.
4416 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4417 * @local_group: Does group contain this_cpu.
4418 * @balance: Should we balance.
4419 * @sgs: variable to hold the statistics for this group.
4420 */
4421static inline void update_sg_lb_stats(struct lb_env *env,
4422 struct sched_group *group, int load_idx,
4423 int local_group, int *balance, struct sg_lb_stats *sgs)
4424{
4425 unsigned long nr_running, max_nr_running, min_nr_running;
4426 unsigned long load, max_cpu_load, min_cpu_load;
4427 unsigned int balance_cpu = -1, first_idle_cpu = 0;
4428 unsigned long avg_load_per_task = 0;
4429 int i;
4430
4431 if (local_group)
4432 balance_cpu = group_balance_cpu(group);
4433
4434 /* Tally up the load of all CPUs in the group */
4435 max_cpu_load = 0;
4436 min_cpu_load = ~0UL;
4437 max_nr_running = 0;
4438 min_nr_running = ~0UL;
4439
4440 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4441 struct rq *rq = cpu_rq(i);
4442
4443 nr_running = rq->nr_running;
4444
4445 /* Bias balancing toward cpus of our domain */
4446 if (local_group) {
4447 if (idle_cpu(i) && !first_idle_cpu &&
4448 cpumask_test_cpu(i, sched_group_mask(group))) {
4449 first_idle_cpu = 1;
4450 balance_cpu = i;
4451 }
4452
4453 load = target_load(i, load_idx);
4454 } else {
4455 load = source_load(i, load_idx);
4456 if (load > max_cpu_load)
4457 max_cpu_load = load;
4458 if (min_cpu_load > load)
4459 min_cpu_load = load;
4460
4461 if (nr_running > max_nr_running)
4462 max_nr_running = nr_running;
4463 if (min_nr_running > nr_running)
4464 min_nr_running = nr_running;
4465 }
4466
4467 sgs->group_load += load;
4468 sgs->sum_nr_running += nr_running;
4469 sgs->sum_weighted_load += weighted_cpuload(i);
4470 if (idle_cpu(i))
4471 sgs->idle_cpus++;
4472 }
4473
4474 /*
4475 * The first idle cpu, or the first (busiest) cpu in this sched group,
4476 * is eligible for doing load balancing at this domain and above.
4477 * In the newly idle case, we allow all the cpus to do the newly
4478 * idle load balance.
4479 */
4480 if (local_group) {
4481 if (env->idle != CPU_NEWLY_IDLE) {
4482 if (balance_cpu != env->dst_cpu) {
4483 *balance = 0;
4484 return;
4485 }
4486 update_group_power(env->sd, env->dst_cpu);
4487 } else if (time_after_eq(jiffies, group->sgp->next_update))
4488 update_group_power(env->sd, env->dst_cpu);
4489 }
4490
4491 /* Adjust by relative CPU power of the group */
4492 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4493
4494 /*
4495 * Consider the group unbalanced when the imbalance is larger
4496 * than the average weight of a task.
4497 *
4498 * APZ: with cgroup the avg task weight can vary wildly and
4499 * might not be a suitable number - should we keep a
4500 * normalized nr_running number somewhere that negates
4501 * the hierarchy?
4502 */
4503 if (sgs->sum_nr_running)
4504 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4505
4506 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4507 (max_nr_running - min_nr_running) > 1)
4508 sgs->group_imb = 1;
4509
4510 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4511 SCHED_POWER_SCALE);
4512 if (!sgs->group_capacity)
4513 sgs->group_capacity = fix_small_capacity(env->sd, group);
4514 sgs->group_weight = group->group_weight;
4515
4516 if (sgs->group_capacity > sgs->sum_nr_running)
4517 sgs->group_has_capacity = 1;
4518}
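/*
 * A sketch of the per-group figures computed above (user-space,
 * illustrative values, SCHED_POWER_SCALE == 1024): a two-cpu group
 * whose summed cpu_power is 1800 and whose total weighted load is 3072.
 */
#include <stdio.h>

int main(void)
{
        unsigned long group_load = 3072;
        unsigned long group_power = 1800;

        /* avg_load: load normalised by the group's effective power. */
        unsigned long avg_load = group_load * 1024 / group_power;

        /* group_capacity: DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE). */
        unsigned long capacity = (group_power + 512) / 1024;

        printf("avg_load=%lu capacity=%lu\n", avg_load, capacity);      /* 1747 2 */
        return 0;
}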
4519
4520/**
4521 * update_sd_pick_busiest - return 1 on busiest group
4522 * @env: The load balancing environment.
4523 * @sds: sched_domain statistics
4524 * @sg: sched_group candidate to be checked for being the busiest
4525 * @sgs: sched_group statistics
4526 *
4527 * Determine if @sg is a busier group than the previously selected
4528 * busiest group.
4529 */
4530static bool update_sd_pick_busiest(struct lb_env *env,
4531 struct sd_lb_stats *sds,
4532 struct sched_group *sg,
4533 struct sg_lb_stats *sgs)
4534{
4535 if (sgs->avg_load <= sds->max_load)
4536 return false;
4537
4538 if (sgs->sum_nr_running > sgs->group_capacity)
4539 return true;
4540
4541 if (sgs->group_imb)
4542 return true;
4543
4544 /*
4545 * ASYM_PACKING needs to move all the work to the lowest
4546 * numbered CPUs in the group, therefore mark all groups
4547 * higher than ourself as busy.
4548 */
4549 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4550 env->dst_cpu < group_first_cpu(sg)) {
4551 if (!sds->busiest)
4552 return true;
4553
4554 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4555 return true;
4556 }
4557
4558 return false;
4559}
4560
4561/**
4562 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4563 * @env: The load balancing environment.
4564 * @balance: Should we balance.
4565 * @sds: variable to hold the statistics for this sched_domain.
4566 */
4567static inline void update_sd_lb_stats(struct lb_env *env,
4568 int *balance, struct sd_lb_stats *sds)
4569{
4570 struct sched_domain *child = env->sd->child;
4571 struct sched_group *sg = env->sd->groups;
4572 struct sg_lb_stats sgs;
4573 int load_idx, prefer_sibling = 0;
4574
4575 if (child && child->flags & SD_PREFER_SIBLING)
4576 prefer_sibling = 1;
4577
4578 load_idx = get_sd_load_idx(env->sd, env->idle);
4579
4580 do {
4581 int local_group;
4582
4583 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4584 memset(&sgs, 0, sizeof(sgs));
4585 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4586
4587 if (local_group && !(*balance))
4588 return;
4589
4590 sds->total_load += sgs.group_load;
4591 sds->total_pwr += sg->sgp->power;
4592
4593 /*
4594 * In case the child domain prefers tasks go to siblings
4595 * first, lower the sg capacity to one so that we'll try
4596 * and move all the excess tasks away. We lower the capacity
4597 * of a group only if the local group has the capacity to fit
4598 * these excess tasks, i.e. nr_running < group_capacity. The
4599 * extra check prevents the case where you always pull from the
4600 * heaviest group when it is already under-utilized (possible when
4601 * a single large-weight task outweighs the other tasks on the system).
4602 */
4603 if (prefer_sibling && !local_group && sds->this_has_capacity)
4604 sgs.group_capacity = min(sgs.group_capacity, 1UL);
4605
4606 if (local_group) {
4607 sds->this_load = sgs.avg_load;
4608 sds->this = sg;
4609 sds->this_nr_running = sgs.sum_nr_running;
4610 sds->this_load_per_task = sgs.sum_weighted_load;
4611 sds->this_has_capacity = sgs.group_has_capacity;
4612 sds->this_idle_cpus = sgs.idle_cpus;
4613 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4614 sds->max_load = sgs.avg_load;
4615 sds->busiest = sg;
4616 sds->busiest_nr_running = sgs.sum_nr_running;
4617 sds->busiest_idle_cpus = sgs.idle_cpus;
4618 sds->busiest_group_capacity = sgs.group_capacity;
4619 sds->busiest_load_per_task = sgs.sum_weighted_load;
4620 sds->busiest_has_capacity = sgs.group_has_capacity;
4621 sds->busiest_group_weight = sgs.group_weight;
4622 sds->group_imb = sgs.group_imb;
4623 }
4624
4625 sg = sg->next;
4626 } while (sg != env->sd->groups);
4627}
4628
4629/**
4630 * check_asym_packing - Check to see if the group is packed into the
4631 * sched domain.
4632 *
4633 * This is primarily intended to be used at the sibling level. Some
4634 * cores like POWER7 prefer to use lower numbered SMT threads. In the
4635 * case of POWER7, it can move to lower SMT modes only when higher
4636 * threads are idle. When in lower SMT modes, the threads will
4637 * perform better since they share fewer core resources. Hence when we
4638 * have idle threads, we want them to be the higher ones.
4639 *
4640 * This packing function is run on idle threads. It checks to see if
4641 * the busiest CPU in this domain (core in the P7 case) has a higher
4642 * CPU number than the packing function is being run on. Here we are
4643 * assuming a lower CPU number will be equivalent to a lower SMT thread
4644 * number.
4645 *
4646 * Returns 1 when packing is required and a task should be moved to
4647 * this CPU. The amount of the imbalance is returned in env->imbalance.
4648 *
4649 * @env: The load balancing environment.
4650 * @sds: Statistics of the sched_domain which is to be packed
4651 */
4652static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4653{
4654 int busiest_cpu;
4655
4656 if (!(env->sd->flags & SD_ASYM_PACKING))
4657 return 0;
4658
4659 if (!sds->busiest)
4660 return 0;
4661
4662 busiest_cpu = group_first_cpu(sds->busiest);
4663 if (env->dst_cpu > busiest_cpu)
4664 return 0;
4665
4666 env->imbalance = DIV_ROUND_CLOSEST(
4667 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4668
4669 return 1;
4670}
4671
4672/**
4673 * fix_small_imbalance - Calculate the minor imbalance that exists
4674 * amongst the groups of a sched_domain, during
4675 * load balancing.
4676 * @env: The load balancing environment.
4677 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4678 */
4679static inline
4680void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4681{
4682 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4683 unsigned int imbn = 2;
4684 unsigned long scaled_busy_load_per_task;
4685
4686 if (sds->this_nr_running) {
4687 sds->this_load_per_task /= sds->this_nr_running;
4688 if (sds->busiest_load_per_task >
4689 sds->this_load_per_task)
4690 imbn = 1;
4691 } else {
4692 sds->this_load_per_task =
4693 cpu_avg_load_per_task(env->dst_cpu);
4694 }
4695
4696 scaled_busy_load_per_task = sds->busiest_load_per_task
4697 * SCHED_POWER_SCALE;
4698 scaled_busy_load_per_task /= sds->busiest->sgp->power;
4699
4700 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4701 (scaled_busy_load_per_task * imbn)) {
4702 env->imbalance = sds->busiest_load_per_task;
4703 return;
4704 }
4705
4706 /*
4707 * OK, we don't have enough imbalance to justify moving tasks,
4708 * however we may be able to increase total CPU power used by
4709 * moving them.
4710 */
4711
4712 pwr_now += sds->busiest->sgp->power *
4713 min(sds->busiest_load_per_task, sds->max_load);
4714 pwr_now += sds->this->sgp->power *
4715 min(sds->this_load_per_task, sds->this_load);
4716 pwr_now /= SCHED_POWER_SCALE;
4717
4718 /* Amount of load we'd subtract */
4719 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4720 sds->busiest->sgp->power;
4721 if (sds->max_load > tmp)
4722 pwr_move += sds->busiest->sgp->power *
4723 min(sds->busiest_load_per_task, sds->max_load - tmp);
4724
4725 /* Amount of load we'd add */
4726 if (sds->max_load * sds->busiest->sgp->power <
4727 sds->busiest_load_per_task * SCHED_POWER_SCALE)
4728 tmp = (sds->max_load * sds->busiest->sgp->power) /
4729 sds->this->sgp->power;
4730 else
4731 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4732 sds->this->sgp->power;
4733 pwr_move += sds->this->sgp->power *
4734 min(sds->this_load_per_task, sds->this_load + tmp);
4735 pwr_move /= SCHED_POWER_SCALE;
4736
4737 /* Move if we gain throughput */
4738 if (pwr_move > pwr_now)
4739 env->imbalance = sds->busiest_load_per_task;
4740}
4741
4742/**
4743 * calculate_imbalance - Calculate the amount of imbalance present within the
4744 * groups of a given sched_domain during load balance.
4745 * @env: load balance environment
4746 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4747 */
4748static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4749{
4750 unsigned long max_pull, load_above_capacity = ~0UL;
4751
4752 sds->busiest_load_per_task /= sds->busiest_nr_running;
4753 if (sds->group_imb) {
4754 sds->busiest_load_per_task =
4755 min(sds->busiest_load_per_task, sds->avg_load);
4756 }
4757
4758 /*
4759 * In the presence of smp nice balancing, certain scenarios can have
4760 * max load less than avg load (as we skip the groups at or below
4761 * their cpu_power while calculating max_load).
4762 */
4763 if (sds->max_load < sds->avg_load) {
4764 env->imbalance = 0;
4765 return fix_small_imbalance(env, sds);
4766 }
4767
4768 if (!sds->group_imb) {
4769 /*
4770 * Don't want to pull so many tasks that a group would go idle.
4771 */
4772 load_above_capacity = (sds->busiest_nr_running -
4773 sds->busiest_group_capacity);
4774
4775 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4776
4777 load_above_capacity /= sds->busiest->sgp->power;
4778 }
4779
4780 /*
4781 * We're trying to get all the cpus to the average_load, so we don't
4782 * want to push ourselves above the average load, nor do we wish to
4783 * reduce the max loaded cpu below the average load. At the same time,
4784 * we also don't want to reduce the group load below the group capacity
4785 * (so that we can implement power-savings policies etc). Thus we look
4786 * for the minimum possible imbalance.
4787 * Be careful of negative numbers as they'll appear as very large values
4788 * with unsigned longs.
4789 */
4790 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4791
4792 /* How much load to actually move to equalise the imbalance */
4793 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4794 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4795 / SCHED_POWER_SCALE;
4796
4797 /*
4798 * if *imbalance is less than the average load per runnable task
4799 * there is no guarantee that any tasks will be moved so we'll have
4800 * a think about bumping its value to force at least one task to be
4801 * moved
4802 */
4803 if (env->imbalance < sds->busiest_load_per_task)
4804 return fix_small_imbalance(env, sds);
4805
4806}
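/*
 * Worked example of the imbalance formula above (user-space sketch
 * with illustrative figures, SCHED_POWER_SCALE == 1024, and
 * load_above_capacity assumed not to be the limiting term): the
 * busiest group runs at avg_load 1747 with power 1800, the local
 * group at 800 with power 1024, and the domain average is 1200.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long max_load = 1747, avg_load = 1200, this_load = 800;
        unsigned long busiest_power = 1800, this_power = 1024;

        /* Pull no more than what brings busiest down to the average ... */
        unsigned long max_pull = max_load - avg_load;
        /* ... and no more than what lifts the local group up to it. */
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / 1024;

        printf("imbalance=%lu\n", imbalance);   /* 400 */
        return 0;
}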
4807
4808/******* find_busiest_group() helpers end here *********************/
4809
4810/**
4811 * find_busiest_group - Returns the busiest group within the sched_domain
4812 * if there is an imbalance. If there isn't an imbalance, and
4813 * the user has opted for power-savings, it returns a group whose
4814 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4815 * such a group exists.
4816 *
4817 * Also calculates the amount of weighted load which should be moved
4818 * to restore balance.
4819 *
4820 * @env: The load balancing environment.
4821 * @balance: Pointer to a variable indicating if this_cpu
4822 * is the appropriate cpu to perform load balancing at this level.
4823 *
4824 * Returns: - the busiest group if imbalance exists.
4825 * - If no imbalance and user has opted for power-savings balance,
4826 * return the least loaded group whose CPUs can be
4827 * put to idle by rebalancing its tasks onto our group.
4828 */
4829static struct sched_group *
4830find_busiest_group(struct lb_env *env, int *balance)
4831{
4832 struct sd_lb_stats sds;
4833
4834 memset(&sds, 0, sizeof(sds));
4835
4836 /*
4837 * Compute the various statistics relevant for load balancing at
4838 * this level.
4839 */
4840 update_sd_lb_stats(env, balance, &sds);
4841
4842 /*
4843 * this_cpu is not the appropriate cpu to perform load balancing at
4844 * this level.
4845 */
4846 if (!(*balance))
4847 goto ret;
4848
4849 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4850 check_asym_packing(env, &sds))
4851 return sds.busiest;
4852
4853 /* There is no busy sibling group to pull tasks from */
4854 if (!sds.busiest || sds.busiest_nr_running == 0)
4855 goto out_balanced;
4856
4857 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4858
4859 /*
4860 * If the busiest group is imbalanced the below checks don't
4861 * work because they assume all things are equal, which typically
4862 * isn't true due to cpus_allowed constraints and the like.
4863 */
4864 if (sds.group_imb)
4865 goto force_balance;
4866
4867 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4868 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4869 !sds.busiest_has_capacity)
4870 goto force_balance;
4871
4872 /*
4873 * If the local group is busier than the selected busiest group,
4874 * don't try to pull any tasks.
4875 */
4876 if (sds.this_load >= sds.max_load)
4877 goto out_balanced;
4878
4879 /*
4880 * Don't pull any tasks if this group is already above the domain
4881 * average load.
4882 */
4883 if (sds.this_load >= sds.avg_load)
4884 goto out_balanced;
4885
4886 if (env->idle == CPU_IDLE) {
4887 /*
4888 * This cpu is idle. If the busiest group doesn't have
4889 * more tasks than the number of available cpus, and there
4890 * is no imbalance between this and the busiest group with
4891 * respect to idle cpus, it is balanced.
4892 */
4893 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4894 sds.busiest_nr_running <= sds.busiest_group_weight)
4895 goto out_balanced;
4896 } else {
4897 /*
4898 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4899 * imbalance_pct to be conservative.
4900 */
4901 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4902 goto out_balanced;
4903 }
4904
4905force_balance:
4906 /* Looks like there is an imbalance. Compute it */
4907 calculate_imbalance(env, &sds);
4908 return sds.busiest;
4909
4910out_balanced:
4911ret:
4912 env->imbalance = 0;
4913 return NULL;
4914}
4915
4916/*
4917 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4918 */
4919static struct rq *find_busiest_queue(struct lb_env *env,
4920 struct sched_group *group)
4921{
4922 struct rq *busiest = NULL, *rq;
4923 unsigned long max_load = 0;
4924 int i;
4925
4926 for_each_cpu(i, sched_group_cpus(group)) {
4927 unsigned long power = power_of(i);
4928 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4929 SCHED_POWER_SCALE);
4930 unsigned long wl;
4931
4932 if (!capacity)
4933 capacity = fix_small_capacity(env->sd, group);
4934
4935 if (!cpumask_test_cpu(i, env->cpus))
4936 continue;
4937
4938 rq = cpu_rq(i);
4939 wl = weighted_cpuload(i);
4940
4941 /*
4942 * When comparing with imbalance, use weighted_cpuload()
4943 * which is not scaled with the cpu power.
4944 */
4945 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4946 continue;
4947
4948 /*
4949 * For the load comparisons with the other cpus, consider
4950 * the weighted_cpuload() scaled with the cpu power, so that
4951 * the load can be moved away from the cpu that is potentially
4952 * running at a lower capacity.
4953 */
4954 wl = (wl * SCHED_POWER_SCALE) / power;
4955
4956 if (wl > max_load) {
4957 max_load = wl;
4958 busiest = rq;
4959 }
4960 }
4961
4962 return busiest;
4963}
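/*
 * A sketch of the power-scaled load comparison above (user-space,
 * illustrative figures, SCHED_POWER_SCALE == 1024): the same raw
 * weighted load of 2048 looks considerably heavier on a cpu whose
 * effective power has dropped to 600.
 */
#include <stdio.h>

int main(void)
{
        unsigned long wl = 2048;

        printf("full power: %lu\n", wl * 1024 / 1024);  /* 2048 */
        printf("power 600 : %lu\n", wl * 1024 / 600);   /* 3495 */
        return 0;
}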
4964
4965/*
4966 * Max backoff if we encounter pinned tasks. The exact value is fairly
4967 * arbitrary, so long as it is large enough.
4968 */
4969#define MAX_PINNED_INTERVAL 512
4970
4971/* Working cpumask for load_balance and load_balance_newidle. */
4972DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4973
4974static int need_active_balance(struct lb_env *env)
4975{
4976 struct sched_domain *sd = env->sd;
4977
4978 if (env->idle == CPU_NEWLY_IDLE) {
4979
4980 /*
4981 * ASYM_PACKING needs to force migrate tasks from busy but
4982 * higher numbered CPUs in order to pack all tasks in the
4983 * lowest numbered CPUs.
4984 */
4985 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4986 return 1;
4987 }
4988
4989 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4990}
4991
4992static int active_load_balance_cpu_stop(void *data);
4993
4994/*
4995 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4996 * tasks if there is an imbalance.
4997 */
4998static int load_balance(int this_cpu, struct rq *this_rq,
4999 struct sched_domain *sd, enum cpu_idle_type idle,
5000 int *balance)
5001{
5002 int ld_moved, cur_ld_moved, active_balance = 0;
5003 int lb_iterations, max_lb_iterations;
5004 struct sched_group *group;
5005 struct rq *busiest;
5006 unsigned long flags;
5007 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
5008
5009 struct lb_env env = {
5010 .sd = sd,
5011 .dst_cpu = this_cpu,
5012 .dst_rq = this_rq,
5013 .dst_grpmask = sched_group_cpus(sd->groups),
5014 .idle = idle,
5015 .loop_break = sched_nr_migrate_break,
5016 .cpus = cpus,
5017 };
5018
5019 cpumask_copy(cpus, cpu_active_mask);
5020 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5021
5022 schedstat_inc(sd, lb_count[idle]);
5023
5024redo:
5025 group = find_busiest_group(&env, balance);
5026
5027 if (*balance == 0)
5028 goto out_balanced;
5029
5030 if (!group) {
5031 schedstat_inc(sd, lb_nobusyg[idle]);
5032 goto out_balanced;
5033 }
5034
5035 busiest = find_busiest_queue(&env, group);
5036 if (!busiest) {
5037 schedstat_inc(sd, lb_nobusyq[idle]);
5038 goto out_balanced;
5039 }
5040
5041 BUG_ON(busiest == env.dst_rq);
5042
5043 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5044
5045 ld_moved = 0;
5046 lb_iterations = 1;
5047 if (busiest->nr_running > 1) {
5048 /*
5049 * Attempt to move tasks. If find_busiest_group has found
5050 * an imbalance but busiest->nr_running <= 1, the group is
5051 * still unbalanced. ld_moved simply stays zero, so it is
5052 * correctly treated as an imbalance.
5053 */
5054 env.flags |= LBF_ALL_PINNED;
5055 env.src_cpu = busiest->cpu;
5056 env.src_rq = busiest;
5057 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5058
5059 update_h_load(env.src_cpu);
5060more_balance:
5061 local_irq_save(flags);
5062 double_rq_lock(env.dst_rq, busiest);
5063
5064 /*
5065 * cur_ld_moved - load moved in current iteration
5066 * ld_moved - cumulative load moved across iterations
5067 */
5068 cur_ld_moved = move_tasks(&env);
5069 ld_moved += cur_ld_moved;
5070 double_rq_unlock(env.dst_rq, busiest);
5071 local_irq_restore(flags);
5072
5073 if (env.flags & LBF_NEED_BREAK) {
5074 env.flags &= ~LBF_NEED_BREAK;
5075 goto more_balance;
5076 }
5077
5078 /*
5079 * some other cpu did the load balance for us.
5080 */
5081 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5082 resched_cpu(env.dst_cpu);
5083
5084 /*
5085 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5086 * us and move them to an alternate dst_cpu in our sched_group
5087 * where they can run. The upper limit on how many times we
5088 * iterate on the same src_cpu depends on the number of cpus in our
5089 * sched_group.
5090 *
5091 * This changes load balance semantics a bit on who can move
5092 * load to a given_cpu. In addition to the given_cpu itself
5093 * (or an ilb_cpu acting on its behalf where given_cpu is
5094 * nohz-idle), we now have balance_cpu in a position to move
5095 * load to given_cpu. In rare situations, this may cause
5096 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5097 * _independently_ and at the _same_ time to move some load to
5098 * given_cpu), causing excess load to be moved to given_cpu.
5099 * This, however, should not happen often in practice, and
5100 * subsequent load balance cycles should correct the
5101 * excess load moved.
5102 */
5103 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
5104 lb_iterations++ < max_lb_iterations) {
5105
5106 env.dst_rq = cpu_rq(env.new_dst_cpu);
5107 env.dst_cpu = env.new_dst_cpu;
5108 env.flags &= ~LBF_SOME_PINNED;
5109 env.loop = 0;
5110 env.loop_break = sched_nr_migrate_break;
5111 /*
5112 * Go back to "more_balance" rather than "redo" since we
5113 * need to continue with same src_cpu.
5114 */
5115 goto more_balance;
5116 }
5117
5118 /* All tasks on this runqueue were pinned by CPU affinity */
5119 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5120 cpumask_clear_cpu(cpu_of(busiest), cpus);
5121 if (!cpumask_empty(cpus)) {
5122 env.loop = 0;
5123 env.loop_break = sched_nr_migrate_break;
5124 goto redo;
5125 }
5126 goto out_balanced;
5127 }
5128 }
5129
5130 if (!ld_moved) {
5131 schedstat_inc(sd, lb_failed[idle]);
5132 /*
5133 * Increment the failure counter only on periodic balance.
5134 * We do not want newidle balance, which can be very
5135 * frequent, pollute the failure counter causing
5136 * excessive cache_hot migrations and active balances.
5137 */
5138 if (idle != CPU_NEWLY_IDLE)
5139 sd->nr_balance_failed++;
5140
5141 if (need_active_balance(&env)) {
5142 raw_spin_lock_irqsave(&busiest->lock, flags);
5143
5144 /* don't kick the active_load_balance_cpu_stop,
5145 * if the curr task on busiest cpu can't be
5146 * moved to this_cpu
5147 */
5148 if (!cpumask_test_cpu(this_cpu,
5149 tsk_cpus_allowed(busiest->curr))) {
5150 raw_spin_unlock_irqrestore(&busiest->lock,
5151 flags);
5152 env.flags |= LBF_ALL_PINNED;
5153 goto out_one_pinned;
5154 }
5155
5156 /*
5157 * ->active_balance synchronizes accesses to
5158 * ->active_balance_work. Once set, it's cleared
5159 * only after active load balance is finished.
5160 */
5161 if (!busiest->active_balance) {
5162 busiest->active_balance = 1;
5163 busiest->push_cpu = this_cpu;
5164 active_balance = 1;
5165 }
5166 raw_spin_unlock_irqrestore(&busiest->lock, flags);
5167
5168 if (active_balance) {
5169 stop_one_cpu_nowait(cpu_of(busiest),
5170 active_load_balance_cpu_stop, busiest,
5171 &busiest->active_balance_work);
5172 }
5173
5174 /*
5175 * We've kicked active balancing, reset the failure
5176 * counter.
5177 */
5178 sd->nr_balance_failed = sd->cache_nice_tries+1;
5179 }
5180 } else
5181 sd->nr_balance_failed = 0;
5182
5183 if (likely(!active_balance)) {
5184 /* We were unbalanced, so reset the balancing interval */
5185 sd->balance_interval = sd->min_interval;
5186 } else {
5187 /*
5188 * If we've begun active balancing, start to back off. This
5189 * case may not be covered by the all_pinned logic if there
5190 * is only 1 task on the busy runqueue (because we don't call
5191 * move_tasks).
5192 */
5193 if (sd->balance_interval < sd->max_interval)
5194 sd->balance_interval *= 2;
5195 }
5196
5197 goto out;
5198
5199out_balanced:
5200 schedstat_inc(sd, lb_balanced[idle]);
5201
5202 sd->nr_balance_failed = 0;
5203
5204out_one_pinned:
5205 /* tune up the balancing interval */
5206 if (((env.flags & LBF_ALL_PINNED) &&
5207 sd->balance_interval < MAX_PINNED_INTERVAL) ||
5208 (sd->balance_interval < sd->max_interval))
5209 sd->balance_interval *= 2;
5210
5211 ld_moved = 0;
5212out:
5213 return ld_moved;
5214}
5215
5216/*
5217 * idle_balance is called by schedule() if this_cpu is about to become
5218 * idle. Attempts to pull tasks from other CPUs.
5219 */
5220void idle_balance(int this_cpu, struct rq *this_rq)
5221{
5222 struct sched_domain *sd;
5223 int pulled_task = 0;
5224 unsigned long next_balance = jiffies + HZ;
5225
5226 this_rq->idle_stamp = this_rq->clock;
5227
5228 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5229 return;
5230
5231 update_rq_runnable_avg(this_rq, 1);
5232
5233 /*
5234 * Drop the rq->lock, but keep IRQ/preempt disabled.
5235 */
5236 raw_spin_unlock(&this_rq->lock);
5237
5238 update_blocked_averages(this_cpu);
5239 rcu_read_lock();
5240 for_each_domain(this_cpu, sd) {
5241 unsigned long interval;
5242 int balance = 1;
5243
5244 if (!(sd->flags & SD_LOAD_BALANCE))
5245 continue;
5246
5247 if (sd->flags & SD_BALANCE_NEWIDLE) {
5248 /* If we've pulled tasks over stop searching: */
5249 pulled_task = load_balance(this_cpu, this_rq,
5250 sd, CPU_NEWLY_IDLE, &balance);
5251 }
5252
5253 interval = msecs_to_jiffies(sd->balance_interval);
5254 if (time_after(next_balance, sd->last_balance + interval))
5255 next_balance = sd->last_balance + interval;
5256 if (pulled_task) {
5257 this_rq->idle_stamp = 0;
5258 break;
5259 }
5260 }
5261 rcu_read_unlock();
5262
5263 raw_spin_lock(&this_rq->lock);
5264
5265 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5266 /*
5267 * We are going idle. next_balance may be set based on
5268 * a busy processor. So reset next_balance.
5269 */
5270 this_rq->next_balance = next_balance;
5271 }
5272}
5273
5274/*
5275 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5276 * running tasks off the busiest CPU onto idle CPUs. It requires at
5277 * least 1 task to be running on each physical CPU where possible, and
5278 * avoids physical / logical imbalances.
5279 */
5280static int active_load_balance_cpu_stop(void *data)
5281{
5282 struct rq *busiest_rq = data;
5283 int busiest_cpu = cpu_of(busiest_rq);
5284 int target_cpu = busiest_rq->push_cpu;
5285 struct rq *target_rq = cpu_rq(target_cpu);
5286 struct sched_domain *sd;
5287
5288 raw_spin_lock_irq(&busiest_rq->lock);
5289
5290 /* make sure the requested cpu hasn't gone down in the meantime */
5291 if (unlikely(busiest_cpu != smp_processor_id() ||
5292 !busiest_rq->active_balance))
5293 goto out_unlock;
5294
5295 /* Is there any task to move? */
5296 if (busiest_rq->nr_running <= 1)
5297 goto out_unlock;
5298
5299 /*
5300 * This condition is "impossible"; if it occurs
5301 * we need to fix it. Originally reported by
5302 * Bjorn Helgaas on a 128-cpu setup.
5303 */
5304 BUG_ON(busiest_rq == target_rq);
5305
5306 /* move a task from busiest_rq to target_rq */
5307 double_lock_balance(busiest_rq, target_rq);
5308
5309 /* Search for an sd spanning us and the target CPU. */
5310 rcu_read_lock();
5311 for_each_domain(target_cpu, sd) {
5312 if ((sd->flags & SD_LOAD_BALANCE) &&
5313 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5314 break;
5315 }
5316
5317 if (likely(sd)) {
5318 struct lb_env env = {
5319 .sd = sd,
5320 .dst_cpu = target_cpu,
5321 .dst_rq = target_rq,
5322 .src_cpu = busiest_rq->cpu,
5323 .src_rq = busiest_rq,
5324 .idle = CPU_IDLE,
5325 };
5326
5327 schedstat_inc(sd, alb_count);
5328
5329 if (move_one_task(&env))
5330 schedstat_inc(sd, alb_pushed);
5331 else
5332 schedstat_inc(sd, alb_failed);
5333 }
5334 rcu_read_unlock();
5335 double_unlock_balance(busiest_rq, target_rq);
5336out_unlock:
5337 busiest_rq->active_balance = 0;
5338 raw_spin_unlock_irq(&busiest_rq->lock);
5339 return 0;
5340}
5341
5342#ifdef CONFIG_NO_HZ
5343/*
5344 * idle load balancing details
5345 * - When one of the busy CPUs notices that idle rebalancing may be
5346 * needed, it kicks the idle load balancer, which then does idle
5347 * load balancing for all the idle CPUs.
5348 */
5349static struct {
5350 cpumask_var_t idle_cpus_mask;
5351 atomic_t nr_cpus;
5352 unsigned long next_balance; /* in jiffy units */
5353} nohz ____cacheline_aligned;
5354
5355static inline int find_new_ilb(int call_cpu)
5356{
5357 int ilb = cpumask_first(nohz.idle_cpus_mask);
5358
5359 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5360 return ilb;
5361
5362 return nr_cpu_ids;
5363}
5364
5365/*
5366 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5367 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
5368 * CPU (if there is one).
5369 */
5370static void nohz_balancer_kick(int cpu)
5371{
5372 int ilb_cpu;
5373
5374 nohz.next_balance++;
5375
5376 ilb_cpu = find_new_ilb(cpu);
5377
5378 if (ilb_cpu >= nr_cpu_ids)
5379 return;
5380
5381 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5382 return;
5383 /*
5384 * Use smp_send_reschedule() instead of resched_cpu().
5385 * This way we generate a sched IPI on the target cpu which
5386 * is idle. And the softirq performing nohz idle load balance
5387 * will be run before returning from the IPI.
5388 */
5389 smp_send_reschedule(ilb_cpu);
5390 return;
5391}
5392
5393static inline void nohz_balance_exit_idle(int cpu)
5394{
5395 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5396 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5397 atomic_dec(&nohz.nr_cpus);
5398 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5399 }
5400}
5401
5402static inline void set_cpu_sd_state_busy(void)
5403{
5404 struct sched_domain *sd;
5405 int cpu = smp_processor_id();
5406
5407 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5408 return;
5409 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5410
5411 rcu_read_lock();
5412 for_each_domain(cpu, sd)
5413 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5414 rcu_read_unlock();
5415}
5416
5417void set_cpu_sd_state_idle(void)
5418{
5419 struct sched_domain *sd;
5420 int cpu = smp_processor_id();
5421
5422 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5423 return;
5424 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5425
5426 rcu_read_lock();
5427 for_each_domain(cpu, sd)
5428 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5429 rcu_read_unlock();
5430}
5431
5432/*
5433 * This routine will record that the cpu is going idle with tick stopped.
5434 * This info will be used in performing idle load balancing in the future.
5435 */
5436void nohz_balance_enter_idle(int cpu)
5437{
5438 /*
5439 * If this cpu is going down, then nothing needs to be done.
5440 */
5441 if (!cpu_active(cpu))
5442 return;
5443
5444 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5445 return;
5446
5447 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5448 atomic_inc(&nohz.nr_cpus);
5449 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5450}
5451
5452static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5453 unsigned long action, void *hcpu)
5454{
5455 switch (action & ~CPU_TASKS_FROZEN) {
5456 case CPU_DYING:
5457 nohz_balance_exit_idle(smp_processor_id());
5458 return NOTIFY_OK;
5459 default:
5460 return NOTIFY_DONE;
5461 }
5462}
5463#endif
5464
5465static DEFINE_SPINLOCK(balancing);
5466
5467/*
5468 * Scale the max load_balance interval with the number of CPUs in the system.
5469 * This trades load-balance latency on larger machines for less cross talk.
5470 */
5471void update_max_interval(void)
5472{
5473 max_load_balance_interval = HZ*num_online_cpus()/10;
5474}
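/*
 * Quick arithmetic for the scaling above (user-space sketch with
 * illustrative values): with HZ == 250 and 8 cpus online the cap
 * works out to 200 jiffies, i.e. at most 800ms between attempts.
 */
#include <stdio.h>

int main(void)
{
        unsigned int hz = 250, online = 8;

        printf("max interval = %u jiffies\n", hz * online / 10);        /* 200 */
        return 0;
}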
5475
5476/*
5477 * It checks each scheduling domain to see if it is due to be balanced,
5478 * and initiates a balancing operation if so.
5479 *
5480 * Balancing parameters are set up in arch_init_sched_domains.
5481 */
5482static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5483{
5484 int balance = 1;
5485 struct rq *rq = cpu_rq(cpu);
5486 unsigned long interval;
5487 struct sched_domain *sd;
5488 /* Earliest time when we have to do rebalance again */
5489 unsigned long next_balance = jiffies + 60*HZ;
5490 int update_next_balance = 0;
5491 int need_serialize;
5492
5493 update_blocked_averages(cpu);
5494
5495 rcu_read_lock();
5496 for_each_domain(cpu, sd) {
5497 if (!(sd->flags & SD_LOAD_BALANCE))
5498 continue;
5499
5500 interval = sd->balance_interval;
5501 if (idle != CPU_IDLE)
5502 interval *= sd->busy_factor;
5503
5504 /* scale ms to jiffies */
5505 interval = msecs_to_jiffies(interval);
5506 interval = clamp(interval, 1UL, max_load_balance_interval);
5507
5508 need_serialize = sd->flags & SD_SERIALIZE;
5509
5510 if (need_serialize) {
5511 if (!spin_trylock(&balancing))
5512 goto out;
5513 }
5514
5515 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5516 if (load_balance(cpu, rq, sd, idle, &balance)) {
5517 /*
5518 * We've pulled tasks over, so we're no
5519 * longer idle.
5520 */
5521 idle = CPU_NOT_IDLE;
5522 }
5523 sd->last_balance = jiffies;
5524 }
5525 if (need_serialize)
5526 spin_unlock(&balancing);
5527out:
5528 if (time_after(next_balance, sd->last_balance + interval)) {
5529 next_balance = sd->last_balance + interval;
5530 update_next_balance = 1;
5531 }
5532
5533 /*
5534 * Stop the load balance at this level. There is another
5535 * CPU in our sched group which is doing load balancing more
5536 * actively.
5537 */
5538 if (!balance)
5539 break;
5540 }
5541 rcu_read_unlock();
5542
5543 /*
5544 * next_balance will be updated only when there is a need.
5545 * When the cpu is attached to null domain for ex, it will not be
5546 * updated.
5547 */
5548 if (likely(update_next_balance))
5549 rq->next_balance = next_balance;
5550}
5551
5552#ifdef CONFIG_NO_HZ
5553/*
5554 * In the CONFIG_NO_HZ case, the idle balance kickee will do the
5555 * rebalancing for all the cpus whose scheduler ticks are stopped.
5556 */
5557static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5558{
5559 struct rq *this_rq = cpu_rq(this_cpu);
5560 struct rq *rq;
5561 int balance_cpu;
5562
5563 if (idle != CPU_IDLE ||
5564 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5565 goto end;
5566
5567 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5568 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5569 continue;
5570
5571 /*
5572 * If this cpu gets work to do, stop the load balancing
5573 * work being done for other cpus. Next load
5574 * balancing owner will pick it up.
5575 */
5576 if (need_resched())
5577 break;
5578
5579 rq = cpu_rq(balance_cpu);
5580
5581 raw_spin_lock_irq(&rq->lock);
5582 update_rq_clock(rq);
5583 update_idle_cpu_load(rq);
5584 raw_spin_unlock_irq(&rq->lock);
5585
5586 rebalance_domains(balance_cpu, CPU_IDLE);
5587
5588 if (time_after(this_rq->next_balance, rq->next_balance))
5589 this_rq->next_balance = rq->next_balance;
5590 }
5591 nohz.next_balance = this_rq->next_balance;
5592end:
5593 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5594}
5595
5596/*
5597 * Current heuristic for kicking the idle load balancer in the presence
5598 * of an idle cpu is the system.
5599 * - This rq has more than one task.
5600 * - At any scheduler domain level, this cpu's scheduler group has multiple
5601 * busy cpus exceeding the group's power.
5602 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
5603 * domain span are idle.
5604 */
5605static inline int nohz_kick_needed(struct rq *rq, int cpu)
5606{
5607 unsigned long now = jiffies;
5608 struct sched_domain *sd;
5609
5610 if (unlikely(idle_cpu(cpu)))
5611 return 0;
5612
5613 /*
5614 * We may recently have been in ticked or tickless idle mode. At the first
5615 * busy tick after returning from idle, we will update the busy stats.
5616 */
5617 set_cpu_sd_state_busy();
5618 nohz_balance_exit_idle(cpu);
5619
5620 /*
5621 * None are in tickless mode and hence no need for NOHZ idle load
5622 * balancing.
5623 */
5624 if (likely(!atomic_read(&nohz.nr_cpus)))
5625 return 0;
5626
5627 if (time_before(now, nohz.next_balance))
5628 return 0;
5629
5630 if (rq->nr_running >= 2)
5631 goto need_kick;
5632
5633 rcu_read_lock();
5634 for_each_domain(cpu, sd) {
5635 struct sched_group *sg = sd->groups;
5636 struct sched_group_power *sgp = sg->sgp;
5637 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5638
5639 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5640 goto need_kick_unlock;
5641
5642 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5643 && (cpumask_first_and(nohz.idle_cpus_mask,
5644 sched_domain_span(sd)) < cpu))
5645 goto need_kick_unlock;
5646
5647 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5648 break;
5649 }
5650 rcu_read_unlock();
5651 return 0;
5652
5653need_kick_unlock:
5654 rcu_read_unlock();
5655need_kick:
5656 return 1;
5657}
5658#else
5659static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5660#endif
5661
5662/*
5663 * run_rebalance_domains is triggered when needed from the scheduler tick.
5664 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5665 */
5666static void run_rebalance_domains(struct softirq_action *h)
5667{
5668 int this_cpu = smp_processor_id();
5669 struct rq *this_rq = cpu_rq(this_cpu);
5670 enum cpu_idle_type idle = this_rq->idle_balance ?
5671 CPU_IDLE : CPU_NOT_IDLE;
5672
5673 rebalance_domains(this_cpu, idle);
5674
5675 /*
5676 * If this cpu has a pending nohz_balance_kick, then do the
5677 * balancing on behalf of the other idle cpus whose ticks are
5678 * stopped.
5679 */
5680 nohz_idle_balance(this_cpu, idle);
5681}
5682
5683static inline int on_null_domain(int cpu)
5684{
5685 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5686}
5687
5688/*
5689 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5690 */
5691void trigger_load_balance(struct rq *rq, int cpu)
5692{
5693 /* Don't need to rebalance while attached to NULL domain */
5694 if (time_after_eq(jiffies, rq->next_balance) &&
5695 likely(!on_null_domain(cpu)))
5696 raise_softirq(SCHED_SOFTIRQ);
5697#ifdef CONFIG_NO_HZ
5698 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5699 nohz_balancer_kick(cpu);
5700#endif
5701}
5702
5703static void rq_online_fair(struct rq *rq)
5704{
5705 update_sysctl();
5706}
5707
5708static void rq_offline_fair(struct rq *rq)
5709{
5710 update_sysctl();
5711
5712 /* Ensure any throttled groups are reachable by pick_next_task */
5713 unthrottle_offline_cfs_rqs(rq);
5714}
5715
5716#endif /* CONFIG_SMP */
5717
5718/*
5719 * scheduler tick hitting a task of our scheduling class:
5720 */
5721static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5722{
5723 struct cfs_rq *cfs_rq;
5724 struct sched_entity *se = &curr->se;
5725
5726 for_each_sched_entity(se) {
5727 cfs_rq = cfs_rq_of(se);
5728 entity_tick(cfs_rq, se, queued);
5729 }
5730
5731 if (sched_feat_numa(NUMA))
5732 task_tick_numa(rq, curr);
5733
5734 update_rq_runnable_avg(rq, 1);
5735}
5736
5737/*
5738 * called on fork with the child task as argument from the parent's context
5739 * - child not yet on the tasklist
5740 * - preemption disabled
5741 */
5742static void task_fork_fair(struct task_struct *p)
5743{
5744 struct cfs_rq *cfs_rq;
5745 struct sched_entity *se = &p->se, *curr;
5746 int this_cpu = smp_processor_id();
5747 struct rq *rq = this_rq();
5748 unsigned long flags;
5749
5750 raw_spin_lock_irqsave(&rq->lock, flags);
5751
5752 update_rq_clock(rq);
5753
5754 cfs_rq = task_cfs_rq(current);
5755 curr = cfs_rq->curr;
5756
5757 if (unlikely(task_cpu(p) != this_cpu)) {
5758 rcu_read_lock();
5759 __set_task_cpu(p, this_cpu);
5760 rcu_read_unlock();
5761 }
5762
5763 update_curr(cfs_rq);
5764
5765 if (curr)
5766 se->vruntime = curr->vruntime;
5767 place_entity(cfs_rq, se, 1);
5768
5769 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5770 /*
5771 * Upon rescheduling, sched_class::put_prev_task() will place
5772 * 'current' within the tree based on its new key value.
5773 */
5774 swap(curr->vruntime, se->vruntime);
5775 resched_task(rq->curr);
5776 }
5777
5778 se->vruntime -= cfs_rq->min_vruntime;
5779
5780 raw_spin_unlock_irqrestore(&rq->lock, flags);
5781}
5782
5783/*
5784 * Priority of the task has changed. Check to see if we preempt
5785 * the current task.
5786 */
5787static void
5788prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5789{
5790 if (!p->se.on_rq)
5791 return;
5792
5793 /*
5794 * Reschedule if we are currently running on this runqueue and
5795 * our priority decreased, or if we are not currently running on
5796 * this runqueue and our priority is higher than the current's
5797 */
5798 if (rq->curr == p) {
5799 if (p->prio > oldprio)
5800 resched_task(rq->curr);
5801 } else
5802 check_preempt_curr(rq, p, 0);
5803}
5804
5805static void switched_from_fair(struct rq *rq, struct task_struct *p)
5806{
5807 struct sched_entity *se = &p->se;
5808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5809
5810 /*
5811 * Ensure the task's vruntime is normalized, so that when its
5812 * switched back to the fair class the enqueue_entity(.flags=0) will
5813 * do the right thing.
5814 *
5815 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5816 * have normalized the vruntime, if it was !on_rq, then only when
5817 * the task is sleeping will it still have non-normalized vruntime.
5818 */
5819 if (!se->on_rq && p->state != TASK_RUNNING) {
5820 /*
5821 * Fix up our vruntime so that the current sleep doesn't
5822 * cause 'unlimited' sleep bonus.
5823 */
5824 place_entity(cfs_rq, se, 0);
5825 se->vruntime -= cfs_rq->min_vruntime;
5826 }
5827
5828#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5829 /*
5830 * Remove our load from contribution when we leave sched_fair
5831 * and ensure we don't carry in an old decay_count if we
5832 * switch back.
5833 */
5834 if (p->se.avg.decay_count) {
5835 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5836 __synchronize_entity_decay(&p->se);
5837 subtract_blocked_load_contrib(cfs_rq,
5838 p->se.avg.load_avg_contrib);
5839 }
5840#endif
5841}
5842
5843/*
5844 * We switched to the sched_fair class.
5845 */
5846static void switched_to_fair(struct rq *rq, struct task_struct *p)
5847{
5848 if (!p->se.on_rq)
5849 return;
5850
5851 /*
5852 * We were most likely switched from sched_rt, so
5853 * kick off the schedule if running, otherwise just see
5854 * if we can still preempt the current task.
5855 */
5856 if (rq->curr == p)
5857 resched_task(rq->curr);
5858 else
5859 check_preempt_curr(rq, p, 0);
5860}
5861
5862/* Account for a task changing its policy or group.
5863 *
5864 * This routine is mostly called to set cfs_rq->curr field when a task
5865 * migrates between groups/classes.
5866 */
5867static void set_curr_task_fair(struct rq *rq)
5868{
5869 struct sched_entity *se = &rq->curr->se;
5870
5871 for_each_sched_entity(se) {
5872 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5873
5874 set_next_entity(cfs_rq, se);
5875 /* ensure bandwidth has been allocated on our new cfs_rq */
5876 account_cfs_rq_runtime(cfs_rq, 0);
5877 }
5878}
5879
5880void init_cfs_rq(struct cfs_rq *cfs_rq)
5881{
5882 cfs_rq->tasks_timeline = RB_ROOT;
5883 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5884#ifndef CONFIG_64BIT
5885 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5886#endif
5887#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5888 atomic64_set(&cfs_rq->decay_counter, 1);
5889 atomic64_set(&cfs_rq->removed_load, 0);
5890#endif
5891}
5892
5893#ifdef CONFIG_FAIR_GROUP_SCHED
5894static void task_move_group_fair(struct task_struct *p, int on_rq)
5895{
5896 struct cfs_rq *cfs_rq;
5897 /*
5898 * If the task was not on the rq at the time of this cgroup movement
 5899 * it must have been asleep; sleeping tasks keep their ->vruntime
5900 * absolute on their old rq until wakeup (needed for the fair sleeper
5901 * bonus in place_entity()).
5902 *
5903 * If it was on the rq, we've just 'preempted' it, which does convert
5904 * ->vruntime to a relative base.
5905 *
5906 * Make sure both cases convert their relative position when migrating
5907 * to another cgroup's rq. This does somewhat interfere with the
5908 * fair sleeper stuff for the first placement, but who cares.
5909 */
5910 /*
5911 * When !on_rq, vruntime of the task has usually NOT been normalized.
5912 * But there are some cases where it has already been normalized:
5913 *
5914 * - Moving a forked child which is waiting for being woken up by
5915 * wake_up_new_task().
5916 * - Moving a task which has been woken up by try_to_wake_up() and
5917 * waiting for actually being woken up by sched_ttwu_pending().
5918 *
5919 * To prevent boost or penalty in the new cfs_rq caused by delta
5920 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5921 */
5922 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5923 on_rq = 1;
5924
5925 if (!on_rq)
5926 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5927 set_task_rq(p, task_cpu(p));
5928 if (!on_rq) {
5929 cfs_rq = cfs_rq_of(&p->se);
5930 p->se.vruntime += cfs_rq->min_vruntime;
5931#ifdef CONFIG_SMP
5932 /*
5933 * migrate_task_rq_fair() will have removed our previous
5934 * contribution, but we must synchronize for ongoing future
5935 * decay.
5936 */
5937 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5938 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5939#endif
5940 }
5941}
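
/*
 * Editor's note -- illustrative sketch, not part of this file: the
 * renormalization task_move_group_fair() performs for a task that is not
 * on the rq, reduced to its arithmetic.  'old_min' and 'new_min' are
 * hypothetical names for the min_vruntime of the source and destination
 * cfs_rq.
 */
static inline u64 cgroup_move_vruntime(u64 vruntime, u64 old_min, u64 new_min)
{
	/* keep the task's position relative to its queue's min_vruntime */
	return vruntime - old_min + new_min;
}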
5942
5943void free_fair_sched_group(struct task_group *tg)
5944{
5945 int i;
5946
5947 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5948
5949 for_each_possible_cpu(i) {
5950 if (tg->cfs_rq)
5951 kfree(tg->cfs_rq[i]);
5952 if (tg->se)
5953 kfree(tg->se[i]);
5954 }
5955
5956 kfree(tg->cfs_rq);
5957 kfree(tg->se);
5958}
5959
5960int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5961{
5962 struct cfs_rq *cfs_rq;
5963 struct sched_entity *se;
5964 int i;
5965
5966 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5967 if (!tg->cfs_rq)
5968 goto err;
5969 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5970 if (!tg->se)
5971 goto err;
5972
5973 tg->shares = NICE_0_LOAD;
5974
5975 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5976
5977 for_each_possible_cpu(i) {
5978 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5979 GFP_KERNEL, cpu_to_node(i));
5980 if (!cfs_rq)
5981 goto err;
5982
5983 se = kzalloc_node(sizeof(struct sched_entity),
5984 GFP_KERNEL, cpu_to_node(i));
5985 if (!se)
5986 goto err_free_rq;
5987
5988 init_cfs_rq(cfs_rq);
5989 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5990 }
5991
5992 return 1;
5993
5994err_free_rq:
5995 kfree(cfs_rq);
5996err:
5997 return 0;
5998}
5999
6000void unregister_fair_sched_group(struct task_group *tg, int cpu)
6001{
6002 struct rq *rq = cpu_rq(cpu);
6003 unsigned long flags;
6004
6005 /*
6006 * Only empty task groups can be destroyed; so we can speculatively
6007 * check on_list without danger of it being re-added.
6008 */
6009 if (!tg->cfs_rq[cpu]->on_list)
6010 return;
6011
6012 raw_spin_lock_irqsave(&rq->lock, flags);
6013 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6014 raw_spin_unlock_irqrestore(&rq->lock, flags);
6015}
6016
6017void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6018 struct sched_entity *se, int cpu,
6019 struct sched_entity *parent)
6020{
6021 struct rq *rq = cpu_rq(cpu);
6022
6023 cfs_rq->tg = tg;
6024 cfs_rq->rq = rq;
6025 init_cfs_rq_runtime(cfs_rq);
6026
6027 tg->cfs_rq[cpu] = cfs_rq;
6028 tg->se[cpu] = se;
6029
6030 /* se could be NULL for root_task_group */
6031 if (!se)
6032 return;
6033
6034 if (!parent)
6035 se->cfs_rq = &rq->cfs;
6036 else
6037 se->cfs_rq = parent->my_q;
6038
6039 se->my_q = cfs_rq;
6040 update_load_set(&se->load, 0);
6041 se->parent = parent;
6042}
6043
6044static DEFINE_MUTEX(shares_mutex);
6045
6046int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6047{
6048 int i;
6049 unsigned long flags;
6050
6051 /*
6052 * We can't change the weight of the root cgroup.
6053 */
6054 if (!tg->se[0])
6055 return -EINVAL;
6056
6057 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6058
6059 mutex_lock(&shares_mutex);
6060 if (tg->shares == shares)
6061 goto done;
6062
6063 tg->shares = shares;
6064 for_each_possible_cpu(i) {
6065 struct rq *rq = cpu_rq(i);
6066 struct sched_entity *se;
6067
6068 se = tg->se[i];
6069 /* Propagate contribution to hierarchy */
6070 raw_spin_lock_irqsave(&rq->lock, flags);
6071 for_each_sched_entity(se)
6072 update_cfs_shares(group_cfs_rq(se));
6073 raw_spin_unlock_irqrestore(&rq->lock, flags);
6074 }
6075
6076done:
6077 mutex_unlock(&shares_mutex);
6078 return 0;
6079}
6080#else /* CONFIG_FAIR_GROUP_SCHED */
6081
6082void free_fair_sched_group(struct task_group *tg) { }
6083
6084int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6085{
6086 return 1;
6087}
6088
6089void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6090
6091#endif /* CONFIG_FAIR_GROUP_SCHED */
6092
6093
6094static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
6095{
6096 struct sched_entity *se = &task->se;
6097 unsigned int rr_interval = 0;
6098
6099 /*
6100 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6101 * idle runqueue:
6102 */
6103 if (rq->cfs.load.weight)
6104 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6105
6106 return rr_interval;
6107}
6108
6109/*
6110 * All the scheduling class methods:
6111 */
6112const struct sched_class fair_sched_class = {
6113 .next = &idle_sched_class,
6114 .enqueue_task = enqueue_task_fair,
6115 .dequeue_task = dequeue_task_fair,
6116 .yield_task = yield_task_fair,
6117 .yield_to_task = yield_to_task_fair,
6118
6119 .check_preempt_curr = check_preempt_wakeup,
6120
6121 .pick_next_task = pick_next_task_fair,
6122 .put_prev_task = put_prev_task_fair,
6123
6124#ifdef CONFIG_SMP
6125 .select_task_rq = select_task_rq_fair,
6126#ifdef CONFIG_FAIR_GROUP_SCHED
6127 .migrate_task_rq = migrate_task_rq_fair,
6128#endif
6129 .rq_online = rq_online_fair,
6130 .rq_offline = rq_offline_fair,
6131
6132 .task_waking = task_waking_fair,
6133#endif
6134
6135 .set_curr_task = set_curr_task_fair,
6136 .task_tick = task_tick_fair,
6137 .task_fork = task_fork_fair,
6138
6139 .prio_changed = prio_changed_fair,
6140 .switched_from = switched_from_fair,
6141 .switched_to = switched_to_fair,
6142
6143 .get_rr_interval = get_rr_interval_fair,
6144
6145#ifdef CONFIG_FAIR_GROUP_SCHED
6146 .task_move_group = task_move_group_fair,
6147#endif
6148};
6149
6150#ifdef CONFIG_SCHED_DEBUG
6151void print_cfs_stats(struct seq_file *m, int cpu)
6152{
6153 struct cfs_rq *cfs_rq;
6154
6155 rcu_read_lock();
6156 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
6157 print_cfs_rq(m, cpu, cfs_rq);
6158 rcu_read_unlock();
6159}
6160#endif
6161
6162__init void init_sched_fair_class(void)
6163{
6164#ifdef CONFIG_SMP
6165 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6166
6167#ifdef CONFIG_NO_HZ
6168 nohz.next_balance = jiffies;
6169 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6170 cpu_notifier(sched_ilb_notifier, 0);
6171#endif
6172#endif /* SMP */
6173
6174}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
deleted file mode 100644
index 1ad1d2b5395..00000000000
--- a/kernel/sched/features.h
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * Only give sleepers 50% of their service deficit. This allows
3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart.
5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7
8/*
9 * Place new tasks ahead so that they do not starve already running
10 * tasks
11 */
12SCHED_FEAT(START_DEBIT, true)
13
14/*
15 * Prefer to schedule the task we woke last (assuming it failed
16 * wakeup-preemption), since it's likely going to consume data we
17 * touched; this increases cache locality.
18 */
19SCHED_FEAT(NEXT_BUDDY, false)
20
21/*
22 * Prefer to schedule the task that ran last (when we did
23 * wake-preempt) as that will likely touch the same data; this increases
24 * cache locality.
25 */
26SCHED_FEAT(LAST_BUDDY, true)
27
28/*
29 * Consider buddies to be cache hot; this decreases the likelihood of a
30 * cache buddy being migrated away and increases cache locality.
31 */
32SCHED_FEAT(CACHE_HOT_BUDDY, true)
33
34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
40 * Use arch dependent cpu power functions
41 */
42SCHED_FEAT(ARCH_POWER, true)
43
44SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true)
47
48/*
49 * Spin-wait on mutex acquisition when the mutex owner is running on
50 * another cpu -- assumes that when the owner is running, it will soon
51 * release the lock. Decreases scheduling overhead.
52 */
53SCHED_FEAT(OWNER_SPIN, true)
54
55/*
56 * Decrement CPU power based on time not spent running tasks
57 */
58SCHED_FEAT(NONTASK_POWER, true)
59
60/*
61 * Queue remote wakeups on the target CPU and process them
62 * using the scheduler IPI. Reduces rq->lock contention/bounces.
63 */
64SCHED_FEAT(TTWU_QUEUE, true)
65
66SCHED_FEAT(FORCE_SD_OVERLAP, false)
67SCHED_FEAT(RT_RUNTIME_SHARE, true)
68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index b6baf370cae..00000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,98 +0,0 @@
1#include "sched.h"
2
3/*
4 * idle-task scheduling class.
5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched/fair.c)
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{
14 return task_cpu(p); /* IDLE tasks are never migrated */
15}
16#endif /* CONFIG_SMP */
17/*
18 * Idle tasks are unconditionally rescheduled:
19 */
20static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->idle);
23}
24
25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{
27 schedstat_inc(rq, sched_goidle);
28 return rq->idle;
29}
30
31/*
32 * It is not legal to sleep in the idle task - print a warning
33 * message if some code attempts to do it:
34 */
35static void
36dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
37{
38 raw_spin_unlock_irq(&rq->lock);
39 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
40 dump_stack();
41 raw_spin_lock_irq(&rq->lock);
42}
43
44static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
45{
46}
47
48static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
49{
50}
51
52static void set_curr_task_idle(struct rq *rq)
53{
54}
55
56static void switched_to_idle(struct rq *rq, struct task_struct *p)
57{
58 BUG();
59}
60
61static void
62prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
63{
64 BUG();
65}
66
67static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
68{
69 return 0;
70}
71
72/*
73 * Simple, special scheduling class for the per-CPU idle tasks:
74 */
75const struct sched_class idle_sched_class = {
76 /* .next is NULL */
77 /* no enqueue/yield_task for idle tasks */
78
79 /* dequeue is not valid, we print a debug message there: */
80 .dequeue_task = dequeue_task_idle,
81
82 .check_preempt_curr = check_preempt_curr_idle,
83
84 .pick_next_task = pick_next_task_idle,
85 .put_prev_task = put_prev_task_idle,
86
87#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle,
89#endif
90
91 .set_curr_task = set_curr_task_idle,
92 .task_tick = task_tick_idle,
93
94 .get_rr_interval = get_rr_interval_idle,
95
96 .prio_changed = prio_changed_idle,
97 .switched_to = switched_to_idle,
98};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
deleted file mode 100644
index 418feb01344..00000000000
--- a/kernel/sched/rt.c
+++ /dev/null
@@ -1,2094 +0,0 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
92
93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
94
95static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
96{
97#ifdef CONFIG_SCHED_DEBUG
98 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
99#endif
100 return container_of(rt_se, struct task_struct, rt);
101}
102
103static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
104{
105 return rt_rq->rq;
106}
107
108static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
109{
110 return rt_se->rt_rq;
111}
112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
198#else /* CONFIG_RT_GROUP_SCHED */
199
200#define rt_entity_is_task(rt_se) (1)
201
202static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
203{
204 return container_of(rt_se, struct task_struct, rt);
205}
206
207static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
208{
209 return container_of(rt_rq, struct rq, rt);
210}
211
212static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
213{
214 struct task_struct *p = rt_task_of(rt_se);
215 struct rq *rq = task_rq(p);
216
217 return &rq->rt;
218}
219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
226#endif /* CONFIG_RT_GROUP_SCHED */
227
228#ifdef CONFIG_SMP
229
230static inline int rt_overloaded(struct rq *rq)
231{
232 return atomic_read(&rq->rd->rto_count);
233}
234
235static inline void rt_set_overload(struct rq *rq)
236{
237 if (!rq->online)
238 return;
239
240 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
241 /*
242 * Make sure the mask is visible before we set
243 * the overload count. That is checked to determine
244 * if we should look at the mask. It would be a shame
245 * if we looked at the mask, but the mask was not
246 * updated yet.
247 */
248 wmb();
249 atomic_inc(&rq->rd->rto_count);
250}
251
252static inline void rt_clear_overload(struct rq *rq)
253{
254 if (!rq->online)
255 return;
256
257 /* the order here really doesn't matter */
258 atomic_dec(&rq->rd->rto_count);
259 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
260}
261
262static void update_rt_migration(struct rt_rq *rt_rq)
263{
264 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
265 if (!rt_rq->overloaded) {
266 rt_set_overload(rq_of_rt_rq(rt_rq));
267 rt_rq->overloaded = 1;
268 }
269 } else if (rt_rq->overloaded) {
270 rt_clear_overload(rq_of_rt_rq(rt_rq));
271 rt_rq->overloaded = 0;
272 }
273}
274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{
277 struct task_struct *p;
278
279 if (!rt_entity_is_task(rt_se))
280 return;
281
282 p = rt_task_of(rt_se);
283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
284
285 rt_rq->rt_nr_total++;
286 if (p->nr_cpus_allowed > 1)
287 rt_rq->rt_nr_migratory++;
288
289 update_rt_migration(rt_rq);
290}
291
292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
293{
294 struct task_struct *p;
295
296 if (!rt_entity_is_task(rt_se))
297 return;
298
299 p = rt_task_of(rt_se);
300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
301
302 rt_rq->rt_nr_total--;
303 if (p->nr_cpus_allowed > 1)
304 rt_rq->rt_nr_migratory--;
305
306 update_rt_migration(rt_rq);
307}
308
309static inline int has_pushable_tasks(struct rq *rq)
310{
311 return !plist_head_empty(&rq->rt.pushable_tasks);
312}
313
314static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
315{
316 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
317 plist_node_init(&p->pushable_tasks, p->prio);
318 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
319
320 /* Update the highest prio pushable task */
321 if (p->prio < rq->rt.highest_prio.next)
322 rq->rt.highest_prio.next = p->prio;
323}
324
325static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
326{
327 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
328
329 /* Update the new highest prio pushable task */
330 if (has_pushable_tasks(rq)) {
331 p = plist_first_entry(&rq->rt.pushable_tasks,
332 struct task_struct, pushable_tasks);
333 rq->rt.highest_prio.next = p->prio;
334 } else
335 rq->rt.highest_prio.next = MAX_RT_PRIO;
336}
337
338#else
339
340static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
341{
342}
343
344static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
345{
346}
347
348static inline
349void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
350{
351}
352
353static inline
354void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
355{
356}
357
358#endif /* CONFIG_SMP */
359
360static inline int on_rt_rq(struct sched_rt_entity *rt_se)
361{
362 return !list_empty(&rt_se->run_list);
363}
364
365#ifdef CONFIG_RT_GROUP_SCHED
366
367static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
368{
369 if (!rt_rq->tg)
370 return RUNTIME_INF;
371
372 return rt_rq->rt_runtime;
373}
374
375static inline u64 sched_rt_period(struct rt_rq *rt_rq)
376{
377 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
378}
379
380typedef struct task_group *rt_rq_iter_t;
381
382static inline struct task_group *next_task_group(struct task_group *tg)
383{
384 do {
385 tg = list_entry_rcu(tg->list.next,
386 typeof(struct task_group), list);
387 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
388
389 if (&tg->list == &task_groups)
390 tg = NULL;
391
392 return tg;
393}
394
395#define for_each_rt_rq(rt_rq, iter, rq) \
396 for (iter = container_of(&task_groups, typeof(*iter), list); \
397 (iter = next_task_group(iter)) && \
398 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
399
400static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
401{
402 list_add_rcu(&rt_rq->leaf_rt_rq_list,
403 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
404}
405
406static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
407{
408 list_del_rcu(&rt_rq->leaf_rt_rq_list);
409}
410
411#define for_each_leaf_rt_rq(rt_rq, rq) \
412 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
413
414#define for_each_sched_rt_entity(rt_se) \
415 for (; rt_se; rt_se = rt_se->parent)
416
417static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
418{
419 return rt_se->my_q;
420}
421
422static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
423static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
424
425static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
426{
427 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
428 struct sched_rt_entity *rt_se;
429
430 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
431
432 rt_se = rt_rq->tg->rt_se[cpu];
433
434 if (rt_rq->rt_nr_running) {
435 if (rt_se && !on_rt_rq(rt_se))
436 enqueue_rt_entity(rt_se, false);
437 if (rt_rq->highest_prio.curr < curr->prio)
438 resched_task(curr);
439 }
440}
441
442static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
443{
444 struct sched_rt_entity *rt_se;
445 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
446
447 rt_se = rt_rq->tg->rt_se[cpu];
448
449 if (rt_se && on_rt_rq(rt_se))
450 dequeue_rt_entity(rt_se);
451}
452
453static inline int rt_rq_throttled(struct rt_rq *rt_rq)
454{
455 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
456}
457
458static int rt_se_boosted(struct sched_rt_entity *rt_se)
459{
460 struct rt_rq *rt_rq = group_rt_rq(rt_se);
461 struct task_struct *p;
462
463 if (rt_rq)
464 return !!rt_rq->rt_nr_boosted;
465
466 p = rt_task_of(rt_se);
467 return p->prio != p->normal_prio;
468}
469
470#ifdef CONFIG_SMP
471static inline const struct cpumask *sched_rt_period_mask(void)
472{
473 return cpu_rq(smp_processor_id())->rd->span;
474}
475#else
476static inline const struct cpumask *sched_rt_period_mask(void)
477{
478 return cpu_online_mask;
479}
480#endif
481
482static inline
483struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
484{
485 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
486}
487
488static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
489{
490 return &rt_rq->tg->rt_bandwidth;
491}
492
493#else /* !CONFIG_RT_GROUP_SCHED */
494
495static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
496{
497 return rt_rq->rt_runtime;
498}
499
500static inline u64 sched_rt_period(struct rt_rq *rt_rq)
501{
502 return ktime_to_ns(def_rt_bandwidth.rt_period);
503}
504
505typedef struct rt_rq *rt_rq_iter_t;
506
507#define for_each_rt_rq(rt_rq, iter, rq) \
508 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
509
510static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
511{
512}
513
514static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
515{
516}
517
518#define for_each_leaf_rt_rq(rt_rq, rq) \
519 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
520
521#define for_each_sched_rt_entity(rt_se) \
522 for (; rt_se; rt_se = NULL)
523
524static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
525{
526 return NULL;
527}
528
529static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
530{
531 if (rt_rq->rt_nr_running)
532 resched_task(rq_of_rt_rq(rt_rq)->curr);
533}
534
535static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
536{
537}
538
539static inline int rt_rq_throttled(struct rt_rq *rt_rq)
540{
541 return rt_rq->rt_throttled;
542}
543
544static inline const struct cpumask *sched_rt_period_mask(void)
545{
546 return cpu_online_mask;
547}
548
549static inline
550struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
551{
552 return &cpu_rq(cpu)->rt;
553}
554
555static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
556{
557 return &def_rt_bandwidth;
558}
559
560#endif /* CONFIG_RT_GROUP_SCHED */
561
562#ifdef CONFIG_SMP
563/*
564 * We ran out of runtime, see if we can borrow some from our neighbours.
565 */
566static int do_balance_runtime(struct rt_rq *rt_rq)
567{
568 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
569 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
570 int i, weight, more = 0;
571 u64 rt_period;
572
573 weight = cpumask_weight(rd->span);
574
575 raw_spin_lock(&rt_b->rt_runtime_lock);
576 rt_period = ktime_to_ns(rt_b->rt_period);
577 for_each_cpu(i, rd->span) {
578 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
579 s64 diff;
580
581 if (iter == rt_rq)
582 continue;
583
584 raw_spin_lock(&iter->rt_runtime_lock);
585 /*
586 * Either all rqs have inf runtime and there's nothing to steal
587 * or __disable_runtime() below sets a specific rq to inf to
588 * indicate it's been disabled and disallow stealing.
589 */
590 if (iter->rt_runtime == RUNTIME_INF)
591 goto next;
592
593 /*
594 * From runqueues with spare time, take 1/n part of their
595 * spare time, but no more than our period.
596 */
597 diff = iter->rt_runtime - iter->rt_time;
598 if (diff > 0) {
599 diff = div_u64((u64)diff, weight);
600 if (rt_rq->rt_runtime + diff > rt_period)
601 diff = rt_period - rt_rq->rt_runtime;
602 iter->rt_runtime -= diff;
603 rt_rq->rt_runtime += diff;
604 more = 1;
605 if (rt_rq->rt_runtime == rt_period) {
606 raw_spin_unlock(&iter->rt_runtime_lock);
607 break;
608 }
609 }
610next:
611 raw_spin_unlock(&iter->rt_runtime_lock);
612 }
613 raw_spin_unlock(&rt_b->rt_runtime_lock);
614
615 return more;
616}
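
/*
 * Editor's note -- illustrative sketch, not part of this file: the
 * "take 1/n of a neighbour's spare time, capped at our period" step of
 * do_balance_runtime(), in isolation.  Parameter names are hypothetical;
 * the caller is assumed to have checked that 'spare' is positive.
 */
static inline s64 borrow_from_neighbour(s64 spare, int weight,
					u64 have, u64 rt_period)
{
	s64 diff = div_u64((u64)spare, weight);	/* 1/n of the spare time */

	if (have + diff > rt_period)		/* never exceed our own period */
		diff = rt_period - have;
	return diff;
}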
617
618/*
619 * Ensure this RQ takes back all the runtime it lent to its neighbours.
620 */
621static void __disable_runtime(struct rq *rq)
622{
623 struct root_domain *rd = rq->rd;
624 rt_rq_iter_t iter;
625 struct rt_rq *rt_rq;
626
627 if (unlikely(!scheduler_running))
628 return;
629
630 for_each_rt_rq(rt_rq, iter, rq) {
631 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
632 s64 want;
633 int i;
634
635 raw_spin_lock(&rt_b->rt_runtime_lock);
636 raw_spin_lock(&rt_rq->rt_runtime_lock);
637 /*
638 * Either we're all inf and nobody needs to borrow, or we're
639 * already disabled and thus have nothing to do, or we have
640 * exactly the right amount of runtime to take out.
641 */
642 if (rt_rq->rt_runtime == RUNTIME_INF ||
643 rt_rq->rt_runtime == rt_b->rt_runtime)
644 goto balanced;
645 raw_spin_unlock(&rt_rq->rt_runtime_lock);
646
647 /*
648 * Calculate the difference between what we started out with
649 * and what we currently have; that's the amount of runtime
650 * we lent out and now have to reclaim.
651 */
652 want = rt_b->rt_runtime - rt_rq->rt_runtime;
653
654 /*
655 * Greedy reclaim, take back as much as we can.
656 */
657 for_each_cpu(i, rd->span) {
658 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
659 s64 diff;
660
661 /*
662 * Can't reclaim from ourselves or disabled runqueues.
663 */
664 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
665 continue;
666
667 raw_spin_lock(&iter->rt_runtime_lock);
668 if (want > 0) {
669 diff = min_t(s64, iter->rt_runtime, want);
670 iter->rt_runtime -= diff;
671 want -= diff;
672 } else {
673 iter->rt_runtime -= want;
674 want -= want;
675 }
676 raw_spin_unlock(&iter->rt_runtime_lock);
677
678 if (!want)
679 break;
680 }
681
682 raw_spin_lock(&rt_rq->rt_runtime_lock);
683 /*
684 * We cannot be left wanting - that would mean some runtime
685 * leaked out of the system.
686 */
687 BUG_ON(want);
688balanced:
689 /*
690 * Disable all the borrow logic by pretending we have inf
691 * runtime - in which case borrowing doesn't make sense.
692 */
693 rt_rq->rt_runtime = RUNTIME_INF;
694 rt_rq->rt_throttled = 0;
695 raw_spin_unlock(&rt_rq->rt_runtime_lock);
696 raw_spin_unlock(&rt_b->rt_runtime_lock);
697 }
698}
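
/*
 * Editor's note -- illustrative, not part of this file: if this rt_rq
 * started the period with 950ms of runtime but currently holds only
 * 750ms, then want = 200ms; the greedy loop above might take, say, 150ms
 * from one neighbour and the remaining 50ms from the next, after which
 * want reaches zero and the BUG_ON() cannot trigger.
 */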
699
700static void disable_runtime(struct rq *rq)
701{
702 unsigned long flags;
703
704 raw_spin_lock_irqsave(&rq->lock, flags);
705 __disable_runtime(rq);
706 raw_spin_unlock_irqrestore(&rq->lock, flags);
707}
708
709static void __enable_runtime(struct rq *rq)
710{
711 rt_rq_iter_t iter;
712 struct rt_rq *rt_rq;
713
714 if (unlikely(!scheduler_running))
715 return;
716
717 /*
718 * Reset each runqueue's bandwidth settings
719 */
720 for_each_rt_rq(rt_rq, iter, rq) {
721 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
722
723 raw_spin_lock(&rt_b->rt_runtime_lock);
724 raw_spin_lock(&rt_rq->rt_runtime_lock);
725 rt_rq->rt_runtime = rt_b->rt_runtime;
726 rt_rq->rt_time = 0;
727 rt_rq->rt_throttled = 0;
728 raw_spin_unlock(&rt_rq->rt_runtime_lock);
729 raw_spin_unlock(&rt_b->rt_runtime_lock);
730 }
731}
732
733static void enable_runtime(struct rq *rq)
734{
735 unsigned long flags;
736
737 raw_spin_lock_irqsave(&rq->lock, flags);
738 __enable_runtime(rq);
739 raw_spin_unlock_irqrestore(&rq->lock, flags);
740}
741
742int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
743{
744 int cpu = (int)(long)hcpu;
745
746 switch (action) {
747 case CPU_DOWN_PREPARE:
748 case CPU_DOWN_PREPARE_FROZEN:
749 disable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 case CPU_DOWN_FAILED:
753 case CPU_DOWN_FAILED_FROZEN:
754 case CPU_ONLINE:
755 case CPU_ONLINE_FROZEN:
756 enable_runtime(cpu_rq(cpu));
757 return NOTIFY_OK;
758
759 default:
760 return NOTIFY_DONE;
761 }
762}
763
764static int balance_runtime(struct rt_rq *rt_rq)
765{
766 int more = 0;
767
768 if (!sched_feat(RT_RUNTIME_SHARE))
769 return more;
770
771 if (rt_rq->rt_time > rt_rq->rt_runtime) {
772 raw_spin_unlock(&rt_rq->rt_runtime_lock);
773 more = do_balance_runtime(rt_rq);
774 raw_spin_lock(&rt_rq->rt_runtime_lock);
775 }
776
777 return more;
778}
779#else /* !CONFIG_SMP */
780static inline int balance_runtime(struct rt_rq *rt_rq)
781{
782 return 0;
783}
784#endif /* CONFIG_SMP */
785
786static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
787{
788 int i, idle = 1, throttled = 0;
789 const struct cpumask *span;
790
791 span = sched_rt_period_mask();
792#ifdef CONFIG_RT_GROUP_SCHED
793 /*
794 * FIXME: isolated CPUs should really leave the root task group,
795 * whether they are isolcpus or were isolated via cpusets, lest
796 * the timer run on a CPU which does not service all runqueues,
797 * potentially leaving other CPUs indefinitely throttled. If
798 * isolation is really required, the user will turn the throttle
799 * off to kill the perturbations it causes anyway. Meanwhile,
800 * this maintains functionality for boot and/or troubleshooting.
801 */
802 if (rt_b == &root_task_group.rt_bandwidth)
803 span = cpu_online_mask;
804#endif
805 for_each_cpu(i, span) {
806 int enqueue = 0;
807 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
808 struct rq *rq = rq_of_rt_rq(rt_rq);
809
810 raw_spin_lock(&rq->lock);
811 if (rt_rq->rt_time) {
812 u64 runtime;
813
814 raw_spin_lock(&rt_rq->rt_runtime_lock);
815 if (rt_rq->rt_throttled)
816 balance_runtime(rt_rq);
817 runtime = rt_rq->rt_runtime;
818 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
819 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
820 rt_rq->rt_throttled = 0;
821 enqueue = 1;
822
823 /*
824 * Force a clock update if the CPU was idle,
825 * lest wakeup -> unthrottle time accumulate.
826 */
827 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
828 rq->skip_clock_update = -1;
829 }
830 if (rt_rq->rt_time || rt_rq->rt_nr_running)
831 idle = 0;
832 raw_spin_unlock(&rt_rq->rt_runtime_lock);
833 } else if (rt_rq->rt_nr_running) {
834 idle = 0;
835 if (!rt_rq_throttled(rt_rq))
836 enqueue = 1;
837 }
838 if (rt_rq->rt_throttled)
839 throttled = 1;
840
841 if (enqueue)
842 sched_rt_rq_enqueue(rt_rq);
843 raw_spin_unlock(&rq->lock);
844 }
845
846 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
847 return 1;
848
849 return idle;
850}
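
/*
 * Editor's note -- illustrative, not part of this file: each expiry of
 * the period timer above forgives up to overrun * runtime worth of
 * accumulated rt_time per rt_rq, so a throttled queue is unthrottled as
 * soon as its rt_time drops back below its (possibly re-balanced) runtime.
 */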
851
852static inline int rt_se_prio(struct sched_rt_entity *rt_se)
853{
854#ifdef CONFIG_RT_GROUP_SCHED
855 struct rt_rq *rt_rq = group_rt_rq(rt_se);
856
857 if (rt_rq)
858 return rt_rq->highest_prio.curr;
859#endif
860
861 return rt_task_of(rt_se)->prio;
862}
863
864static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
865{
866 u64 runtime = sched_rt_runtime(rt_rq);
867
868 if (rt_rq->rt_throttled)
869 return rt_rq_throttled(rt_rq);
870
871 if (runtime >= sched_rt_period(rt_rq))
872 return 0;
873
874 balance_runtime(rt_rq);
875 runtime = sched_rt_runtime(rt_rq);
876 if (runtime == RUNTIME_INF)
877 return 0;
878
879 if (rt_rq->rt_time > runtime) {
880 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
881
882 /*
883 * Don't actually throttle groups that have no runtime assigned
884 * but accrue some time due to boosting.
885 */
886 if (likely(rt_b->rt_runtime)) {
887 static bool once = false;
888
889 rt_rq->rt_throttled = 1;
890
891 if (!once) {
892 once = true;
893 printk_sched("sched: RT throttling activated\n");
894 }
895 } else {
896 /*
897 * In case we did anyway, make it go away;
898 * replenishment is a joke, since it will replenish us
899 * with exactly 0 ns.
900 */
901 rt_rq->rt_time = 0;
902 }
903
904 if (rt_rq_throttled(rt_rq)) {
905 sched_rt_rq_dequeue(rt_rq);
906 return 1;
907 }
908 }
909
910 return 0;
911}
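
/*
 * Editor's note -- illustrative, not part of this file: with the default
 * sched_rt_runtime_us = 950000 and sched_rt_period_us = 1000000, an
 * rt_rq that accumulates more than 950ms of rt_time within a 1s period
 * (and cannot borrow enough from its neighbours) is throttled here until
 * do_sched_rt_period_timer() replenishes it.
 */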
912
913/*
914 * Update the current task's runtime statistics. Skip current tasks that
915 * are not in our scheduling class.
916 */
917static void update_curr_rt(struct rq *rq)
918{
919 struct task_struct *curr = rq->curr;
920 struct sched_rt_entity *rt_se = &curr->rt;
921 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
922 u64 delta_exec;
923
924 if (curr->sched_class != &rt_sched_class)
925 return;
926
927 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0))
929 delta_exec = 0;
930
931 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec));
933
934 curr->se.sum_exec_runtime += delta_exec;
935 account_group_exec_runtime(curr, delta_exec);
936
937 curr->se.exec_start = rq->clock_task;
938 cpuacct_charge(curr, delta_exec);
939
940 sched_rt_avg_update(rq, delta_exec);
941
942 if (!rt_bandwidth_enabled())
943 return;
944
945 for_each_sched_rt_entity(rt_se) {
946 rt_rq = rt_rq_of_se(rt_se);
947
948 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
949 raw_spin_lock(&rt_rq->rt_runtime_lock);
950 rt_rq->rt_time += delta_exec;
951 if (sched_rt_runtime_exceeded(rt_rq))
952 resched_task(curr);
953 raw_spin_unlock(&rt_rq->rt_runtime_lock);
954 }
955 }
956}
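
/*
 * Editor's note -- illustrative sketch, not part of this file: the core
 * accounting step of update_curr_rt(), in isolation.  'now' stands for
 * rq->clock_task; the names below are hypothetical.
 */
static inline u64 charge_exec_time(u64 now, u64 *exec_start, u64 *sum_exec)
{
	s64 delta = now - *exec_start;

	if (delta < 0)		/* clock went backwards: charge nothing */
		delta = 0;
	*sum_exec += delta;	/* accumulate the runtime consumed */
	*exec_start = now;	/* the next slice starts here */
	return delta;
}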
957
958#if defined CONFIG_SMP
959
960static void
961inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
962{
963 struct rq *rq = rq_of_rt_rq(rt_rq);
964
965 if (rq->online && prio < prev_prio)
966 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
967}
968
969static void
970dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
971{
972 struct rq *rq = rq_of_rt_rq(rt_rq);
973
974 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
975 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
976}
977
978#else /* CONFIG_SMP */
979
980static inline
981void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
982static inline
983void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
984
985#endif /* CONFIG_SMP */
986
987#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
988static void
989inc_rt_prio(struct rt_rq *rt_rq, int prio)
990{
991 int prev_prio = rt_rq->highest_prio.curr;
992
993 if (prio < prev_prio)
994 rt_rq->highest_prio.curr = prio;
995
996 inc_rt_prio_smp(rt_rq, prio, prev_prio);
997}
998
999static void
1000dec_rt_prio(struct rt_rq *rt_rq, int prio)
1001{
1002 int prev_prio = rt_rq->highest_prio.curr;
1003
1004 if (rt_rq->rt_nr_running) {
1005
1006 WARN_ON(prio < prev_prio);
1007
1008 /*
1009 * This may have been our highest task, and therefore
1010 * we may have some recomputation to do
1011 */
1012 if (prio == prev_prio) {
1013 struct rt_prio_array *array = &rt_rq->active;
1014
1015 rt_rq->highest_prio.curr =
1016 sched_find_first_bit(array->bitmap);
1017 }
1018
1019 } else
1020 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1021
1022 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1023}
1024
1025#else
1026
1027static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1028static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1029
1030#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1031
1032#ifdef CONFIG_RT_GROUP_SCHED
1033
1034static void
1035inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1036{
1037 if (rt_se_boosted(rt_se))
1038 rt_rq->rt_nr_boosted++;
1039
1040 if (rt_rq->tg)
1041 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1042}
1043
1044static void
1045dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1046{
1047 if (rt_se_boosted(rt_se))
1048 rt_rq->rt_nr_boosted--;
1049
1050 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1051}
1052
1053#else /* CONFIG_RT_GROUP_SCHED */
1054
1055static void
1056inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1057{
1058 start_rt_bandwidth(&def_rt_bandwidth);
1059}
1060
1061static inline
1062void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1063
1064#endif /* CONFIG_RT_GROUP_SCHED */
1065
1066static inline
1067void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1068{
1069 int prio = rt_se_prio(rt_se);
1070
1071 WARN_ON(!rt_prio(prio));
1072 rt_rq->rt_nr_running++;
1073
1074 inc_rt_prio(rt_rq, prio);
1075 inc_rt_migration(rt_se, rt_rq);
1076 inc_rt_group(rt_se, rt_rq);
1077}
1078
1079static inline
1080void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1081{
1082 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1083 WARN_ON(!rt_rq->rt_nr_running);
1084 rt_rq->rt_nr_running--;
1085
1086 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1087 dec_rt_migration(rt_se, rt_rq);
1088 dec_rt_group(rt_se, rt_rq);
1089}
1090
1091static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1092{
1093 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1094 struct rt_prio_array *array = &rt_rq->active;
1095 struct rt_rq *group_rq = group_rt_rq(rt_se);
1096 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1097
1098 /*
1099 * Don't enqueue the group if it's throttled, or when empty.
1100 * The latter is a consequence of the former when a child group
1101 * gets throttled and the current group doesn't have any other
1102 * active members.
1103 */
1104 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1105 return;
1106
1107 if (!rt_rq->rt_nr_running)
1108 list_add_leaf_rt_rq(rt_rq);
1109
1110 if (head)
1111 list_add(&rt_se->run_list, queue);
1112 else
1113 list_add_tail(&rt_se->run_list, queue);
1114 __set_bit(rt_se_prio(rt_se), array->bitmap);
1115
1116 inc_rt_tasks(rt_se, rt_rq);
1117}
1118
1119static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1120{
1121 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1122 struct rt_prio_array *array = &rt_rq->active;
1123
1124 list_del_init(&rt_se->run_list);
1125 if (list_empty(array->queue + rt_se_prio(rt_se)))
1126 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1127
1128 dec_rt_tasks(rt_se, rt_rq);
1129 if (!rt_rq->rt_nr_running)
1130 list_del_leaf_rt_rq(rt_rq);
1131}
1132
1133/*
1134 * Because the prio of an upper entry depends on the lower
1135 * entries, we must remove entries top - down.
1136 */
1137static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1138{
1139 struct sched_rt_entity *back = NULL;
1140
1141 for_each_sched_rt_entity(rt_se) {
1142 rt_se->back = back;
1143 back = rt_se;
1144 }
1145
1146 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1147 if (on_rt_rq(rt_se))
1148 __dequeue_rt_entity(rt_se);
1149 }
1150}
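
/*
 * Editor's note -- illustrative, not part of this file: for a task inside
 * a single group, the first loop above records task_se->back = NULL and
 * group_se->back = task_se, so the second loop dequeues group_se first
 * and task_se second -- i.e. strictly top-down, as the comment requires.
 */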
1151
1152static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1153{
1154 dequeue_rt_stack(rt_se);
1155 for_each_sched_rt_entity(rt_se)
1156 __enqueue_rt_entity(rt_se, head);
1157}
1158
1159static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1160{
1161 dequeue_rt_stack(rt_se);
1162
1163 for_each_sched_rt_entity(rt_se) {
1164 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1165
1166 if (rt_rq && rt_rq->rt_nr_running)
1167 __enqueue_rt_entity(rt_se, false);
1168 }
1169}
1170
1171/*
1172 * Adding/removing a task to/from a priority array:
1173 */
1174static void
1175enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1176{
1177 struct sched_rt_entity *rt_se = &p->rt;
1178
1179 if (flags & ENQUEUE_WAKEUP)
1180 rt_se->timeout = 0;
1181
1182 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1183
1184 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1185 enqueue_pushable_task(rq, p);
1186
1187 inc_nr_running(rq);
1188}
1189
1190static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1191{
1192 struct sched_rt_entity *rt_se = &p->rt;
1193
1194 update_curr_rt(rq);
1195 dequeue_rt_entity(rt_se);
1196
1197 dequeue_pushable_task(rq, p);
1198
1199 dec_nr_running(rq);
1200}
1201
1202/*
1203 * Put a task at the head or the tail of the run list without the overhead of
1204 * dequeue followed by enqueue.
1205 */
1206static void
1207requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1208{
1209 if (on_rt_rq(rt_se)) {
1210 struct rt_prio_array *array = &rt_rq->active;
1211 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1212
1213 if (head)
1214 list_move(&rt_se->run_list, queue);
1215 else
1216 list_move_tail(&rt_se->run_list, queue);
1217 }
1218}
1219
1220static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1221{
1222 struct sched_rt_entity *rt_se = &p->rt;
1223 struct rt_rq *rt_rq;
1224
1225 for_each_sched_rt_entity(rt_se) {
1226 rt_rq = rt_rq_of_se(rt_se);
1227 requeue_rt_entity(rt_rq, rt_se, head);
1228 }
1229}
1230
1231static void yield_task_rt(struct rq *rq)
1232{
1233 requeue_task_rt(rq, rq->curr, 0);
1234}
1235
1236#ifdef CONFIG_SMP
1237static int find_lowest_rq(struct task_struct *task);
1238
1239static int
1240select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1241{
1242 struct task_struct *curr;
1243 struct rq *rq;
1244 int cpu;
1245
1246 cpu = task_cpu(p);
1247
1248 if (p->nr_cpus_allowed == 1)
1249 goto out;
1250
1251 /* For anything but wake ups, just return the task_cpu */
1252 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1253 goto out;
1254
1255 rq = cpu_rq(cpu);
1256
1257 rcu_read_lock();
1258 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1259
1260 /*
1261 * If the current task on @p's runqueue is an RT task, then
1262 * try to see if we can wake this RT task up on another
1263 * runqueue. Otherwise simply start this RT task
1264 * on its current runqueue.
1265 *
1266 * We want to avoid overloading runqueues. If the woken
1267 * task is of higher priority, then it will stay on this CPU
1268 * and the lower prio task should be moved to another CPU.
1269 * Even though this will probably make the lower prio task
1270 * lose its cache, we do not want to bounce a higher task
1271 * around just because it gave up its CPU, perhaps for a
1272 * lock?
1273 *
1274 * For equal prio tasks, we just let the scheduler sort it out.
1275 *
1276 * Otherwise, just let it ride on the affined RQ and the
1277 * post-schedule router will push the preempted task away
1278 *
1279 * This test is optimistic, if we get it wrong the load-balancer
1280 * will have to sort it out.
1281 */
1282 if (curr && unlikely(rt_task(curr)) &&
1283 (curr->nr_cpus_allowed < 2 ||
1284 curr->prio <= p->prio) &&
1285 (p->nr_cpus_allowed > 1)) {
1286 int target = find_lowest_rq(p);
1287
1288 if (target != -1)
1289 cpu = target;
1290 }
1291 rcu_read_unlock();
1292
1293out:
1294 return cpu;
1295}
1296
1297static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1298{
1299 if (rq->curr->nr_cpus_allowed == 1)
1300 return;
1301
1302 if (p->nr_cpus_allowed != 1
1303 && cpupri_find(&rq->rd->cpupri, p, NULL))
1304 return;
1305
1306 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1307 return;
1308
1309 /*
1310 * There appear to be other CPUs that can accept
1311 * current and none to run 'p', so let's reschedule
1312 * to try and push current away:
1313 */
1314 requeue_task_rt(rq, p, 1);
1315 resched_task(rq->curr);
1316}
1317
1318#endif /* CONFIG_SMP */
1319
1320/*
1321 * Preempt the current task with a newly woken task if needed:
1322 */
1323static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1324{
1325 if (p->prio < rq->curr->prio) {
1326 resched_task(rq->curr);
1327 return;
1328 }
1329
1330#ifdef CONFIG_SMP
1331 /*
1332 * If:
1333 *
1334 * - the newly woken task is of equal priority to the current task
1335 * - the newly woken task is non-migratable while current is migratable
1336 * - current will be preempted on the next reschedule
1337 *
1338 * we should check to see if current can readily move to a different
1339 * cpu. If so, we will reschedule to allow the push logic to try
1340 * to move current somewhere else, making room for our non-migratable
1341 * task.
1342 */
1343 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1344 check_preempt_equal_prio(rq, p);
1345#endif
1346}
1347
1348static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1349 struct rt_rq *rt_rq)
1350{
1351 struct rt_prio_array *array = &rt_rq->active;
1352 struct sched_rt_entity *next = NULL;
1353 struct list_head *queue;
1354 int idx;
1355
1356 idx = sched_find_first_bit(array->bitmap);
1357 BUG_ON(idx >= MAX_RT_PRIO);
1358
1359 queue = array->queue + idx;
1360 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1361
1362 return next;
1363}
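
/*
 * Editor's note -- illustrative, not part of this file: array->bitmap has
 * one bit per RT priority plus the delimiter bit set in init_rt_rq(), so
 * sched_find_first_bit() always terminates and returns the numerically
 * lowest -- i.e. highest-priority -- non-empty queue; the
 * BUG_ON(idx >= MAX_RT_PRIO) can only trigger if the bitmap is empty
 * even though rt_nr_running said otherwise.
 */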
1364
1365static struct task_struct *_pick_next_task_rt(struct rq *rq)
1366{
1367 struct sched_rt_entity *rt_se;
1368 struct task_struct *p;
1369 struct rt_rq *rt_rq;
1370
1371 rt_rq = &rq->rt;
1372
1373 if (!rt_rq->rt_nr_running)
1374 return NULL;
1375
1376 if (rt_rq_throttled(rt_rq))
1377 return NULL;
1378
1379 do {
1380 rt_se = pick_next_rt_entity(rq, rt_rq);
1381 BUG_ON(!rt_se);
1382 rt_rq = group_rt_rq(rt_se);
1383 } while (rt_rq);
1384
1385 p = rt_task_of(rt_se);
1386 p->se.exec_start = rq->clock_task;
1387
1388 return p;
1389}
1390
1391static struct task_struct *pick_next_task_rt(struct rq *rq)
1392{
1393 struct task_struct *p = _pick_next_task_rt(rq);
1394
1395 /* The running task is never eligible for pushing */
1396 if (p)
1397 dequeue_pushable_task(rq, p);
1398
1399#ifdef CONFIG_SMP
1400 /*
1401 * We detect this state here so that we can avoid taking the RQ
1402 * lock again later if there is no need to push
1403 */
1404 rq->post_schedule = has_pushable_tasks(rq);
1405#endif
1406
1407 return p;
1408}
1409
1410static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1411{
1412 update_curr_rt(rq);
1413
1414 /*
1415 * The previous task needs to be made eligible for pushing
1416 * if it is still active
1417 */
1418 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1419 enqueue_pushable_task(rq, p);
1420}
1421
1422#ifdef CONFIG_SMP
1423
1424/* Only try algorithms three times */
1425#define RT_MAX_TRIES 3
1426
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{
1429 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1431 (p->nr_cpus_allowed > 1))
1432 return 1;
1433 return 0;
1434}
1435
1436/* Return the second highest RT task, NULL otherwise */
1437static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1438{
1439 struct task_struct *next = NULL;
1440 struct sched_rt_entity *rt_se;
1441 struct rt_prio_array *array;
1442 struct rt_rq *rt_rq;
1443 int idx;
1444
1445 for_each_leaf_rt_rq(rt_rq, rq) {
1446 array = &rt_rq->active;
1447 idx = sched_find_first_bit(array->bitmap);
1448next_idx:
1449 if (idx >= MAX_RT_PRIO)
1450 continue;
1451 if (next && next->prio <= idx)
1452 continue;
1453 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1454 struct task_struct *p;
1455
1456 if (!rt_entity_is_task(rt_se))
1457 continue;
1458
1459 p = rt_task_of(rt_se);
1460 if (pick_rt_task(rq, p, cpu)) {
1461 next = p;
1462 break;
1463 }
1464 }
1465 if (!next) {
1466 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1467 goto next_idx;
1468 }
1469 }
1470
1471 return next;
1472}
1473
1474static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1475
1476static int find_lowest_rq(struct task_struct *task)
1477{
1478 struct sched_domain *sd;
1479 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
1480 int this_cpu = smp_processor_id();
1481 int cpu = task_cpu(task);
1482
1483 /* Make sure the mask is initialized first */
1484 if (unlikely(!lowest_mask))
1485 return -1;
1486
1487 if (task->nr_cpus_allowed == 1)
1488 return -1; /* No other targets possible */
1489
1490 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1491 return -1; /* No targets found */
1492
1493 /*
1494 * At this point we have built a mask of cpus representing the
1495 * lowest priority tasks in the system. Now we want to elect
1496 * the best one based on our affinity and topology.
1497 *
1498 * We prioritize the last cpu that the task executed on since
1499 * it is most likely cache-hot in that location.
1500 */
1501 if (cpumask_test_cpu(cpu, lowest_mask))
1502 return cpu;
1503
1504 /*
1505 * Otherwise, we consult the sched_domains span maps to figure
1506 * out which cpu is logically closest to our hot cache data.
1507 */
1508 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1509 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1510
1511 rcu_read_lock();
1512 for_each_domain(cpu, sd) {
1513 if (sd->flags & SD_WAKE_AFFINE) {
1514 int best_cpu;
1515
1516 /*
1517 * "this_cpu" is cheaper to preempt than a
1518 * remote processor.
1519 */
1520 if (this_cpu != -1 &&
1521 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1522 rcu_read_unlock();
1523 return this_cpu;
1524 }
1525
1526 best_cpu = cpumask_first_and(lowest_mask,
1527 sched_domain_span(sd));
1528 if (best_cpu < nr_cpu_ids) {
1529 rcu_read_unlock();
1530 return best_cpu;
1531 }
1532 }
1533 }
1534 rcu_read_unlock();
1535
1536 /*
1537 * And finally, if there were no matches within the domains
1538 * just give the caller *something* to work with from the compatible
1539 * locations.
1540 */
1541 if (this_cpu != -1)
1542 return this_cpu;
1543
1544 cpu = cpumask_any(lowest_mask);
1545 if (cpu < nr_cpu_ids)
1546 return cpu;
1547 return -1;
1548}
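
/*
 * Editor's note -- illustrative, not part of this file: the preference
 * order implemented above is (1) the task's last CPU if it is in
 * lowest_mask, (2) this_cpu when it is in the mask and shares an
 * SD_WAKE_AFFINE domain with the task's CPU, (3) the first mask CPU
 * inside such a domain, (4) this_cpu if it is in the mask, (5) any CPU
 * in lowest_mask, and finally (6) -1 when the mask is empty.
 */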
1549
1550/* Will lock the rq it finds */
1551static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1552{
1553 struct rq *lowest_rq = NULL;
1554 int tries;
1555 int cpu;
1556
1557 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1558 cpu = find_lowest_rq(task);
1559
1560 if ((cpu == -1) || (cpu == rq->cpu))
1561 break;
1562
1563 lowest_rq = cpu_rq(cpu);
1564
1565 /* if the prio of this runqueue changed, try again */
1566 if (double_lock_balance(rq, lowest_rq)) {
1567 /*
1568 * We had to unlock the run queue. In
1569 * the meantime, the task could have
1570 * migrated already or had its affinity changed.
1571 * Also make sure that it wasn't scheduled on its rq.
1572 */
1573 if (unlikely(task_rq(task) != rq ||
1574 !cpumask_test_cpu(lowest_rq->cpu,
1575 tsk_cpus_allowed(task)) ||
1576 task_running(rq, task) ||
1577 !task->on_rq)) {
1578
1579 double_unlock_balance(rq, lowest_rq);
1580 lowest_rq = NULL;
1581 break;
1582 }
1583 }
1584
1585 /* If this rq is still suitable use it. */
1586 if (lowest_rq->rt.highest_prio.curr > task->prio)
1587 break;
1588
1589 /* try again */
1590 double_unlock_balance(rq, lowest_rq);
1591 lowest_rq = NULL;
1592 }
1593
1594 return lowest_rq;
1595}
1596
1597static struct task_struct *pick_next_pushable_task(struct rq *rq)
1598{
1599 struct task_struct *p;
1600
1601 if (!has_pushable_tasks(rq))
1602 return NULL;
1603
1604 p = plist_first_entry(&rq->rt.pushable_tasks,
1605 struct task_struct, pushable_tasks);
1606
1607 BUG_ON(rq->cpu != task_cpu(p));
1608 BUG_ON(task_current(rq, p));
1609 BUG_ON(p->nr_cpus_allowed <= 1);
1610
1611 BUG_ON(!p->on_rq);
1612 BUG_ON(!rt_task(p));
1613
1614 return p;
1615}
1616
1617/*
1618 * If the current CPU has more than one RT task, see if the non
1619 * running task can migrate over to a CPU that is running a task
1620 * of lesser priority.
1621 */
1622static int push_rt_task(struct rq *rq)
1623{
1624 struct task_struct *next_task;
1625 struct rq *lowest_rq;
1626 int ret = 0;
1627
1628 if (!rq->rt.overloaded)
1629 return 0;
1630
1631 next_task = pick_next_pushable_task(rq);
1632 if (!next_task)
1633 return 0;
1634
1635retry:
1636 if (unlikely(next_task == rq->curr)) {
1637 WARN_ON(1);
1638 return 0;
1639 }
1640
1641 /*
1642 * It's possible that next_task slipped in with a
1643 * higher priority than current. If that's the case,
1644 * just reschedule current.
1645 */
1646 if (unlikely(next_task->prio < rq->curr->prio)) {
1647 resched_task(rq->curr);
1648 return 0;
1649 }
1650
1651 /* We might release rq lock */
1652 get_task_struct(next_task);
1653
1654 /* find_lock_lowest_rq locks the rq if found */
1655 lowest_rq = find_lock_lowest_rq(next_task, rq);
1656 if (!lowest_rq) {
1657 struct task_struct *task;
1658 /*
1659 * find_lock_lowest_rq releases rq->lock
1660 * so it is possible that next_task has migrated.
1661 *
1662 * We need to make sure that the task is still on the same
1663 * run-queue and is also still the next task eligible for
1664 * pushing.
1665 */
1666 task = pick_next_pushable_task(rq);
1667 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1668 /*
1669 * The task hasn't migrated, and is still the next
1670 * eligible task, but we failed to find a run-queue
1671 * to push it to. Do not retry in this case, since
1672 * other cpus will pull from us when ready.
1673 */
1674 goto out;
1675 }
1676
1677 if (!task)
1678 /* No more tasks, just exit */
1679 goto out;
1680
1681 /*
1682 * Something has shifted, try again.
1683 */
1684 put_task_struct(next_task);
1685 next_task = task;
1686 goto retry;
1687 }
1688
1689 deactivate_task(rq, next_task, 0);
1690 set_task_cpu(next_task, lowest_rq->cpu);
1691 activate_task(lowest_rq, next_task, 0);
1692 ret = 1;
1693
1694 resched_task(lowest_rq->curr);
1695
1696 double_unlock_balance(rq, lowest_rq);
1697
1698out:
1699 put_task_struct(next_task);
1700
1701 return ret;
1702}
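/*
 * Illustrative scenario for the push path above (a sketch with
 * hypothetical CPUs and priorities, not taken from the original file):
 * CPU0 runs an RT task at prio 10 and has a second RT task queued at
 * prio 20 (lower priority), while CPU1 only runs a CFS task. CPU0 is
 * therefore overloaded: pick_next_pushable_task() returns the prio-20
 * task, find_lock_lowest_rq() finds and locks CPU1's runqueue, and the
 * task is deactivated on CPU0, moved with set_task_cpu(), activated on
 * CPU1, and CPU1's current task is rescheduled.
 */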
1703
1704static void push_rt_tasks(struct rq *rq)
1705{
1706	/* push_rt_task() will return true if it moved an RT task */
1707 while (push_rt_task(rq))
1708 ;
1709}
1710
1711static int pull_rt_task(struct rq *this_rq)
1712{
1713 int this_cpu = this_rq->cpu, ret = 0, cpu;
1714 struct task_struct *p;
1715 struct rq *src_rq;
1716
1717 if (likely(!rt_overloaded(this_rq)))
1718 return 0;
1719
1720 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1721 if (this_cpu == cpu)
1722 continue;
1723
1724 src_rq = cpu_rq(cpu);
1725
1726 /*
1727 * Don't bother taking the src_rq->lock if the next highest
1728 * task is known to be lower-priority than our current task.
1729 * This may look racy, but if this value is about to go
1730 * logically higher, the src_rq will push this task away.
1731		 * And if it's going logically lower, we do not care.
1732 */
1733 if (src_rq->rt.highest_prio.next >=
1734 this_rq->rt.highest_prio.curr)
1735 continue;
1736
1737 /*
1738 * We can potentially drop this_rq's lock in
1739 * double_lock_balance, and another CPU could
1740 * alter this_rq
1741 */
1742 double_lock_balance(this_rq, src_rq);
1743
1744 /*
1745 * Are there still pullable RT tasks?
1746 */
1747 if (src_rq->rt.rt_nr_running <= 1)
1748 goto skip;
1749
1750 p = pick_next_highest_task_rt(src_rq, this_cpu);
1751
1752 /*
1753 * Do we have an RT task that preempts
1754 * the to-be-scheduled task?
1755 */
1756 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1757 WARN_ON(p == src_rq->curr);
1758 WARN_ON(!p->on_rq);
1759
1760 /*
1761 * There's a chance that p is higher in priority
1762 * than what's currently running on its cpu.
1763			 * This can happen when p is just waking up and hasn't
1764			 * had a chance to schedule yet. We only pull
1765			 * p if it is lower in priority than the
1766			 * current task on its run queue.
1767 */
1768 if (p->prio < src_rq->curr->prio)
1769 goto skip;
1770
1771 ret = 1;
1772
1773 deactivate_task(src_rq, p, 0);
1774 set_task_cpu(p, this_cpu);
1775 activate_task(this_rq, p, 0);
1776 /*
1777 * We continue with the search, just in
1778 * case there's an even higher prio task
1779 * in another runqueue. (low likelihood
1780 * but possible)
1781 */
1782 }
1783skip:
1784 double_unlock_balance(this_rq, src_rq);
1785 }
1786
1787 return ret;
1788}
1789
1790static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1791{
1792 /* Try to pull RT tasks here if we lower this rq's prio */
1793 if (rq->rt.highest_prio.curr > prev->prio)
1794 pull_rt_task(rq);
1795}
1796
1797static void post_schedule_rt(struct rq *rq)
1798{
1799 push_rt_tasks(rq);
1800}
1801
1802/*
1803 * If we are not running and we are not going to reschedule soon, we should
1804 * try to push tasks away now
1805 */
1806static void task_woken_rt(struct rq *rq, struct task_struct *p)
1807{
1808 if (!task_running(rq, p) &&
1809 !test_tsk_need_resched(rq->curr) &&
1810 has_pushable_tasks(rq) &&
1811 p->nr_cpus_allowed > 1 &&
1812 rt_task(rq->curr) &&
1813 (rq->curr->nr_cpus_allowed < 2 ||
1814 rq->curr->prio <= p->prio))
1815 push_rt_tasks(rq);
1816}
1817
1818static void set_cpus_allowed_rt(struct task_struct *p,
1819 const struct cpumask *new_mask)
1820{
1821 struct rq *rq;
1822 int weight;
1823
1824 BUG_ON(!rt_task(p));
1825
1826 if (!p->on_rq)
1827 return;
1828
1829 weight = cpumask_weight(new_mask);
1830
1831 /*
1832	 * Only update if the migratability of the process changes,
1833	 * i.e. whether it can run on more than one CPU.
1834 */
1835 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1836 return;
1837
1838 rq = task_rq(p);
1839
1840 /*
1841 * The process used to be able to migrate OR it can now migrate
1842 */
1843 if (weight <= 1) {
1844 if (!task_current(rq, p))
1845 dequeue_pushable_task(rq, p);
1846 BUG_ON(!rq->rt.rt_nr_migratory);
1847 rq->rt.rt_nr_migratory--;
1848 } else {
1849 if (!task_current(rq, p))
1850 enqueue_pushable_task(rq, p);
1851 rq->rt.rt_nr_migratory++;
1852 }
1853
1854 update_rt_migration(&rq->rt);
1855}
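/*
 * Worked example for the bookkeeping above (hypothetical masks): an RT
 * task affined to 4 CPUs that gets restricted to a single CPU goes from
 * "can migrate" (nr_cpus_allowed > 1) to "cannot migrate" (weight <= 1),
 * so rt_nr_migratory is decremented and, unless the task is currently
 * running, it is removed from the pushable list; widening the mask
 * again reverses both steps.
 */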
1856
1857/* Assumes rq->lock is held */
1858static void rq_online_rt(struct rq *rq)
1859{
1860 if (rq->rt.overloaded)
1861 rt_set_overload(rq);
1862
1863 __enable_runtime(rq);
1864
1865 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1866}
1867
1868/* Assumes rq->lock is held */
1869static void rq_offline_rt(struct rq *rq)
1870{
1871 if (rq->rt.overloaded)
1872 rt_clear_overload(rq);
1873
1874 __disable_runtime(rq);
1875
1876 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1877}
1878
1879/*
1880 * When switching away from the rt queue, we bring ourselves to a
1881 * position where we might want to pull RT tasks from other runqueues.
1882 */
1883static void switched_from_rt(struct rq *rq, struct task_struct *p)
1884{
1885 /*
1886 * If there are other RT tasks then we will reschedule
1887 * and the scheduling of the other RT tasks will handle
1888 * the balancing. But if we are the last RT task
1889 * we may need to handle the pulling of RT tasks
1890 * now.
1891 */
1892 if (p->on_rq && !rq->rt.rt_nr_running)
1893 pull_rt_task(rq);
1894}
1895
1896void init_sched_rt_class(void)
1897{
1898 unsigned int i;
1899
1900 for_each_possible_cpu(i) {
1901 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1902 GFP_KERNEL, cpu_to_node(i));
1903 }
1904}
1905#endif /* CONFIG_SMP */
1906
1907/*
1908 * When switching a task to RT, we may overload the runqueue
1909 * with RT tasks. In this case we try to push them off to
1910 * other runqueues.
1911 */
1912static void switched_to_rt(struct rq *rq, struct task_struct *p)
1913{
1914 int check_resched = 1;
1915
1916 /*
1917 * If we are already running, then there's nothing
1918 * that needs to be done. But if we are not running
1919 * we may need to preempt the current running task.
1920 * If that current running task is also an RT task
1921 * then see if we can move to another run queue.
1922 */
1923 if (p->on_rq && rq->curr != p) {
1924#ifdef CONFIG_SMP
1925 if (rq->rt.overloaded && push_rt_task(rq) &&
1926 /* Don't resched if we changed runqueues */
1927 rq != task_rq(p))
1928 check_resched = 0;
1929#endif /* CONFIG_SMP */
1930 if (check_resched && p->prio < rq->curr->prio)
1931 resched_task(rq->curr);
1932 }
1933}
1934
1935/*
1936 * Priority of the task has changed. This may cause
1937 * us to initiate a push or pull.
1938 */
1939static void
1940prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1941{
1942 if (!p->on_rq)
1943 return;
1944
1945 if (rq->curr == p) {
1946#ifdef CONFIG_SMP
1947 /*
1948 * If our priority decreases while running, we
1949 * may need to pull tasks to this runqueue.
1950 */
1951 if (oldprio < p->prio)
1952 pull_rt_task(rq);
1953 /*
1954 * If there's a higher priority task waiting to run
1955 * then reschedule. Note, the above pull_rt_task
1956 * can release the rq lock and p could migrate.
1957 * Only reschedule if p is still on the same runqueue.
1958 */
1959 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1960 resched_task(p);
1961#else
1962 /* For UP simply resched on drop of prio */
1963 if (oldprio < p->prio)
1964 resched_task(p);
1965#endif /* CONFIG_SMP */
1966 } else {
1967 /*
1968		 * This task is not running, but if its priority is
1969		 * higher than that of the currently running task,
1970		 * then reschedule.
1971 */
1972 if (p->prio < rq->curr->prio)
1973 resched_task(rq->curr);
1974 }
1975}
1976
1977static void watchdog(struct rq *rq, struct task_struct *p)
1978{
1979 unsigned long soft, hard;
1980
1981	/* max may change after cur was read; this will be fixed up next tick */
1982 soft = task_rlimit(p, RLIMIT_RTTIME);
1983 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1984
1985 if (soft != RLIM_INFINITY) {
1986 unsigned long next;
1987
1988 p->rt.timeout++;
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1992 }
1993}
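/*
 * Worked example of the tick conversion above (an illustrative sketch;
 * assumes HZ == 1000 and a hypothetical RLIMIT_RTTIME soft limit of
 * 950000 us with hard >= soft):
 *
 *	next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC / HZ)
 *	     = DIV_ROUND_UP(950000, 1000)
 *	     = 950 ticks
 *
 * so once p->rt.timeout exceeds 950 scheduler ticks, sched_exp is
 * armed and the posix-cpu-timer path enforces the limit.
 */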
1994
1995static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996{
1997 struct sched_rt_entity *rt_se = &p->rt;
1998
1999 update_curr_rt(rq);
2000
2001 watchdog(rq, p);
2002
2003 /*
2004 * RR tasks need a special form of timeslice management.
2005 * FIFO tasks have no timeslices.
2006 */
2007 if (p->policy != SCHED_RR)
2008 return;
2009
2010 if (--p->rt.time_slice)
2011 return;
2012
2013 p->rt.time_slice = RR_TIMESLICE;
2014
2015 /*
2016	 * Requeue to the end of the queue if we (and all of our ancestors)
2017	 * are not the only element on the queue.
2018 */
2019 for_each_sched_rt_entity(rt_se) {
2020 if (rt_se->run_list.prev != rt_se->run_list.next) {
2021 requeue_task_rt(rq, p, 0);
2022 set_tsk_need_resched(p);
2023 return;
2024 }
2025 }
2026}
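/*
 * Illustrative timing for the round-robin handling above (a sketch;
 * assumes RR_TIMESLICE is the usual 100 ms worth of ticks): at HZ == 250
 * that is 25 ticks, so a SCHED_RR task that keeps running is moved to
 * the tail of its priority queue roughly every 100 ms (when it shares
 * that queue with other tasks), while SCHED_FIFO tasks return early
 * above and never expire.
 */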
2027
2028static void set_curr_task_rt(struct rq *rq)
2029{
2030 struct task_struct *p = rq->curr;
2031
2032 p->se.exec_start = rq->clock_task;
2033
2034 /* The running task is never eligible for pushing */
2035 dequeue_pushable_task(rq, p);
2036}
2037
2038static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2039{
2040 /*
2041 * Time slice is 0 for SCHED_FIFO tasks
2042 */
2043 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE;
2045 else
2046 return 0;
2047}
2048
2049const struct sched_class rt_sched_class = {
2050 .next = &fair_sched_class,
2051 .enqueue_task = enqueue_task_rt,
2052 .dequeue_task = dequeue_task_rt,
2053 .yield_task = yield_task_rt,
2054
2055 .check_preempt_curr = check_preempt_curr_rt,
2056
2057 .pick_next_task = pick_next_task_rt,
2058 .put_prev_task = put_prev_task_rt,
2059
2060#ifdef CONFIG_SMP
2061 .select_task_rq = select_task_rq_rt,
2062
2063 .set_cpus_allowed = set_cpus_allowed_rt,
2064 .rq_online = rq_online_rt,
2065 .rq_offline = rq_offline_rt,
2066 .pre_schedule = pre_schedule_rt,
2067 .post_schedule = post_schedule_rt,
2068 .task_woken = task_woken_rt,
2069 .switched_from = switched_from_rt,
2070#endif
2071
2072 .set_curr_task = set_curr_task_rt,
2073 .task_tick = task_tick_rt,
2074
2075 .get_rr_interval = get_rr_interval_rt,
2076
2077 .prio_changed = prio_changed_rt,
2078 .switched_to = switched_to_rt,
2079};
2080
2081#ifdef CONFIG_SCHED_DEBUG
2082extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2083
2084void print_rt_stats(struct seq_file *m, int cpu)
2085{
2086 rt_rq_iter_t iter;
2087 struct rt_rq *rt_rq;
2088
2089 rcu_read_lock();
2090 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2091 print_rt_rq(m, cpu, rt_rq);
2092 rcu_read_unlock();
2093}
2094#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
deleted file mode 100644
index fc886441436..00000000000
--- a/kernel/sched/sched.h
+++ /dev/null
@@ -1,1241 +0,0 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters;
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
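/*
 * Worked example of the conversions above (a sketch; assumes the usual
 * MAX_RT_PRIO of 100, giving MAX_PRIO == 140):
 *
 *	NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100
 *	NICE_TO_PRIO(  0) == 100 +    0  + 20 == 120
 *	NICE_TO_PRIO( 19) == 100 +   19  + 20 == 139
 *
 * and USER_PRIO(120) == 20, so a nice-0 task has user priority 20 out
 * of MAX_USER_PRIO == 40.
 */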
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 */
40
41/*
42 * Single value that denotes runtime == period, i.e. unlimited time.
43 */
44#define RUNTIME_INF ((u64)~0ULL)
45
46static inline int rt_policy(int policy)
47{
48 if (policy == SCHED_FIFO || policy == SCHED_RR)
49 return 1;
50 return 0;
51}
52
53static inline int task_has_rt_policy(struct task_struct *p)
54{
55 return rt_policy(p->policy);
56}
57
58/*
59 * This is the priority-queue data structure of the RT scheduling class:
60 */
61struct rt_prio_array {
62 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
63 struct list_head queue[MAX_RT_PRIO];
64};
65
66struct rt_bandwidth {
67 /* nests inside the rq lock: */
68 raw_spinlock_t rt_runtime_lock;
69 ktime_t rt_period;
70 u64 rt_runtime;
71 struct hrtimer rt_period_timer;
72};
73
74extern struct mutex sched_domains_mutex;
75
76#ifdef CONFIG_CGROUP_SCHED
77
78#include <linux/cgroup.h>
79
80struct cfs_rq;
81struct rt_rq;
82
83extern struct list_head task_groups;
84
85struct cfs_bandwidth {
86#ifdef CONFIG_CFS_BANDWIDTH
87 raw_spinlock_t lock;
88 ktime_t period;
89 u64 quota, runtime;
90 s64 hierarchal_quota;
91 u64 runtime_expires;
92
93 int idle, timer_active;
94 struct hrtimer period_timer, slack_timer;
95 struct list_head throttled_cfs_rq;
96
97 /* statistics */
98 int nr_periods, nr_throttled;
99 u64 throttled_time;
100#endif
101};
102
103/* task group related information */
104struct task_group {
105 struct cgroup_subsys_state css;
106
107#ifdef CONFIG_FAIR_GROUP_SCHED
108 /* schedulable entities of this group on each cpu */
109 struct sched_entity **se;
110 /* runqueue "owned" by this group on each cpu */
111 struct cfs_rq **cfs_rq;
112 unsigned long shares;
113
114 atomic_t load_weight;
115 atomic64_t load_avg;
116 atomic_t runnable_avg;
117#endif
118
119#ifdef CONFIG_RT_GROUP_SCHED
120 struct sched_rt_entity **rt_se;
121 struct rt_rq **rt_rq;
122
123 struct rt_bandwidth rt_bandwidth;
124#endif
125
126 struct rcu_head rcu;
127 struct list_head list;
128
129 struct task_group *parent;
130 struct list_head siblings;
131 struct list_head children;
132
133#ifdef CONFIG_SCHED_AUTOGROUP
134 struct autogroup *autogroup;
135#endif
136
137 struct cfs_bandwidth cfs_bandwidth;
138};
139
140#ifdef CONFIG_FAIR_GROUP_SCHED
141#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
142
143/*
144 * A weight of 0 or 1 can cause arithmetic problems.
145 * The weight of a cfs_rq is the sum of the weights of the entities
146 * queued on it, so the weight of an entity should not be too large,
147 * and neither should the shares value of a task group.
148 * (The default weight is 1024 - so there's no practical
149 * limitation from this.)
150 */
151#define MIN_SHARES (1UL << 1)
152#define MAX_SHARES (1UL << 18)
153#endif
154
155/* Default task group.
156 * Every task in the system belongs to this group at bootup.
157 */
158extern struct task_group root_task_group;
159
160typedef int (*tg_visitor)(struct task_group *, void *);
161
162extern int walk_tg_tree_from(struct task_group *from,
163 tg_visitor down, tg_visitor up, void *data);
164
165/*
166 * Iterate the full tree, calling @down when first entering a node and @up when
167 * leaving it for the final time.
168 *
169 * Caller must hold the RCU read lock or a sufficient equivalent.
170 */
171static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
172{
173 return walk_tg_tree_from(&root_task_group, down, up, data);
174}
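/*
 * Minimal usage sketch (illustrative only; my_down_cb is a hypothetical
 * callback, not part of this file): visit every task group top-down,
 * doing nothing on the way back up:
 *
 *	static int my_down_cb(struct task_group *tg, void *data)
 *	{
 *		...
 *		return 0;
 *	}
 *
 *	rcu_read_lock();
 *	walk_tg_tree(my_down_cb, tg_nop, NULL);
 *	rcu_read_unlock();
 */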
175
176extern int tg_nop(struct task_group *tg, void *data);
177
178extern void free_fair_sched_group(struct task_group *tg);
179extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
180extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
181extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
182 struct sched_entity *se, int cpu,
183 struct sched_entity *parent);
184extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
185extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
186
187extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
188extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
189extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
190
191extern void free_rt_sched_group(struct task_group *tg);
192extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
193extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
194 struct sched_rt_entity *rt_se, int cpu,
195 struct sched_rt_entity *parent);
196
197#else /* CONFIG_CGROUP_SCHED */
198
199struct cfs_bandwidth { };
200
201#endif /* CONFIG_CGROUP_SCHED */
202
203/* CFS-related fields in a runqueue */
204struct cfs_rq {
205 struct load_weight load;
206 unsigned int nr_running, h_nr_running;
207
208 u64 exec_clock;
209 u64 min_vruntime;
210#ifndef CONFIG_64BIT
211 u64 min_vruntime_copy;
212#endif
213
214 struct rb_root tasks_timeline;
215 struct rb_node *rb_leftmost;
216
217 /*
218	 * 'curr' points to the currently running entity on this cfs_rq.
219	 * It is set to NULL otherwise (i.e. when none is currently running).
220 */
221 struct sched_entity *curr, *next, *last, *skip;
222
223#ifdef CONFIG_SCHED_DEBUG
224 unsigned int nr_spread_over;
225#endif
226
227#ifdef CONFIG_SMP
228/*
229 * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
230 * may be removed once load tracking is useful for applications beyond shares
231 * distribution (e.g. load balancing).
232 */
233#ifdef CONFIG_FAIR_GROUP_SCHED
234 /*
235 * CFS Load tracking
236 * Under CFS, load is tracked on a per-entity basis and aggregated up.
237 * This allows for the description of both thread and group usage (in
238 * the FAIR_GROUP_SCHED case).
239 */
240 u64 runnable_load_avg, blocked_load_avg;
241 atomic64_t decay_counter, removed_load;
242 u64 last_decay;
243#endif /* CONFIG_FAIR_GROUP_SCHED */
244/* These always depend on CONFIG_FAIR_GROUP_SCHED */
245#ifdef CONFIG_FAIR_GROUP_SCHED
246 u32 tg_runnable_contrib;
247 u64 tg_load_contrib;
248#endif /* CONFIG_FAIR_GROUP_SCHED */
249
250 /*
251 * h_load = weight * f(tg)
252 *
253 * Where f(tg) is the recursive weight fraction assigned to
254 * this group.
255 */
256 unsigned long h_load;
257#endif /* CONFIG_SMP */
258
259#ifdef CONFIG_FAIR_GROUP_SCHED
260 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
261
262 /*
263	 * leaf cfs_rqs are those that hold tasks (the lowest schedulable entities
264	 * in a hierarchy). Non-leaf cfs_rqs hold other, higher-level schedulable
265	 * entities (like users, containers, etc.).
266 *
267 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
268 * list is used during load balance.
269 */
270 int on_list;
271 struct list_head leaf_cfs_rq_list;
272 struct task_group *tg; /* group that "owns" this runqueue */
273
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_clock, throttled_clock_task;
280 u64 throttled_clock_task_time;
281 int throttled, throttle_count;
282 struct list_head throttled_list;
283#endif /* CONFIG_CFS_BANDWIDTH */
284#endif /* CONFIG_FAIR_GROUP_SCHED */
285};
286
287static inline int rt_bandwidth_enabled(void)
288{
289 return sysctl_sched_rt_runtime >= 0;
290}
291
292/* Real-Time classes' related field in a runqueue: */
293struct rt_rq {
294 struct rt_prio_array active;
295 unsigned int rt_nr_running;
296#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
297 struct {
298 int curr; /* highest queued rt task prio */
299#ifdef CONFIG_SMP
300 int next; /* next highest */
301#endif
302 } highest_prio;
303#endif
304#ifdef CONFIG_SMP
305 unsigned long rt_nr_migratory;
306 unsigned long rt_nr_total;
307 int overloaded;
308 struct plist_head pushable_tasks;
309#endif
310 int rt_throttled;
311 u64 rt_time;
312 u64 rt_runtime;
313 /* Nests inside the rq lock: */
314 raw_spinlock_t rt_runtime_lock;
315
316#ifdef CONFIG_RT_GROUP_SCHED
317 unsigned long rt_nr_boosted;
318
319 struct rq *rq;
320 struct list_head leaf_rt_rq_list;
321 struct task_group *tg;
322#endif
323};
324
325#ifdef CONFIG_SMP
326
327/*
328 * We add the notion of a root-domain which will be used to define per-domain
329 * variables. Each exclusive cpuset essentially defines an island domain by
330 * fully partitioning the member cpus from any other cpuset. Whenever a new
331 * exclusive cpuset is created, we also create and attach a new root-domain
332 * object.
333 *
334 */
335struct root_domain {
336 atomic_t refcount;
337 atomic_t rto_count;
338 struct rcu_head rcu;
339 cpumask_var_t span;
340 cpumask_var_t online;
341
342 /*
343 * The "RT overload" flag: it gets set if a CPU has more than
344 * one runnable RT task.
345 */
346 cpumask_var_t rto_mask;
347 struct cpupri cpupri;
348};
349
350extern struct root_domain def_root_domain;
351
352#endif /* CONFIG_SMP */
353
354/*
355 * This is the main, per-CPU runqueue data structure.
356 *
357 * Locking rule: code that wants to lock multiple runqueues (such as
358 * the load balancing or the thread migration code) must acquire the
359 * locks in ascending &runqueue order.
360 */
361struct rq {
362 /* runqueue lock: */
363 raw_spinlock_t lock;
364
365 /*
366 * nr_running and cpu_load should be in the same cacheline because
367 * remote CPUs use both these fields when doing load calculation.
368 */
369 unsigned int nr_running;
370 #define CPU_LOAD_IDX_MAX 5
371 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
372 unsigned long last_load_update_tick;
373#ifdef CONFIG_NO_HZ
374 u64 nohz_stamp;
375 unsigned long nohz_flags;
376#endif
377 int skip_clock_update;
378
379 /* capture load from *all* tasks on this cpu: */
380 struct load_weight load;
381 unsigned long nr_load_updates;
382 u64 nr_switches;
383
384 struct cfs_rq cfs;
385 struct rt_rq rt;
386
387#ifdef CONFIG_FAIR_GROUP_SCHED
388 /* list of leaf cfs_rq on this cpu: */
389 struct list_head leaf_cfs_rq_list;
390#ifdef CONFIG_SMP
391 unsigned long h_load_throttle;
392#endif /* CONFIG_SMP */
393#endif /* CONFIG_FAIR_GROUP_SCHED */
394
395#ifdef CONFIG_RT_GROUP_SCHED
396 struct list_head leaf_rt_rq_list;
397#endif
398
399 /*
400 * This is part of a global counter where only the total sum
401 * over all CPUs matters. A task can increase this counter on
402 * one CPU and if it got migrated afterwards it may decrease
403 * it on another CPU. Always updated under the runqueue lock:
404 */
405 unsigned long nr_uninterruptible;
406
407 struct task_struct *curr, *idle, *stop;
408 unsigned long next_balance;
409 struct mm_struct *prev_mm;
410
411 u64 clock;
412 u64 clock_task;
413
414 atomic_t nr_iowait;
415
416#ifdef CONFIG_SMP
417 struct root_domain *rd;
418 struct sched_domain *sd;
419
420 unsigned long cpu_power;
421
422 unsigned char idle_balance;
423 /* For active balancing */
424 int post_schedule;
425 int active_balance;
426 int push_cpu;
427 struct cpu_stop_work active_balance_work;
428 /* cpu of this runqueue: */
429 int cpu;
430 int online;
431
432 struct list_head cfs_tasks;
433
434 u64 rt_avg;
435 u64 age_stamp;
436 u64 idle_stamp;
437 u64 avg_idle;
438#endif
439
440#ifdef CONFIG_IRQ_TIME_ACCOUNTING
441 u64 prev_irq_time;
442#endif
443#ifdef CONFIG_PARAVIRT
444 u64 prev_steal_time;
445#endif
446#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
447 u64 prev_steal_time_rq;
448#endif
449
450 /* calc_load related fields */
451 unsigned long calc_load_update;
452 long calc_load_active;
453
454#ifdef CONFIG_SCHED_HRTICK
455#ifdef CONFIG_SMP
456 int hrtick_csd_pending;
457 struct call_single_data hrtick_csd;
458#endif
459 struct hrtimer hrtick_timer;
460#endif
461
462#ifdef CONFIG_SCHEDSTATS
463 /* latency stats */
464 struct sched_info rq_sched_info;
465 unsigned long long rq_cpu_time;
466 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
467
468 /* sys_sched_yield() stats */
469 unsigned int yld_count;
470
471 /* schedule() stats */
472 unsigned int sched_count;
473 unsigned int sched_goidle;
474
475 /* try_to_wake_up() stats */
476 unsigned int ttwu_count;
477 unsigned int ttwu_local;
478#endif
479
480#ifdef CONFIG_SMP
481 struct llist_head wake_list;
482#endif
483
484 struct sched_avg avg;
485};
486
487static inline int cpu_of(struct rq *rq)
488{
489#ifdef CONFIG_SMP
490 return rq->cpu;
491#else
492 return 0;
493#endif
494}
495
496DECLARE_PER_CPU(struct rq, runqueues);
497
498#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
499#define this_rq() (&__get_cpu_var(runqueues))
500#define task_rq(p) cpu_rq(task_cpu(p))
501#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
502#define raw_rq() (&__raw_get_cpu_var(runqueues))
503
504#ifdef CONFIG_SMP
505
506#define rcu_dereference_check_sched_domain(p) \
507 rcu_dereference_check((p), \
508 lockdep_is_held(&sched_domains_mutex))
509
510/*
511 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
512 * See detach_destroy_domains: synchronize_sched for details.
513 *
514 * The domain tree of any CPU may only be accessed from within
515 * preempt-disabled sections.
516 */
517#define for_each_domain(cpu, __sd) \
518 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
519 __sd; __sd = __sd->parent)
520
521#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
522
523/**
524 * highest_flag_domain - Return highest sched_domain containing flag.
525 * @cpu: The cpu whose highest level of sched domain is to
526 * be returned.
527 * @flag: The flag to check for the highest sched_domain
528 * for the given cpu.
529 *
530 * Returns the highest sched_domain of a cpu which contains the given flag.
531 */
532static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
533{
534 struct sched_domain *sd, *hsd = NULL;
535
536 for_each_domain(cpu, sd) {
537 if (!(sd->flags & flag))
538 break;
539 hsd = sd;
540 }
541
542 return hsd;
543}
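/*
 * Usage sketch (illustrative): the last-level-cache domain of a CPU can
 * be derived as the highest domain that still shares package resources,
 * e.g.
 *
 *	struct sched_domain *sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 *
 * which is essentially how the per-cpu sd_llc pointer below gets set.
 */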
544
545DECLARE_PER_CPU(struct sched_domain *, sd_llc);
546DECLARE_PER_CPU(int, sd_llc_id);
547
548extern int group_balance_cpu(struct sched_group *sg);
549
550#endif /* CONFIG_SMP */
551
552#include "stats.h"
553#include "auto_group.h"
554
555#ifdef CONFIG_CGROUP_SCHED
556
557/*
558 * Return the group to which this task belongs.
559 *
560 * We cannot use task_subsys_state() and friends because the cgroup
561 * subsystem changes that value before the cgroup_subsys::attach() method
562 * is called, therefore we cannot pin it and might observe the wrong value.
563 *
564 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
565 * core changes this before calling sched_move_task().
566 *
567 * Instead we use a 'copy' which is updated from sched_move_task() while
568 * holding both task_struct::pi_lock and rq::lock.
569 */
570static inline struct task_group *task_group(struct task_struct *p)
571{
572 return p->sched_task_group;
573}
574
575/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
576static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
577{
578#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
579 struct task_group *tg = task_group(p);
580#endif
581
582#ifdef CONFIG_FAIR_GROUP_SCHED
583 p->se.cfs_rq = tg->cfs_rq[cpu];
584 p->se.parent = tg->se[cpu];
585#endif
586
587#ifdef CONFIG_RT_GROUP_SCHED
588 p->rt.rt_rq = tg->rt_rq[cpu];
589 p->rt.parent = tg->rt_se[cpu];
590#endif
591}
592
593#else /* CONFIG_CGROUP_SCHED */
594
595static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
596static inline struct task_group *task_group(struct task_struct *p)
597{
598 return NULL;
599}
600
601#endif /* CONFIG_CGROUP_SCHED */
602
603static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
604{
605 set_task_rq(p, cpu);
606#ifdef CONFIG_SMP
607 /*
608	 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be
609	 * successfully executed on another CPU. We must ensure that updates of
610 * per-task data have been completed by this moment.
611 */
612 smp_wmb();
613 task_thread_info(p)->cpu = cpu;
614#endif
615}
616
617/*
618 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
619 */
620#ifdef CONFIG_SCHED_DEBUG
621# include <linux/static_key.h>
622# define const_debug __read_mostly
623#else
624# define const_debug const
625#endif
626
627extern const_debug unsigned int sysctl_sched_features;
628
629#define SCHED_FEAT(name, enabled) \
630 __SCHED_FEAT_##name ,
631
632enum {
633#include "features.h"
634 __SCHED_FEAT_NR,
635};
636
637#undef SCHED_FEAT
638
639#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
640static __always_inline bool static_branch__true(struct static_key *key)
641{
642 return static_key_true(key); /* Not out of line branch. */
643}
644
645static __always_inline bool static_branch__false(struct static_key *key)
646{
647 return static_key_false(key); /* Out of line branch. */
648}
649
650#define SCHED_FEAT(name, enabled) \
651static __always_inline bool static_branch_##name(struct static_key *key) \
652{ \
653 return static_branch__##enabled(key); \
654}
655
656#include "features.h"
657
658#undef SCHED_FEAT
659
660extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
661#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
662#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
678static inline u64 global_rt_period(void)
679{
680 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
681}
682
683static inline u64 global_rt_runtime(void)
684{
685 if (sysctl_sched_rt_runtime < 0)
686 return RUNTIME_INF;
687
688 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
689}
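/*
 * Worked example with the usual defaults (a sketch; assumes
 * sysctl_sched_rt_period == 1000000 us and
 * sysctl_sched_rt_runtime == 950000 us):
 *
 *	global_rt_period()  == 1000000 * NSEC_PER_USEC == 1 s
 *	global_rt_runtime() ==  950000 * NSEC_PER_USEC == 0.95 s
 *
 * i.e. RT tasks may consume at most 95% of every second, and writing -1
 * to sched_rt_runtime_us yields RUNTIME_INF (no throttling).
 */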
690
691
692
693static inline int task_current(struct rq *rq, struct task_struct *p)
694{
695 return rq->curr == p;
696}
697
698static inline int task_running(struct rq *rq, struct task_struct *p)
699{
700#ifdef CONFIG_SMP
701 return p->on_cpu;
702#else
703 return task_current(rq, p);
704#endif
705}
706
707
708#ifndef prepare_arch_switch
709# define prepare_arch_switch(next) do { } while (0)
710#endif
711#ifndef finish_arch_switch
712# define finish_arch_switch(prev) do { } while (0)
713#endif
714#ifndef finish_arch_post_lock_switch
715# define finish_arch_post_lock_switch() do { } while (0)
716#endif
717
718#ifndef __ARCH_WANT_UNLOCKED_CTXSW
719static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
720{
721#ifdef CONFIG_SMP
722 /*
723 * We can optimise this out completely for !SMP, because the
724 * SMP rebalancing from interrupt is the only thing that cares
725 * here.
726 */
727 next->on_cpu = 1;
728#endif
729}
730
731static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
732{
733#ifdef CONFIG_SMP
734 /*
735 * After ->on_cpu is cleared, the task can be moved to a different CPU.
736 * We must ensure this doesn't happen until the switch is completely
737 * finished.
738 */
739 smp_wmb();
740 prev->on_cpu = 0;
741#endif
742#ifdef CONFIG_DEBUG_SPINLOCK
743 /* this is a valid case when another task releases the spinlock */
744 rq->lock.owner = current;
745#endif
746 /*
747 * If we are tracking spinlock dependencies then we have to
748 * fix up the runqueue lock - which gets 'carried over' from
749 * prev into current:
750 */
751 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
752
753 raw_spin_unlock_irq(&rq->lock);
754}
755
756#else /* __ARCH_WANT_UNLOCKED_CTXSW */
757static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
758{
759#ifdef CONFIG_SMP
760 /*
761 * We can optimise this out completely for !SMP, because the
762 * SMP rebalancing from interrupt is the only thing that cares
763 * here.
764 */
765 next->on_cpu = 1;
766#endif
767 raw_spin_unlock(&rq->lock);
768}
769
770static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
771{
772#ifdef CONFIG_SMP
773 /*
774 * After ->on_cpu is cleared, the task can be moved to a different CPU.
775 * We must ensure this doesn't happen until the switch is completely
776 * finished.
777 */
778 smp_wmb();
779 prev->on_cpu = 0;
780#endif
781 local_irq_enable();
782}
783#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
784
785
786static inline void update_load_add(struct load_weight *lw, unsigned long inc)
787{
788 lw->weight += inc;
789 lw->inv_weight = 0;
790}
791
792static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
793{
794 lw->weight -= dec;
795 lw->inv_weight = 0;
796}
797
798static inline void update_load_set(struct load_weight *lw, unsigned long w)
799{
800 lw->weight = w;
801 lw->inv_weight = 0;
802}
803
804/*
805 * To aid in avoiding the subversion of "niceness" due to uneven distribution
806 * of tasks with abnormal "nice" values across CPUs, the contribution that
807 * each task makes to its run queue's load is weighted according to its
808 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
809 * scaled version of the new time slice allocation that they receive on time
810 * slice expiry etc.
811 */
812
813#define WEIGHT_IDLEPRIO 3
814#define WMULT_IDLEPRIO 1431655765
815
816/*
817 * Nice levels are multiplicative, with a gentle 10% change for every
818 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
819 * nice 1, it will get ~10% less CPU time than another CPU-bound task
820 * that remained on nice 0.
821 *
822 * The "10% effect" is relative and cumulative: from _any_ nice level,
823 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
824 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
825 * If a task goes up by ~10% and another task goes down by ~10% then
826 * the relative distance between them is ~25%.)
827 */
828static const int prio_to_weight[40] = {
829 /* -20 */ 88761, 71755, 56483, 46273, 36291,
830 /* -15 */ 29154, 23254, 18705, 14949, 11916,
831 /* -10 */ 9548, 7620, 6100, 4904, 3906,
832 /* -5 */ 3121, 2501, 1991, 1586, 1277,
833 /* 0 */ 1024, 820, 655, 526, 423,
834 /* 5 */ 335, 272, 215, 172, 137,
835 /* 10 */ 110, 87, 70, 56, 45,
836 /* 15 */ 36, 29, 23, 18, 15,
837};
838
839/*
840 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
841 *
842 * In cases where the weight does not change often, we can use the
843 * precalculated inverse to speed up arithmetic by turning divisions
844 * into multiplications:
845 */
846static const u32 prio_to_wmult[40] = {
847 /* -20 */ 48388, 59856, 76040, 92818, 118348,
848 /* -15 */ 147320, 184698, 229616, 287308, 360437,
849 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
850 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
851 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
852 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
853 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
854 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
855};
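/*
 * Worked example for the two tables above (illustrative): a nice-0 task
 * has weight 1024 and a nice-1 task has weight 820, so with both
 * runnable the nice-1 task gets about 820 / (1024 + 820) ~= 44.5% of
 * the CPU versus ~55.5%, i.e. the 1024/820 ~= 1.25 relative distance
 * described in the comment above. The inverse table turns the division
 * by 820 into a multiply and shift:
 *
 *	x / 820 ~= (x * prio_to_wmult[21]) >> 32 == (x * 5237765) >> 32
 *
 * since 5237765 ~= 2^32 / 820.
 */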
856
857/* Time spent by the tasks of the cpu accounting group executing in ... */
858enum cpuacct_stat_index {
859 CPUACCT_STAT_USER, /* ... user mode */
860 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
861
862 CPUACCT_STAT_NSTATS,
863};
864
865
866#define sched_class_highest (&stop_sched_class)
867#define for_each_class(class) \
868 for (class = sched_class_highest; class; class = class->next)
869
870extern const struct sched_class stop_sched_class;
871extern const struct sched_class rt_sched_class;
872extern const struct sched_class fair_sched_class;
873extern const struct sched_class idle_sched_class;
874
875
876#ifdef CONFIG_SMP
877
878extern void trigger_load_balance(struct rq *rq, int cpu);
879extern void idle_balance(int this_cpu, struct rq *this_rq);
880
881#else /* CONFIG_SMP */
882
883static inline void idle_balance(int cpu, struct rq *rq)
884{
885}
886
887#endif
888
889extern void sysrq_sched_debug_show(void);
890extern void sched_init_granularity(void);
891extern void update_max_interval(void);
892extern void update_group_power(struct sched_domain *sd, int cpu);
893extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
894extern void init_sched_rt_class(void);
895extern void init_sched_fair_class(void);
896
897extern void resched_task(struct task_struct *p);
898extern void resched_cpu(int cpu);
899
900extern struct rt_bandwidth def_rt_bandwidth;
901extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
902
903extern void update_idle_cpu_load(struct rq *this_rq);
904
905#ifdef CONFIG_CGROUP_CPUACCT
906#include <linux/cgroup.h>
907/* track cpu usage of a group of tasks and its child groups */
908struct cpuacct {
909 struct cgroup_subsys_state css;
910 /* cpuusage holds pointer to a u64-type object on every cpu */
911 u64 __percpu *cpuusage;
912 struct kernel_cpustat __percpu *cpustat;
913};
914
915extern struct cgroup_subsys cpuacct_subsys;
916extern struct cpuacct root_cpuacct;
917
918/* return cpu accounting group corresponding to this container */
919static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
920{
921 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
922 struct cpuacct, css);
923}
924
925/* return cpu accounting group to which this task belongs */
926static inline struct cpuacct *task_ca(struct task_struct *tsk)
927{
928 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
929 struct cpuacct, css);
930}
931
932static inline struct cpuacct *parent_ca(struct cpuacct *ca)
933{
934 if (!ca || !ca->css.cgroup->parent)
935 return NULL;
936 return cgroup_ca(ca->css.cgroup->parent);
937}
938
939extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
940#else
941static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
942#endif
943
944#ifdef CONFIG_PARAVIRT
945static inline u64 steal_ticks(u64 steal)
946{
947 if (unlikely(steal > NSEC_PER_SEC))
948 return div_u64(steal, TICK_NSEC);
949
950 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
951}
952#endif
953
954static inline void inc_nr_running(struct rq *rq)
955{
956 rq->nr_running++;
957}
958
959static inline void dec_nr_running(struct rq *rq)
960{
961 rq->nr_running--;
962}
963
964extern void update_rq_clock(struct rq *rq);
965
966extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
967extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
968
969extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
970
971extern const_debug unsigned int sysctl_sched_time_avg;
972extern const_debug unsigned int sysctl_sched_nr_migrate;
973extern const_debug unsigned int sysctl_sched_migration_cost;
974
975static inline u64 sched_avg_period(void)
976{
977 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
978}
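/*
 * Worked example (a sketch; assumes the default sysctl_sched_time_avg
 * of 1000 ms):
 *
 *	sched_avg_period() == 1000 * NSEC_PER_MSEC / 2 == 500000000 ns
 *
 * i.e. rt_avg is averaged over half-second periods.
 */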
979
980#ifdef CONFIG_SCHED_HRTICK
981
982/*
983 * Use hrtick when:
984 * - enabled by features
985 * - hrtimer is actually high res
986 */
987static inline int hrtick_enabled(struct rq *rq)
988{
989 if (!sched_feat(HRTICK))
990 return 0;
991 if (!cpu_active(cpu_of(rq)))
992 return 0;
993 return hrtimer_is_hres_active(&rq->hrtick_timer);
994}
995
996void hrtick_start(struct rq *rq, u64 delay);
997
998#else
999
1000static inline int hrtick_enabled(struct rq *rq)
1001{
1002 return 0;
1003}
1004
1005#endif /* CONFIG_SCHED_HRTICK */
1006
1007#ifdef CONFIG_SMP
1008extern void sched_avg_update(struct rq *rq);
1009static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1010{
1011 rq->rt_avg += rt_delta;
1012 sched_avg_update(rq);
1013}
1014#else
1015static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1016static inline void sched_avg_update(struct rq *rq) { }
1017#endif
1018
1019extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
1020
1021#ifdef CONFIG_SMP
1022#ifdef CONFIG_PREEMPT
1023
1024static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1025
1026/*
1027 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1028 * way at the expense of forcing extra atomic operations in all
1029 * invocations. This assures that the double_lock is acquired using the
1030 * same underlying policy as the spinlock_t on this architecture, which
1031 * reduces latency compared to the unfair variant below. However, it
1032 * also adds more overhead and therefore may reduce throughput.
1033 */
1034static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1035 __releases(this_rq->lock)
1036 __acquires(busiest->lock)
1037 __acquires(this_rq->lock)
1038{
1039 raw_spin_unlock(&this_rq->lock);
1040 double_rq_lock(this_rq, busiest);
1041
1042 return 1;
1043}
1044
1045#else
1046/*
1047 * Unfair double_lock_balance: Optimizes throughput at the expense of
1048 * latency by eliminating extra atomic operations when the locks are
1049 * already in proper order on entry. This favors lower cpu-ids and will
1050 * grant the double lock to lower cpus over higher ids under contention,
1051 * regardless of entry order into the function.
1052 */
1053static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1054 __releases(this_rq->lock)
1055 __acquires(busiest->lock)
1056 __acquires(this_rq->lock)
1057{
1058 int ret = 0;
1059
1060 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1061 if (busiest < this_rq) {
1062 raw_spin_unlock(&this_rq->lock);
1063 raw_spin_lock(&busiest->lock);
1064 raw_spin_lock_nested(&this_rq->lock,
1065 SINGLE_DEPTH_NESTING);
1066 ret = 1;
1067 } else
1068 raw_spin_lock_nested(&busiest->lock,
1069 SINGLE_DEPTH_NESTING);
1070 }
1071 return ret;
1072}
1073
1074#endif /* CONFIG_PREEMPT */
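/*
 * Note on the deadlock avoidance above (an illustrative sketch): both
 * variants end up taking the two rq locks in ascending address order
 * whenever the second lock cannot be taken immediately, e.g.
 *
 *	if (busiest < this_rq) { lock(busiest); lock(this_rq); }
 *	else                   { lock(this_rq); lock(busiest); }
 *
 * Since every path agrees on that global order, two CPUs can never each
 * hold one rq lock while waiting for the other's, so circular wait (and
 * therefore deadlock) is impossible.
 */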
1075
1076/*
1077 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1078 */
1079static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1080{
1081 if (unlikely(!irqs_disabled())) {
1082		/* printk() doesn't work well under rq->lock */
1083 raw_spin_unlock(&this_rq->lock);
1084 BUG_ON(1);
1085 }
1086
1087 return _double_lock_balance(this_rq, busiest);
1088}
1089
1090static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1091 __releases(busiest->lock)
1092{
1093 raw_spin_unlock(&busiest->lock);
1094 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1095}
1096
1097/*
1098 * double_rq_lock - safely lock two runqueues
1099 *
1100 * Note this does not disable interrupts like task_rq_lock,
1101 * you need to do so manually before calling.
1102 */
1103static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1104 __acquires(rq1->lock)
1105 __acquires(rq2->lock)
1106{
1107 BUG_ON(!irqs_disabled());
1108 if (rq1 == rq2) {
1109 raw_spin_lock(&rq1->lock);
1110 __acquire(rq2->lock); /* Fake it out ;) */
1111 } else {
1112 if (rq1 < rq2) {
1113 raw_spin_lock(&rq1->lock);
1114 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1115 } else {
1116 raw_spin_lock(&rq2->lock);
1117 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1118 }
1119 }
1120}
1121
1122/*
1123 * double_rq_unlock - safely unlock two runqueues
1124 *
1125 * Note this does not restore interrupts like task_rq_unlock,
1126 * you need to do so manually after calling.
1127 */
1128static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1129 __releases(rq1->lock)
1130 __releases(rq2->lock)
1131{
1132 raw_spin_unlock(&rq1->lock);
1133 if (rq1 != rq2)
1134 raw_spin_unlock(&rq2->lock);
1135 else
1136 __release(rq2->lock);
1137}
1138
1139#else /* CONFIG_SMP */
1140
1141/*
1142 * double_rq_lock - safely lock two runqueues
1143 *
1144 * Note this does not disable interrupts like task_rq_lock,
1145 * you need to do so manually before calling.
1146 */
1147static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1148 __acquires(rq1->lock)
1149 __acquires(rq2->lock)
1150{
1151 BUG_ON(!irqs_disabled());
1152 BUG_ON(rq1 != rq2);
1153 raw_spin_lock(&rq1->lock);
1154 __acquire(rq2->lock); /* Fake it out ;) */
1155}
1156
1157/*
1158 * double_rq_unlock - safely unlock two runqueues
1159 *
1160 * Note this does not restore interrupts like task_rq_unlock,
1161 * you need to do so manually after calling.
1162 */
1163static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1164 __releases(rq1->lock)
1165 __releases(rq2->lock)
1166{
1167 BUG_ON(rq1 != rq2);
1168 raw_spin_unlock(&rq1->lock);
1169 __release(rq2->lock);
1170}
1171
1172#endif
1173
1174extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1175extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1176extern void print_cfs_stats(struct seq_file *m, int cpu);
1177extern void print_rt_stats(struct seq_file *m, int cpu);
1178
1179extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1180extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1181
1182extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1183
1184#ifdef CONFIG_NO_HZ
1185enum rq_nohz_flag_bits {
1186 NOHZ_TICK_STOPPED,
1187 NOHZ_BALANCE_KICK,
1188 NOHZ_IDLE,
1189};
1190
1191#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1192#endif
1193
1194#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1195
1196DECLARE_PER_CPU(u64, cpu_hardirq_time);
1197DECLARE_PER_CPU(u64, cpu_softirq_time);
1198
1199#ifndef CONFIG_64BIT
1200DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1201
1202static inline void irq_time_write_begin(void)
1203{
1204 __this_cpu_inc(irq_time_seq.sequence);
1205 smp_wmb();
1206}
1207
1208static inline void irq_time_write_end(void)
1209{
1210 smp_wmb();
1211 __this_cpu_inc(irq_time_seq.sequence);
1212}
1213
1214static inline u64 irq_time_read(int cpu)
1215{
1216 u64 irq_time;
1217 unsigned seq;
1218
1219 do {
1220 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1221 irq_time = per_cpu(cpu_softirq_time, cpu) +
1222 per_cpu(cpu_hardirq_time, cpu);
1223 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1224
1225 return irq_time;
1226}
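/*
 * Sketch of why the sequence counter above suffices on 32-bit: the
 * writer makes irq_time_seq odd, updates the two u64 counters (which
 * cannot be read atomically on 32-bit), then makes it even again. A
 * reader that sees an odd count, or a count that changed across its
 * reads, retries, so it never returns a torn softirq + hardirq sum.
 */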
1227#else /* CONFIG_64BIT */
1228static inline void irq_time_write_begin(void)
1229{
1230}
1231
1232static inline void irq_time_write_end(void)
1233{
1234}
1235
1236static inline u64 irq_time_read(int cpu)
1237{
1238 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1239}
1240#endif /* CONFIG_64BIT */
1241#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
deleted file mode 100644
index 903ffa9e887..00000000000
--- a/kernel/sched/stats.c
+++ /dev/null
@@ -1,111 +0,0 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
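/*
 * Example of the per-cpu line emitted above (illustrative; all values
 * are hypothetical):
 *
 *	cpu0 123 0 4567 890 2345 678 91011 121314 1516
 *
 * i.e. yld_count, a literal 0, sched_count, sched_goidle, ttwu_count,
 * ttwu_local, rq_cpu_time, run_delay and pcount, followed on SMP by one
 * "domainN <cpumask> ..." line per sched domain.
 */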
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
deleted file mode 100644
index 2ef90a51ec5..00000000000
--- a/kernel/sched/stats.h
+++ /dev/null
@@ -1,231 +0,0 @@
1
2#ifdef CONFIG_SCHEDSTATS
3
4/*
5 * Expects runqueue lock to be held for atomicity of update
6 */
7static inline void
8rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
9{
10 if (rq) {
11 rq->rq_sched_info.run_delay += delta;
12 rq->rq_sched_info.pcount++;
13 }
14}
15
16/*
17 * Expects runqueue lock to be held for atomicity of update
18 */
19static inline void
20rq_sched_info_depart(struct rq *rq, unsigned long long delta)
21{
22 if (rq)
23 rq->rq_cpu_time += delta;
24}
25
26static inline void
27rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
28{
29 if (rq)
30 rq->rq_sched_info.run_delay += delta;
31}
32# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
33# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
34# define schedstat_set(var, val) do { var = (val); } while (0)
35#else /* !CONFIG_SCHEDSTATS */
36static inline void
37rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
38{}
39static inline void
40rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
41{}
42static inline void
43rq_sched_info_depart(struct rq *rq, unsigned long long delta)
44{}
45# define schedstat_inc(rq, field) do { } while (0)
46# define schedstat_add(rq, field, amt) do { } while (0)
47# define schedstat_set(var, val) do { } while (0)
48#endif
49
50#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
51static inline void sched_info_reset_dequeued(struct task_struct *t)
52{
53 t->sched_info.last_queued = 0;
54}
55
56/*
57 * We are interested in knowing how long it was from the *first* time a
58 * task was queued to the time that it finally hit a cpu. We call this routine
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew.
61 */
62static inline void sched_info_dequeued(struct task_struct *t)
63{
64 unsigned long long now = task_rq(t)->clock, delta = 0;
65
66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued)
68 delta = now - t->sched_info.last_queued;
69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta;
71
72 rq_sched_info_dequeued(task_rq(t), delta);
73}
74
75/*
76 * Called when a task finally hits the cpu. We can now calculate how
77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is.
79 */
80static void sched_info_arrive(struct task_struct *t)
81{
82 unsigned long long now = task_rq(t)->clock, delta = 0;
83
84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued;
86 sched_info_reset_dequeued(t);
87 t->sched_info.run_delay += delta;
88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++;
90
91 rq_sched_info_arrive(task_rq(t), delta);
92}
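/*
 * Worked example (hypothetical clock values): a task queued at
 * rq->clock == 1000000 ns that first gets the CPU at 1750000 ns has
 * waited 750000 ns, so both t->sched_info.run_delay and the rq-wide
 * rq_sched_info.run_delay grow by 750000 and both pcount fields are
 * bumped by one.
 */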
93
94/*
95 * This function is only called from enqueue_task(), but also only updates
96 * the timestamp if it is not already set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */
99static inline void sched_info_queued(struct task_struct *t)
100{
101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock;
104}
105
106/*
107 * Called when a process ceases being the active-running process, either
108 * voluntarily or involuntarily. Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue.
112 */
113static inline void sched_info_depart(struct task_struct *t)
114{
115 unsigned long long delta = task_rq(t)->clock -
116 t->sched_info.last_arrival;
117
118 rq_sched_info_depart(task_rq(t), delta);
119
120 if (t->state == TASK_RUNNING)
121 sched_info_queued(t);
122}
123
124/*
125 * Called when tasks are switched involuntarily due, typically, to expiring
126 * their time slice. (This may also be called when switching to or from
127 * the idle task.) We are only called when prev != next.
128 */
129static inline void
130__sched_info_switch(struct task_struct *prev, struct task_struct *next)
131{
132 struct rq *rq = task_rq(prev);
133
134 /*
135 * prev now departs the cpu. It's not interesting to record
136 * stats about how efficient we were at scheduling the idle
137 * process, however.
138 */
139 if (prev != rq->idle)
140 sched_info_depart(prev);
141
142 if (next != rq->idle)
143 sched_info_arrive(next);
144}
145static inline void
146sched_info_switch(struct task_struct *prev, struct task_struct *next)
147{
148 if (unlikely(sched_info_on()))
149 __sched_info_switch(prev, next);
150}
151#else
152#define sched_info_queued(t) do { } while (0)
153#define sched_info_reset_dequeued(t) do { } while (0)
154#define sched_info_dequeued(t) do { } while (0)
155#define sched_info_switch(t, next) do { } while (0)
156#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
157
158/*
159 * The following are functions that support scheduler-internal time accounting.
160 * These functions are generally called at the timer tick. None of this depends
161 * on CONFIG_SCHEDSTATS.
162 */
163
164/**
165 * account_group_user_time - Maintain utime for a thread group.
166 *
167 * @tsk: Pointer to task structure.
168 * @cputime: Time value by which to increment the utime field of the
169 * thread_group_cputime structure.
170 *
171 * If thread group time is being maintained, get the structure for the
172 * running CPU and update the utime field there.
173 */
174static inline void account_group_user_time(struct task_struct *tsk,
175 cputime_t cputime)
176{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178
179 if (!cputimer->running)
180 return;
181
182 raw_spin_lock(&cputimer->lock);
183 cputimer->cputime.utime += cputime;
184 raw_spin_unlock(&cputimer->lock);
185}
186
187/**
188 * account_group_system_time - Maintain stime for a thread group.
189 *
190 * @tsk: Pointer to task structure.
191 * @cputime: Time value by which to increment the stime field of the
192 * thread_group_cputime structure.
193 *
194 * If thread group time is being maintained, get the structure for the
195 * running CPU and update the stime field there.
196 */
197static inline void account_group_system_time(struct task_struct *tsk,
198 cputime_t cputime)
199{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201
202 if (!cputimer->running)
203 return;
204
205 raw_spin_lock(&cputimer->lock);
206 cputimer->cputime.stime += cputime;
207 raw_spin_unlock(&cputimer->lock);
208}
209
210/**
211 * account_group_exec_runtime - Maintain exec runtime for a thread group.
212 *
213 * @tsk: Pointer to task structure.
214 * @ns: Time value by which to increment the sum_exec_runtime field
215 * of the thread_group_cputime structure.
216 *
217 * If thread group time is being maintained, get the structure for the
218 * running CPU and update the sum_exec_runtime field there.
219 */
220static inline void account_group_exec_runtime(struct task_struct *tsk,
221 unsigned long long ns)
222{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224
225 if (!cputimer->running)
226 return;
227
228 raw_spin_lock(&cputimer->lock);
229 cputimer->cputime.sum_exec_runtime += ns;
230 raw_spin_unlock(&cputimer->lock);
231}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
deleted file mode 100644
index da5eb5bed84..00000000000
--- a/kernel/sched/stop_task.c
+++ /dev/null
@@ -1,128 +0,0 @@
1#include "sched.h"
2
3/*
4 * stop-task scheduling class.
5 *
6 * The stop task is the highest-priority task in the system; it preempts
7 * everything and will be preempted by nothing.
8 *
9 * See kernel/stop_machine.c
10 */
11
12#ifdef CONFIG_SMP
13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
15{
16 return task_cpu(p); /* stop tasks never migrate */
17}
18#endif /* CONFIG_SMP */
19
20static void
21check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
22{
23 /* we're never preempted */
24}
25
26static struct task_struct *pick_next_task_stop(struct rq *rq)
27{
28 struct task_struct *stop = rq->stop;
29
30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task;
32 return stop;
33 }
34
35 return NULL;
36}
37
38static void
39enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
40{
41 inc_nr_running(rq);
42}
43
44static void
45dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
46{
47 dec_nr_running(rq);
48}
49
50static void yield_task_stop(struct rq *rq)
51{
52 BUG(); /* the stop task should never yield, it's pointless. */
53}
54
55static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
56{
57 struct task_struct *curr = rq->curr;
58 u64 delta_exec;
59
60 delta_exec = rq->clock_task - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0;
63
64 schedstat_set(curr->se.statistics.exec_max,
65 max(curr->se.statistics.exec_max, delta_exec));
66
67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec);
69
70 curr->se.exec_start = rq->clock_task;
71 cpuacct_charge(curr, delta_exec);
72}
73
74static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
75{
76}
77
78static void set_curr_task_stop(struct rq *rq)
79{
80 struct task_struct *stop = rq->stop;
81
82 stop->se.exec_start = rq->clock_task;
83}
84
85static void switched_to_stop(struct rq *rq, struct task_struct *p)
86{
87 BUG(); /* it's impossible to change to this class */
88}
89
90static void
91prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
92{
93 BUG(); /* how!? what priority? */
94}
95
96static unsigned int
97get_rr_interval_stop(struct rq *rq, struct task_struct *task)
98{
99 return 0;
100}
101
102/*
103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */
105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class,
107
108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop,
110 .yield_task = yield_task_stop,
111
112 .check_preempt_curr = check_preempt_curr_stop,
113
114 .pick_next_task = pick_next_task_stop,
115 .put_prev_task = put_prev_task_stop,
116
117#ifdef CONFIG_SMP
118 .select_task_rq = select_task_rq_stop,
119#endif
120
121 .set_curr_task = set_curr_task_stop,
122 .task_tick = task_tick_stop,
123
124 .get_rr_interval = get_rr_interval_stop,
125
126 .prio_changed = prio_changed_stop,
127 .switched_to = switched_to_stop,
128};
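The class deleted above exists only to run the per-CPU stop tasks used by kernel/stop_machine.c. As a hedged illustration of how work normally reaches a stop task, here is a minimal module sketch; stop_one_cpu() and cpu_stop_fn_t come from <linux/stop_machine.h>, and the callback name is made up:

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/stop_machine.h>

/* Runs on the chosen CPU in the context of its stop task, preempting everything else. */
static int demo_stop_fn(void *arg)
{
	pr_info("running from the stop task of CPU %d\n", smp_processor_id());
	return 0;
}

static int __init demo_init(void)
{
	/* Queue work for CPU 0's stop task and wait for it to complete. */
	return stop_one_cpu(0, demo_stop_fn, NULL);
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");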
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b59377..57d4b13b631 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -3,357 +3,15 @@
3 * 3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> 4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 * 5 *
6 * Copyright (C) 2012 Google, Inc. 6 * This defines a simple but solid secure-computing mode.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
14 */ 7 */
15 8
16#include <linux/atomic.h>
17#include <linux/audit.h>
18#include <linux/compat.h>
19#include <linux/sched.h>
20#include <linux/seccomp.h> 9#include <linux/seccomp.h>
10#include <linux/sched.h>
11#include <linux/compat.h>
21 12
22/* #define SECCOMP_DEBUG 1 */ 13/* #define SECCOMP_DEBUG 1 */
23 14#define NR_SECCOMP_MODES 1
24#ifdef CONFIG_SECCOMP_FILTER
25#include <asm/syscall.h>
26#include <linux/filter.h>
27#include <linux/ptrace.h>
28#include <linux/security.h>
29#include <linux/slab.h>
30#include <linux/tracehook.h>
31#include <linux/uaccess.h>
32
33/**
34 * struct seccomp_filter - container for seccomp BPF programs
35 *
36 * @usage: reference count to manage the object lifetime.
37 * get/put helpers should be used when accessing an instance
38 * outside of a lifetime-guarded section. In general, this
39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate
43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting
46 * with current->seccomp.filter, the most recently attached or inherited filter.
47 * However, multiple filters may share a @prev node, by way of fork(), which
48 * results in a unidirectional tree existing in memory. This is similar to
49 * how namespaces work.
50 *
51 * seccomp_filter objects should never be modified after being attached
52 * to a task_struct (other than @usage).
53 */
54struct seccomp_filter {
55 atomic_t usage;
56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[];
59};
60
61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63
64/**
65 * get_u32 - returns a u32 offset into data
66 * @data: an unsigned 64-bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture.
76 */
77static inline u32 get_u32(u64 data, int index)
78{
79 return ((u32 *)&data)[index];
80}
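A tiny userspace analogue of the split get_u32() performs, with memcpy() standing in for the kernel's direct cast (the result is endianness-dependent, exactly as the comment warns):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Userspace sketch of get_u32(): view a u64 as two raw u32 halves. */
static uint32_t get_u32(uint64_t data, int index)
{
	uint32_t half;

	memcpy(&half, (uint8_t *)&data + index * sizeof(half), sizeof(half));
	return half;
}

int main(void)
{
	uint64_t value = 0x1122334455667788ULL;

	/* On little-endian machines index 0 is the low half; on big-endian it is the high half. */
	printf("index 0: %#x, index 1: %#x\n", get_u32(value, 0), get_u32(value, 1));
	return 0;
}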
81
82/* Helper for bpf_load below. */
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
84/**
85 * bpf_load: checks and returns a pointer to the requested offset
86 * @off: offset into struct seccomp_data to load from
87 *
88 * Returns the requested 32-bits of data.
89 * seccomp_check_filter() should assure that @off is 32-bit aligned
90 * and not out of bounds. Failure to do so is a BUG.
91 */
92u32 seccomp_bpf_load(int off)
93{
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112}
113
114/**
115 * seccomp_check_filter - verify seccomp filter code
116 * @filter: filter to verify
117 * @flen: length of filter
118 *
119 * Takes a previously checked filter (by sk_chk_filter) and
120 * redirects all filter code that loads struct sk_buff data
121 * and related data through seccomp_bpf_load. It also
122 * enforces length and alignment checking of those loads.
123 *
124 * Returns 0 if the rule set is legal or -EINVAL if not.
125 */
126static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
127{
128 int pc;
129 for (pc = 0; pc < flen; pc++) {
130 struct sock_filter *ftest = &filter[pc];
131 u16 code = ftest->code;
132 u32 k = ftest->k;
133
134 switch (code) {
135 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W;
137 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL;
140 continue;
141 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM;
143 ftest->k = sizeof(struct seccomp_data);
144 continue;
145 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM;
147 ftest->k = sizeof(struct seccomp_data);
148 continue;
149 /* Explicitly include allowed calls. */
150 case BPF_S_RET_K:
151 case BPF_S_RET_A:
152 case BPF_S_ALU_ADD_K:
153 case BPF_S_ALU_ADD_X:
154 case BPF_S_ALU_SUB_K:
155 case BPF_S_ALU_SUB_X:
156 case BPF_S_ALU_MUL_K:
157 case BPF_S_ALU_MUL_X:
158 case BPF_S_ALU_DIV_X:
159 case BPF_S_ALU_AND_K:
160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K:
166 case BPF_S_ALU_RSH_X:
167 case BPF_S_ALU_NEG:
168 case BPF_S_LD_IMM:
169 case BPF_S_LDX_IMM:
170 case BPF_S_MISC_TAX:
171 case BPF_S_MISC_TXA:
172 case BPF_S_ALU_DIV_K:
173 case BPF_S_LD_MEM:
174 case BPF_S_LDX_MEM:
175 case BPF_S_ST:
176 case BPF_S_STX:
177 case BPF_S_JMP_JA:
178 case BPF_S_JMP_JEQ_K:
179 case BPF_S_JMP_JEQ_X:
180 case BPF_S_JMP_JGE_K:
181 case BPF_S_JMP_JGE_X:
182 case BPF_S_JMP_JGT_K:
183 case BPF_S_JMP_JGT_X:
184 case BPF_S_JMP_JSET_K:
185 case BPF_S_JMP_JSET_X:
186 continue;
187 default:
188 return -EINVAL;
189 }
190 }
191 return 0;
192}
193
194/**
195 * seccomp_run_filters - evaluates all seccomp filters against @syscall
196 * @syscall: number of the current system call
197 *
198 * Returns valid seccomp BPF response codes.
199 */
200static u32 seccomp_run_filters(int syscall)
201{
202 struct seccomp_filter *f;
203 u32 ret = SECCOMP_RET_ALLOW;
204
205 /* Ensure unexpected behavior doesn't result in failing open. */
206 if (WARN_ON(current->seccomp.filter == NULL))
207 return SECCOMP_RET_KILL;
208
209 /*
210 * All filters in the list are evaluated and the lowest BPF return
211 * value always takes priority (ignoring the DATA).
212 */
213 for (f = current->seccomp.filter; f; f = f->prev) {
214 u32 cur_ret = sk_run_filter(NULL, f->insns);
215 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
216 ret = cur_ret;
217 }
218 return ret;
219}
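The "lowest BPF return value wins" rule works because the SECCOMP_RET_* action codes are ordered from most to least restrictive. A small userspace sketch of the same reduction, assuming a <linux/seccomp.h> that exports the filter-mode constants:

#include <stdio.h>
#include <linux/seccomp.h>

int main(void)
{
	/* Pretend three stacked filters returned these values for one syscall. */
	unsigned int returned[] = {
		SECCOMP_RET_ALLOW,
		SECCOMP_RET_ERRNO | 5,	/* action ERRNO, data = 5 */
		SECCOMP_RET_TRACE,
	};
	unsigned int ret = SECCOMP_RET_ALLOW;
	unsigned int i;

	/* Same reduction as seccomp_run_filters(): keep the lowest action seen. */
	for (i = 0; i < sizeof(returned) / sizeof(returned[0]); i++)
		if ((returned[i] & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
			ret = returned[i];

	printf("action %#x wins, data %u\n",
	       ret & SECCOMP_RET_ACTION, ret & SECCOMP_RET_DATA);
	return 0;
}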
220
221/**
222 * seccomp_attach_filter: Attaches a seccomp filter to current.
223 * @fprog: BPF program to install
224 *
225 * Returns 0 on success or an errno on failure.
226 */
227static long seccomp_attach_filter(struct sock_fprog *fprog)
228{
229 struct seccomp_filter *filter;
230 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
231 unsigned long total_insns = fprog->len;
232 long ret;
233
234 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
235 return -EINVAL;
236
237 for (filter = current->seccomp.filter; filter; filter = filter->prev)
238 total_insns += filter->len + 4; /* include a 4 instr penalty */
239 if (total_insns > MAX_INSNS_PER_PATH)
240 return -ENOMEM;
241
242 /*
243 * Installing a seccomp filter requires that the task have
244 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
245 * This avoids scenarios where unprivileged tasks can affect the
246 * behavior of privileged children.
247 */
248 if (!current->no_new_privs &&
249 security_capable_noaudit(current_cred(), current_user_ns(),
250 CAP_SYS_ADMIN) != 0)
251 return -EACCES;
252
253 /* Allocate a new seccomp_filter */
254 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
255 GFP_KERNEL|__GFP_NOWARN);
256 if (!filter)
257 return -ENOMEM;
258 atomic_set(&filter->usage, 1);
259 filter->len = fprog->len;
260
261 /* Copy the instructions from fprog. */
262 ret = -EFAULT;
263 if (copy_from_user(filter->insns, fprog->filter, fp_size))
264 goto fail;
265
266 /* Check and rewrite the fprog via the skb checker */
267 ret = sk_chk_filter(filter->insns, filter->len);
268 if (ret)
269 goto fail;
270
271 /* Check and rewrite the fprog for seccomp use */
272 ret = seccomp_check_filter(filter->insns, filter->len);
273 if (ret)
274 goto fail;
275
276 /*
277 * If there is an existing filter, make it the prev and don't drop its
278 * task reference.
279 */
280 filter->prev = current->seccomp.filter;
281 current->seccomp.filter = filter;
282 return 0;
283fail:
284 kfree(filter);
285 return ret;
286}
287
288/**
289 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
290 * @user_filter: pointer to the user data containing a sock_fprog.
291 *
292 * Returns 0 on success and non-zero otherwise.
293 */
294long seccomp_attach_user_filter(char __user *user_filter)
295{
296 struct sock_fprog fprog;
297 long ret = -EFAULT;
298
299#ifdef CONFIG_COMPAT
300 if (is_compat_task()) {
301 struct compat_sock_fprog fprog32;
302 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
303 goto out;
304 fprog.len = fprog32.len;
305 fprog.filter = compat_ptr(fprog32.filter);
306 } else /* falls through to the if below. */
307#endif
308 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
309 goto out;
310 ret = seccomp_attach_filter(&fprog);
311out:
312 return ret;
313}
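The user-side counterpart of seccomp_attach_user_filter() is a prctl() call carrying a classic BPF program. A hedged, minimal example; the no_new_privs requirement mirrors the check in seccomp_attach_filter() above, PR_SET_NO_NEW_PRIVS needs 3.5+ headers, and kernels without CONFIG_SECCOMP_FILTER simply return EINVAL:

#include <stdio.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		/* Load the syscall number from struct seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		/* Fail getpid() with a filter-chosen errno (38), allow everything else. */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | 38),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* Without CAP_SYS_ADMIN the kernel insists on no_new_privs, as above. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");

	/* Forces a real syscall; returns -1 with errno 38 under the filter. */
	printf("getpid() now returns %ld\n", (long)syscall(__NR_getpid));
	return 0;
}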
314
315/* get_seccomp_filter - increments the reference count of the filter on @tsk */
316void get_seccomp_filter(struct task_struct *tsk)
317{
318 struct seccomp_filter *orig = tsk->seccomp.filter;
319 if (!orig)
320 return;
321 /* Reference count is bounded by the number of total processes. */
322 atomic_inc(&orig->usage);
323}
324
325/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
326void put_seccomp_filter(struct task_struct *tsk)
327{
328 struct seccomp_filter *orig = tsk->seccomp.filter;
329 /* Clean up single-reference branches iteratively. */
330 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig;
332 orig = orig->prev;
333 kfree(freeme);
334 }
335}
336
337/**
338 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
339 * @syscall: syscall number to send to userland
340 * @reason: filter-supplied reason code to send to userland (via si_errno)
341 *
342 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
343 */
344static void seccomp_send_sigsys(int syscall, int reason)
345{
346 struct siginfo info;
347 memset(&info, 0, sizeof(info));
348 info.si_signo = SIGSYS;
349 info.si_code = SYS_SECCOMP;
350 info.si_call_addr = (void __user *)KSTK_EIP(current);
351 info.si_errno = reason;
352 info.si_arch = syscall_get_arch(current, task_pt_regs(current));
353 info.si_syscall = syscall;
354 force_sig_info(SIGSYS, &info, current);
355}
356#endif /* CONFIG_SECCOMP_FILTER */
357 15
358/* 16/*
359 * Secure computing mode 1 allows only read/write/exit/sigreturn. 17 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -372,15 +30,13 @@ static int mode1_syscalls_32[] = {
372}; 30};
373#endif 31#endif
374 32
375int __secure_computing(int this_syscall) 33void __secure_computing(int this_syscall)
376{ 34{
377 int mode = current->seccomp.mode; 35 int mode = current->seccomp.mode;
378 int exit_sig = 0; 36 int * syscall;
379 int *syscall;
380 u32 ret;
381 37
382 switch (mode) { 38 switch (mode) {
383 case SECCOMP_MODE_STRICT: 39 case 1:
384 syscall = mode1_syscalls; 40 syscall = mode1_syscalls;
385#ifdef CONFIG_COMPAT 41#ifdef CONFIG_COMPAT
386 if (is_compat_task()) 42 if (is_compat_task())
@@ -388,61 +44,9 @@ int __secure_computing(int this_syscall)
388#endif 44#endif
389 do { 45 do {
390 if (*syscall == this_syscall) 46 if (*syscall == this_syscall)
391 return 0; 47 return;
392 } while (*++syscall); 48 } while (*++syscall);
393 exit_sig = SIGKILL;
394 ret = SECCOMP_RET_KILL;
395 break;
396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: {
398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
400 ret = seccomp_run_filters(this_syscall);
401 data = ret & SECCOMP_RET_DATA;
402 ret &= SECCOMP_RET_ACTION;
403 switch (ret) {
404 case SECCOMP_RET_ERRNO:
405 /* Set the low-order 16 bits as an errno. */
406 syscall_set_return_value(current, regs,
407 -data, 0);
408 goto skip;
409 case SECCOMP_RET_TRAP:
410 /* Show the handler the original registers. */
411 syscall_rollback(current, regs);
412 /* Let the filter pass back 16 bits of data. */
413 seccomp_send_sigsys(this_syscall, data);
414 goto skip;
415 case SECCOMP_RET_TRACE:
416 /* Skip these calls if there is no tracer. */
417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
420 goto skip;
421 }
422 /* Allow the BPF to provide the event message */
423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
424 /*
425 * The delivery of a fatal signal during event
426 * notification may silently skip tracer notification.
427 * Terminating the task now avoids executing a system
428 * call that may not be intended.
429 */
430 if (fatal_signal_pending(current))
431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
435 return 0;
436 case SECCOMP_RET_ALLOW:
437 return 0;
438 case SECCOMP_RET_KILL:
439 default:
440 break;
441 }
442 exit_sig = SIGSYS;
443 break; 49 break;
444 }
445#endif
446 default: 50 default:
447 BUG(); 51 BUG();
448 } 52 }
@@ -450,13 +54,7 @@ int __secure_computing(int this_syscall)
450#ifdef SECCOMP_DEBUG 54#ifdef SECCOMP_DEBUG
451 dump_stack(); 55 dump_stack();
452#endif 56#endif
453 audit_seccomp(this_syscall, exit_sig, ret); 57 do_exit(SIGKILL);
454 do_exit(exit_sig);
455#ifdef CONFIG_SECCOMP_FILTER
456skip:
457 audit_seccomp(this_syscall, exit_sig, ret);
458#endif
459 return -1;
460} 58}
461 59
462long prctl_get_seccomp(void) 60long prctl_get_seccomp(void)
@@ -464,48 +62,25 @@ long prctl_get_seccomp(void)
464 return current->seccomp.mode; 62 return current->seccomp.mode;
465} 63}
466 64
467/** 65long prctl_set_seccomp(unsigned long seccomp_mode)
468 * prctl_set_seccomp: configures current->seccomp.mode
469 * @seccomp_mode: requested mode to use
470 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
471 *
472 * This function may be called repeatedly with a @seccomp_mode of
473 * SECCOMP_MODE_FILTER to install additional filters. Every filter
474 * successfully installed will be evaluated (in reverse order) for each system
475 * call the task makes.
476 *
477 * Once current->seccomp.mode is non-zero, it may not be changed.
478 *
479 * Returns 0 on success or -EINVAL on failure.
480 */
481long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
482{ 66{
483 long ret = -EINVAL; 67 long ret;
484 68
485 if (current->seccomp.mode && 69 /* can set it only once to be even more secure */
486 current->seccomp.mode != seccomp_mode) 70 ret = -EPERM;
71 if (unlikely(current->seccomp.mode))
487 goto out; 72 goto out;
488 73
489 switch (seccomp_mode) { 74 ret = -EINVAL;
490 case SECCOMP_MODE_STRICT: 75 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
491 ret = 0; 76 current->seccomp.mode = seccomp_mode;
77 set_thread_flag(TIF_SECCOMP);
492#ifdef TIF_NOTSC 78#ifdef TIF_NOTSC
493 disable_TSC(); 79 disable_TSC();
494#endif 80#endif
495 break; 81 ret = 0;
496#ifdef CONFIG_SECCOMP_FILTER
497 case SECCOMP_MODE_FILTER:
498 ret = seccomp_attach_user_filter(filter);
499 if (ret)
500 goto out;
501 break;
502#endif
503 default:
504 goto out;
505 } 82 }
506 83
507 current->seccomp.mode = seccomp_mode; 84 out:
508 set_thread_flag(TIF_SECCOMP);
509out:
510 return ret; 85 return ret;
511} 86}
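The strict mode that survives this hunk is still reachable from userspace with a bare prctl(). A minimal sketch; since mode 1 leaves only read/write/exit/sigreturn usable, the exit has to be the raw exit syscall rather than glibc's exit_group() path:

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/seccomp.h>

int main(void)
{
	const char msg[] = "still alive under SECCOMP_MODE_STRICT\n";

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0))
		return 1;	/* e.g. kernel built without CONFIG_SECCOMP */

	write(STDOUT_FILENO, msg, sizeof(msg) - 1);	/* write is on the whitelist */
	/* Anything else - including glibc's normal exit path - is killed with SIGKILL. */
	syscall(SYS_exit, 0);
	return 0;	/* not reached */
}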
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe..94a62c0d4ad 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
27 27
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/export.h> 30#include <linux/module.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 raw_spin_lock_irqsave(&sem->lock, flags); 57 spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 raw_spin_unlock_irqrestore(&sem->lock, flags); 62 spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 raw_spin_lock_irqsave(&sem->lock, flags); 80 spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 raw_spin_unlock_irqrestore(&sem->lock, flags); 85 spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 raw_spin_lock_irqsave(&sem->lock, flags); 106 spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 raw_spin_unlock_irqrestore(&sem->lock, flags); 111 spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable);
118 * down_trylock - try to acquire the semaphore, without waiting 118 * down_trylock - try to acquire the semaphore, without waiting
119 * @sem: the semaphore to be acquired 119 * @sem: the semaphore to be acquired
120 * 120 *
121 * Try to acquire the semaphore atomically. Returns 0 if the semaphore has 121 * Try to acquire the semaphore atomically. Returns 0 if the mutex has
122 * been acquired successfully or 1 if it cannot be acquired. 122 * been acquired successfully or 1 if it cannot be acquired.
123 * 123 *
124 * NOTE: This return value is inverted from both spin_trylock and 124 * NOTE: This return value is inverted from both spin_trylock and
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 raw_spin_lock_irqsave(&sem->lock, flags); 135 spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 raw_spin_unlock_irqrestore(&sem->lock, flags); 139 spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 raw_spin_lock_irqsave(&sem->lock, flags); 160 spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 raw_spin_unlock_irqrestore(&sem->lock, flags); 165 spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 raw_spin_lock_irqsave(&sem->lock, flags); 182 spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 raw_spin_unlock_irqrestore(&sem->lock, flags); 187 spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 raw_spin_unlock_irq(&sem->lock); 220 spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 raw_spin_lock_irq(&sem->lock); 222 spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
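As a usage note for the API touched above, a hedged in-kernel sketch (names are illustrative); note the inverted down_trylock() return value called out in the kernel-doc:

#include <linux/module.h>
#include <linux/semaphore.h>

static struct semaphore demo_sem;

static int __init demo_init(void)
{
	sema_init(&demo_sem, 1);			/* binary semaphore */

	if (down_interruptible(&demo_sem))		/* sleeps; -EINTR if signalled */
		return -EINTR;
	/* ... critical section ... */
	up(&demo_sem);

	if (!down_trylock(&demo_sem)) {			/* 0 means acquired (inverted!) */
		/* ... got it without sleeping ... */
		up(&demo_sem);
	}
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");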
diff --git a/kernel/signal.c b/kernel/signal.c
index 372771e948c..195331c56ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,13 +11,12 @@
11 */ 11 */
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/export.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/tty.h> 18#include <linux/tty.h>
19#include <linux/binfmts.h> 19#include <linux/binfmts.h>
20#include <linux/coredump.h>
21#include <linux/security.h> 20#include <linux/security.h>
22#include <linux/syscalls.h> 21#include <linux/syscalls.h>
23#include <linux/ptrace.h> 22#include <linux/ptrace.h>
@@ -29,9 +28,6 @@
29#include <linux/freezer.h> 28#include <linux/freezer.h>
30#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
31#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
32#include <linux/user_namespace.h>
33#include <linux/uprobes.h>
34#include <linux/compat.h>
35#define CREATE_TRACE_POINTS 31#define CREATE_TRACE_POINTS
36#include <trace/events/signal.h> 32#include <trace/events/signal.h>
37 33
@@ -39,7 +35,6 @@
39#include <asm/uaccess.h> 35#include <asm/uaccess.h>
40#include <asm/unistd.h> 36#include <asm/unistd.h>
41#include <asm/siginfo.h> 37#include <asm/siginfo.h>
42#include <asm/cacheflush.h>
43#include "audit.h" /* audit_signal_info() */ 38#include "audit.h" /* audit_signal_info() */
44 39
45/* 40/*
@@ -62,20 +57,21 @@ static int sig_handler_ignored(void __user *handler, int sig)
62 (handler == SIG_DFL && sig_kernel_ignore(sig)); 57 (handler == SIG_DFL && sig_kernel_ignore(sig));
63} 58}
64 59
65static int sig_task_ignored(struct task_struct *t, int sig, bool force) 60static int sig_task_ignored(struct task_struct *t, int sig,
61 int from_ancestor_ns)
66{ 62{
67 void __user *handler; 63 void __user *handler;
68 64
69 handler = sig_handler(t, sig); 65 handler = sig_handler(t, sig);
70 66
71 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 67 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
72 handler == SIG_DFL && !force) 68 handler == SIG_DFL && !from_ancestor_ns)
73 return 1; 69 return 1;
74 70
75 return sig_handler_ignored(handler, sig); 71 return sig_handler_ignored(handler, sig);
76} 72}
77 73
78static int sig_ignored(struct task_struct *t, int sig, bool force) 74static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
79{ 75{
80 /* 76 /*
81 * Blocked signals are never ignored, since the 77 * Blocked signals are never ignored, since the
@@ -85,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
85 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 81 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
86 return 0; 82 return 0;
87 83
88 if (!sig_task_ignored(t, sig, force)) 84 if (!sig_task_ignored(t, sig, from_ancestor_ns))
89 return 0; 85 return 0;
90 86
91 /* 87 /*
@@ -163,7 +159,7 @@ void recalc_sigpending(void)
163 159
164#define SYNCHRONOUS_MASK \ 160#define SYNCHRONOUS_MASK \
165 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ 161 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
166 sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) 162 sigmask(SIGTRAP) | sigmask(SIGFPE))
167 163
168int next_signal(struct sigpending *pending, sigset_t *mask) 164int next_signal(struct sigpending *pending, sigset_t *mask)
169{ 165{
@@ -770,13 +766,14 @@ static int kill_ok_by_cred(struct task_struct *t)
770 const struct cred *cred = current_cred(); 766 const struct cred *cred = current_cred();
771 const struct cred *tcred = __task_cred(t); 767 const struct cred *tcred = __task_cred(t);
772 768
773 if (uid_eq(cred->euid, tcred->suid) || 769 if (cred->user->user_ns == tcred->user->user_ns &&
774 uid_eq(cred->euid, tcred->uid) || 770 (cred->euid == tcred->suid ||
775 uid_eq(cred->uid, tcred->suid) || 771 cred->euid == tcred->uid ||
776 uid_eq(cred->uid, tcred->uid)) 772 cred->uid == tcred->suid ||
773 cred->uid == tcred->uid))
777 return 1; 774 return 1;
778 775
779 if (ns_capable(tcred->user_ns, CAP_KILL)) 776 if (ns_capable(tcred->user->user_ns, CAP_KILL))
780 return 1; 777 return 1;
781 778
782 return 0; 779 return 0;
@@ -857,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t)
857 * Returns true if the signal should be actually delivered, otherwise 854 * Returns true if the signal should be actually delivered, otherwise
858 * it should be dropped. 855 * it should be dropped.
859 */ 856 */
860static int prepare_signal(int sig, struct task_struct *p, bool force) 857static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
861{ 858{
862 struct signal_struct *signal = p->signal; 859 struct signal_struct *signal = p->signal;
863 struct task_struct *t; 860 struct task_struct *t;
@@ -917,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, bool force)
917 } 914 }
918 } 915 }
919 916
920 return !sig_ignored(p, sig, force); 917 return !sig_ignored(p, sig, from_ancestor_ns);
921} 918}
922 919
923/* 920/*
@@ -1022,41 +1019,19 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1022 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1023} 1020}
1024 1021
1025#ifdef CONFIG_USER_NS
1026static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1027{
1028 if (current_user_ns() == task_cred_xxx(t, user_ns))
1029 return;
1030
1031 if (SI_FROMKERNEL(info))
1032 return;
1033
1034 rcu_read_lock();
1035 info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
1036 make_kuid(current_user_ns(), info->si_uid));
1037 rcu_read_unlock();
1038}
1039#else
1040static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1041{
1042 return;
1043}
1044#endif
1045
1046static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1047 int group, int from_ancestor_ns) 1023 int group, int from_ancestor_ns)
1048{ 1024{
1049 struct sigpending *pending; 1025 struct sigpending *pending;
1050 struct sigqueue *q; 1026 struct sigqueue *q;
1051 int override_rlimit; 1027 int override_rlimit;
1052 int ret = 0, result; 1028
1029 trace_signal_generate(sig, info, t);
1053 1030
1054 assert_spin_locked(&t->sighand->siglock); 1031 assert_spin_locked(&t->sighand->siglock);
1055 1032
1056 result = TRACE_SIGNAL_IGNORED; 1033 if (!prepare_signal(sig, t, from_ancestor_ns))
1057 if (!prepare_signal(sig, t, 1034 return 0;
1058 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1059 goto ret;
1060 1035
1061 pending = group ? &t->signal->shared_pending : &t->pending; 1036 pending = group ? &t->signal->shared_pending : &t->pending;
1062 /* 1037 /*
@@ -1064,11 +1039,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1064 * exactly one non-rt signal, so that we can get more 1039 * exactly one non-rt signal, so that we can get more
1065 * detailed information about the cause of the signal. 1040 * detailed information about the cause of the signal.
1066 */ 1041 */
1067 result = TRACE_SIGNAL_ALREADY_PENDING;
1068 if (legacy_queue(pending, sig)) 1042 if (legacy_queue(pending, sig))
1069 goto ret; 1043 return 0;
1070
1071 result = TRACE_SIGNAL_DELIVERED;
1072 /* 1044 /*
1073 * fast-pathed signals for kernel-internal things like SIGSTOP 1045 * fast-pathed signals for kernel-internal things like SIGSTOP
1074 * or SIGKILL. 1046 * or SIGKILL.
@@ -1101,7 +1073,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1101 q->info.si_code = SI_USER; 1073 q->info.si_code = SI_USER;
1102 q->info.si_pid = task_tgid_nr_ns(current, 1074 q->info.si_pid = task_tgid_nr_ns(current,
1103 task_active_pid_ns(t)); 1075 task_active_pid_ns(t));
1104 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1076 q->info.si_uid = current_uid();
1105 break; 1077 break;
1106 case (unsigned long) SEND_SIG_PRIV: 1078 case (unsigned long) SEND_SIG_PRIV:
1107 q->info.si_signo = sig; 1079 q->info.si_signo = sig;
@@ -1116,9 +1088,6 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1116 q->info.si_pid = 0; 1088 q->info.si_pid = 0;
1117 break; 1089 break;
1118 } 1090 }
1119
1120 userns_fixup_signal_uid(&q->info, t);
1121
1122 } else if (!is_si_special(info)) { 1091 } else if (!is_si_special(info)) {
1123 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1124 /* 1093 /*
@@ -1126,15 +1095,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1126 * signal was rt and sent by user using something 1095 * signal was rt and sent by user using something
1127 * other than kill(). 1096 * other than kill().
1128 */ 1097 */
1129 result = TRACE_SIGNAL_OVERFLOW_FAIL; 1098 trace_signal_overflow_fail(sig, group, info);
1130 ret = -EAGAIN; 1099 return -EAGAIN;
1131 goto ret;
1132 } else { 1100 } else {
1133 /* 1101 /*
1134 * This is a silent loss of information. We still 1102 * This is a silent loss of information. We still
1135 * send the signal, but the *info bits are lost. 1103 * send the signal, but the *info bits are lost.
1136 */ 1104 */
1137 result = TRACE_SIGNAL_LOSE_INFO; 1105 trace_signal_lose_info(sig, group, info);
1138 } 1106 }
1139 } 1107 }
1140 1108
@@ -1142,9 +1110,7 @@ out_set:
1142 signalfd_notify(t, sig); 1110 signalfd_notify(t, sig);
1143 sigaddset(&pending->signal, sig); 1111 sigaddset(&pending->signal, sig);
1144 complete_signal(sig, t, group); 1112 complete_signal(sig, t, group);
1145ret: 1113 return 0;
1146 trace_signal_generate(sig, info, t, group, result);
1147 return ret;
1148} 1114}
1149 1115
1150static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 1116static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1160,9 +1126,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1160 return __send_signal(sig, info, t, group, from_ancestor_ns); 1126 return __send_signal(sig, info, t, group, from_ancestor_ns);
1161} 1127}
1162 1128
1163static void print_fatal_signal(int signr) 1129static void print_fatal_signal(struct pt_regs *regs, int signr)
1164{ 1130{
1165 struct pt_regs *regs = signal_pt_regs();
1166 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1131 printk("%s/%d: potentially unexpected fatal signal %d.\n",
1167 current->comm, task_pid_nr(current), signr); 1132 current->comm, task_pid_nr(current), signr);
1168 1133
@@ -1379,22 +1344,13 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1379 return error; 1344 return error;
1380} 1345}
1381 1346
1382static int kill_as_cred_perm(const struct cred *cred,
1383 struct task_struct *target)
1384{
1385 const struct cred *pcred = __task_cred(target);
1386 if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
1387 !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
1388 return 0;
1389 return 1;
1390}
1391
1392/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1393int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, 1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1394 const struct cred *cred, u32 secid) 1349 uid_t uid, uid_t euid, u32 secid)
1395{ 1350{
1396 int ret = -EINVAL; 1351 int ret = -EINVAL;
1397 struct task_struct *p; 1352 struct task_struct *p;
1353 const struct cred *pcred;
1398 unsigned long flags; 1354 unsigned long flags;
1399 1355
1400 if (!valid_signal(sig)) 1356 if (!valid_signal(sig))
@@ -1406,7 +1362,10 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1406 ret = -ESRCH; 1362 ret = -ESRCH;
1407 goto out_unlock; 1363 goto out_unlock;
1408 } 1364 }
1409 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { 1365 pcred = __task_cred(p);
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1410 ret = -EPERM; 1369 ret = -EPERM;
1411 goto out_unlock; 1370 goto out_unlock;
1412 } 1371 }
@@ -1425,7 +1384,7 @@ out_unlock:
1425 rcu_read_unlock(); 1384 rcu_read_unlock();
1426 return ret; 1385 return ret;
1427} 1386}
1428EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); 1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1429 1388
1430/* 1389/*
1431 * kill_something_info() interprets pid in interesting ways just like kill(2). 1390 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1586,7 +1545,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1586 int sig = q->info.si_signo; 1545 int sig = q->info.si_signo;
1587 struct sigpending *pending; 1546 struct sigpending *pending;
1588 unsigned long flags; 1547 unsigned long flags;
1589 int ret, result; 1548 int ret;
1590 1549
1591 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1550 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1592 1551
@@ -1595,8 +1554,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1595 goto ret; 1554 goto ret;
1596 1555
1597 ret = 1; /* the signal is ignored */ 1556 ret = 1; /* the signal is ignored */
1598 result = TRACE_SIGNAL_IGNORED; 1557 if (!prepare_signal(sig, t, 0))
1599 if (!prepare_signal(sig, t, false))
1600 goto out; 1558 goto out;
1601 1559
1602 ret = 0; 1560 ret = 0;
@@ -1607,7 +1565,6 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1607 */ 1565 */
1608 BUG_ON(q->info.si_code != SI_TIMER); 1566 BUG_ON(q->info.si_code != SI_TIMER);
1609 q->info.si_overrun++; 1567 q->info.si_overrun++;
1610 result = TRACE_SIGNAL_ALREADY_PENDING;
1611 goto out; 1568 goto out;
1612 } 1569 }
1613 q->info.si_overrun = 0; 1570 q->info.si_overrun = 0;
@@ -1617,9 +1574,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1617 list_add_tail(&q->list, &pending->list); 1574 list_add_tail(&q->list, &pending->list);
1618 sigaddset(&pending->signal, sig); 1575 sigaddset(&pending->signal, sig);
1619 complete_signal(sig, t, group); 1576 complete_signal(sig, t, group);
1620 result = TRACE_SIGNAL_DELIVERED;
1621out: 1577out:
1622 trace_signal_generate(sig, &q->info, t, group, result);
1623 unlock_task_sighand(t, &flags); 1578 unlock_task_sighand(t, &flags);
1624ret: 1579ret:
1625 return ret; 1580 return ret;
@@ -1647,36 +1602,29 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1647 BUG_ON(!tsk->ptrace && 1602 BUG_ON(!tsk->ptrace &&
1648 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1649 1604
1650 if (sig != SIGCHLD) {
1651 /*
1652 * This is only possible if parent == real_parent.
1653 * Check if it has changed security domain.
1654 */
1655 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1656 sig = SIGCHLD;
1657 }
1658
1659 info.si_signo = sig; 1605 info.si_signo = sig;
1660 info.si_errno = 0; 1606 info.si_errno = 0;
1661 /* 1607 /*
1662 * We are under tasklist_lock here so our parent is tied to 1608 * we are under tasklist_lock here so our parent is tied to
1663 * us and cannot change. 1609 * us and cannot exit and release its namespace.
1664 * 1610 *
1665 * task_active_pid_ns will always return the same pid namespace 1611 * the only thing it can do is switch its nsproxy with sys_unshare,
1666 * until a task passes through release_task. 1612 * but unsharing pid namespaces is not allowed, so we'll always
1613 * see the relevant namespace
1667 * 1614 *
1668 * write_lock() currently calls preempt_disable() which is the 1615 * write_lock() currently calls preempt_disable() which is the
1669 * same as rcu_read_lock(), but according to Oleg, this is not 1616 * same as rcu_read_lock(), but according to Oleg, this is not
1670 * correct to rely on this 1617 * correct to rely on this
1671 */ 1618 */
1672 rcu_read_lock(); 1619 rcu_read_lock();
1673 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); 1620 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1674 info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), 1621 info.si_uid = __task_cred(tsk)->uid;
1675 task_uid(tsk));
1676 rcu_read_unlock(); 1622 rcu_read_unlock();
1677 1623
1678 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1624 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1679 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1625 tsk->signal->utime));
1626 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1627 tsk->signal->stime));
1680 1628
1681 info.si_status = tsk->exit_code & 0x7f; 1629 info.si_status = tsk->exit_code & 0x7f;
1682 if (tsk->exit_code & 0x80) 1630 if (tsk->exit_code & 0x80)
@@ -1754,8 +1702,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1754 * see comment in do_notify_parent() about the following 4 lines 1702 * see comment in do_notify_parent() about the following 4 lines
1755 */ 1703 */
1756 rcu_read_lock(); 1704 rcu_read_lock();
1757 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); 1705 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1706 info.si_uid = __task_cred(tsk)->uid;
1759 rcu_read_unlock(); 1707 rcu_read_unlock();
1760 1708
1761 info.si_utime = cputime_to_clock_t(tsk->utime); 1709 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1910,7 +1858,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1910 preempt_disable(); 1858 preempt_disable();
1911 read_unlock(&tasklist_lock); 1859 read_unlock(&tasklist_lock);
1912 preempt_enable_no_resched(); 1860 preempt_enable_no_resched();
1913 freezable_schedule(); 1861 schedule();
1914 } else { 1862 } else {
1915 /* 1863 /*
1916 * By the time we got the lock, our tracer went away. 1864 * By the time we got the lock, our tracer went away.
@@ -1932,6 +1880,13 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1932 } 1880 }
1933 1881
1934 /* 1882 /*
1883 * While in TASK_TRACED, we were considered "frozen enough".
1884 * Now that we woke up, it's crucial if we're supposed to be
1885 * frozen that we freeze now before running anything substantial.
1886 */
1887 try_to_freeze();
1888
1889 /*
1935 * We are back. Now reacquire the siglock before touching 1890 * We are back. Now reacquire the siglock before touching
1936 * last_siginfo, so that we are sure to have synchronized with 1891 * last_siginfo, so that we are sure to have synchronized with
1937 * any signal-sending on another CPU that wants to examine it. 1892 * any signal-sending on another CPU that wants to examine it.
@@ -1958,7 +1913,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1958 info.si_signo = signr; 1913 info.si_signo = signr;
1959 info.si_code = exit_code; 1914 info.si_code = exit_code;
1960 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1961 info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1916 info.si_uid = current_uid();
1962 1917
1963 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1964 ptrace_stop(exit_code, why, 1, &info); 1919 ptrace_stop(exit_code, why, 1, &info);
@@ -1967,8 +1922,6 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1967void ptrace_notify(int exit_code) 1922void ptrace_notify(int exit_code)
1968{ 1923{
1969 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1970 if (unlikely(current->task_works))
1971 task_work_run();
1972 1925
1973 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1974 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2087,7 +2040,7 @@ static bool do_signal_stop(int signr)
2087 } 2040 }
2088 2041
2089 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2042 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2090 freezable_schedule(); 2043 schedule();
2091 return true; 2044 return true;
2092 } else { 2045 } else {
2093 /* 2046 /*
@@ -2133,9 +2086,10 @@ static void do_jobctl_trap(void)
2133 } 2086 }
2134} 2087}
2135 2088
2136static int ptrace_signal(int signr, siginfo_t *info) 2089static int ptrace_signal(int signr, siginfo_t *info,
2090 struct pt_regs *regs, void *cookie)
2137{ 2091{
2138 ptrace_signal_deliver(); 2092 ptrace_signal_deliver(regs, cookie);
2139 /* 2093 /*
2140 * We do not check sig_kernel_stop(signr) but set this marker 2094 * We do not check sig_kernel_stop(signr) but set this marker
2141 * unconditionally because we do not know whether debugger will 2095 * unconditionally because we do not know whether debugger will
@@ -2165,11 +2119,8 @@ static int ptrace_signal(int signr, siginfo_t *info)
2165 info->si_signo = signr; 2119 info->si_signo = signr;
2166 info->si_errno = 0; 2120 info->si_errno = 0;
2167 info->si_code = SI_USER; 2121 info->si_code = SI_USER;
2168 rcu_read_lock();
2169 info->si_pid = task_pid_vnr(current->parent); 2122 info->si_pid = task_pid_vnr(current->parent);
2170 info->si_uid = from_kuid_munged(current_user_ns(), 2123 info->si_uid = task_uid(current->parent);
2171 task_uid(current->parent));
2172 rcu_read_unlock();
2173 } 2124 }
2174 2125
2175 /* If the (new) signal is now blocked, requeue it. */ 2126 /* If the (new) signal is now blocked, requeue it. */
@@ -2188,20 +2139,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2188 struct signal_struct *signal = current->signal; 2139 struct signal_struct *signal = current->signal;
2189 int signr; 2140 int signr;
2190 2141
2191 if (unlikely(current->task_works)) 2142relock:
2192 task_work_run();
2193
2194 if (unlikely(uprobe_deny_signal()))
2195 return 0;
2196
2197 /* 2143 /*
2198 * Do this once, we can't return to user-mode if freezing() == T. 2144 * We'll jump back here after any time we were stopped in TASK_STOPPED.
2199 * do_signal_stop() and ptrace_stop() do freezable_schedule() and 2145 * While in TASK_STOPPED, we were considered "frozen enough".
2200 * thus do not need another check after return. 2146 * Now that we woke up, it's crucial if we're supposed to be
2147 * frozen that we freeze now before running anything substantial.
2201 */ 2148 */
2202 try_to_freeze(); 2149 try_to_freeze();
2203 2150
2204relock:
2205 spin_lock_irq(&sighand->siglock); 2151 spin_lock_irq(&sighand->siglock);
2206 /* 2152 /*
2207 * Every stopped thread goes here after wakeup. Check to see if 2153 * Every stopped thread goes here after wakeup. Check to see if
@@ -2258,7 +2204,8 @@ relock:
2258 break; /* will return 0 */ 2204 break; /* will return 0 */
2259 2205
2260 if (unlikely(current->ptrace) && signr != SIGKILL) { 2206 if (unlikely(current->ptrace) && signr != SIGKILL) {
2261 signr = ptrace_signal(signr, info); 2207 signr = ptrace_signal(signr, info,
2208 regs, cookie);
2262 if (!signr) 2209 if (!signr)
2263 continue; 2210 continue;
2264 } 2211 }
@@ -2343,7 +2290,7 @@ relock:
2343 2290
2344 if (sig_kernel_coredump(signr)) { 2291 if (sig_kernel_coredump(signr)) {
2345 if (print_fatal_signals) 2292 if (print_fatal_signals)
2346 print_fatal_signal(info->si_signo); 2293 print_fatal_signal(regs, info->si_signo);
2347 /* 2294 /*
2348 * If it was able to dump core, this kills all 2295 * If it was able to dump core, this kills all
2349 * other threads in the group and synchronizes with 2296 * other threads in the group and synchronizes with
@@ -2352,7 +2299,7 @@ relock:
2352 * first and our do_group_exit call below will use 2299 * first and our do_group_exit call below will use
2353 * that value and ignore the one we pass it. 2300 * that value and ignore the one we pass it.
2354 */ 2301 */
2355 do_coredump(info); 2302 do_coredump(info->si_signo, info->si_signo, regs);
2356 } 2303 }
2357 2304
2358 /* 2305 /*
@@ -2365,37 +2312,6 @@ relock:
2365 return signr; 2312 return signr;
2366} 2313}
2367 2314
2368/**
2369 * signal_delivered -
2370 * @sig: number of signal being delivered
2371 * @info: siginfo_t of signal being delivered
2372 * @ka: sigaction setting that chose the handler
2373 * @regs: user register state
2374 * @stepping: nonzero if debugger single-step or block-step in use
2375 *
2376 * This function should be called when a signal has successfully been
2377 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
2378 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2379 * is set in @ka->sa.sa_flags. Tracing is notified.
2380 */
2381void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2382 struct pt_regs *regs, int stepping)
2383{
2384 sigset_t blocked;
2385
2386 /* A signal was successfully delivered, and the
2387 saved sigmask was stored on the signal frame,
2388 and will be restored by sigreturn. So we can
2389 simply clear the restore sigmask flag. */
2390 clear_restore_sigmask();
2391
2392 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2393 if (!(ka->sa.sa_flags & SA_NODEFER))
2394 sigaddset(&blocked, sig);
2395 set_current_blocked(&blocked);
2396 tracehook_signal_handler(sig, info, ka, regs, stepping);
2397}
2398
2399/* 2315/*
2400 * It could be that complete_signal() picked us to notify about the 2316 * It could be that complete_signal() picked us to notify about the
2401 * group-wide signal. Other threads should be notified now to take 2317 * group-wide signal. Other threads should be notified now to take
@@ -2433,15 +2349,8 @@ void exit_signals(struct task_struct *tsk)
2433 int group_stop = 0; 2349 int group_stop = 0;
2434 sigset_t unblocked; 2350 sigset_t unblocked;
2435 2351
2436 /*
2437 * @tsk is about to have PF_EXITING set - lock out users which
2438 * expect stable threadgroup.
2439 */
2440 threadgroup_change_begin(tsk);
2441
2442 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2352 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2443 tsk->flags |= PF_EXITING; 2353 tsk->flags |= PF_EXITING;
2444 threadgroup_change_end(tsk);
2445 return; 2354 return;
2446 } 2355 }
2447 2356
@@ -2451,9 +2360,6 @@ void exit_signals(struct task_struct *tsk)
2451 * see wants_signal(), do_signal_stop(). 2360 * see wants_signal(), do_signal_stop().
2452 */ 2361 */
2453 tsk->flags |= PF_EXITING; 2362 tsk->flags |= PF_EXITING;
2454
2455 threadgroup_change_end(tsk);
2456
2457 if (!signal_pending(tsk)) 2363 if (!signal_pending(tsk))
2458 goto out; 2364 goto out;
2459 2365
@@ -2526,13 +2432,7 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2526 * It is wrong to change ->blocked directly, this helper should be used 2432 * It is wrong to change ->blocked directly, this helper should be used
2527 * to ensure the process can't miss a shared signal we are going to block. 2433 * to ensure the process can't miss a shared signal we are going to block.
2528 */ 2434 */
2529void set_current_blocked(sigset_t *newset) 2435void set_current_blocked(const sigset_t *newset)
2530{
2531 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
2532 __set_current_blocked(newset);
2533}
2534
2535void __set_current_blocked(const sigset_t *newset)
2536{ 2436{
2537 struct task_struct *tsk = current; 2437 struct task_struct *tsk = current;
2538 2438
@@ -2572,7 +2472,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2572 return -EINVAL; 2472 return -EINVAL;
2573 } 2473 }
2574 2474
2575 __set_current_blocked(&newset); 2475 set_current_blocked(&newset);
2576 return 0; 2476 return 0;
2577} 2477}
2578 2478
@@ -2712,13 +2612,6 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2712 err |= __put_user(from->si_uid, &to->si_uid); 2612 err |= __put_user(from->si_uid, &to->si_uid);
2713 err |= __put_user(from->si_ptr, &to->si_ptr); 2613 err |= __put_user(from->si_ptr, &to->si_ptr);
2714 break; 2614 break;
2715#ifdef __ARCH_SIGSYS
2716 case __SI_SYS:
2717 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2718 err |= __put_user(from->si_syscall, &to->si_syscall);
2719 err |= __put_user(from->si_arch, &to->si_arch);
2720 break;
2721#endif
2722 default: /* this is just in case for now ... */ 2615 default: /* this is just in case for now ... */
2723 err |= __put_user(from->si_pid, &to->si_pid); 2616 err |= __put_user(from->si_pid, &to->si_pid);
2724 err |= __put_user(from->si_uid, &to->si_uid); 2617 err |= __put_user(from->si_uid, &to->si_uid);
@@ -2841,7 +2734,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2841 info.si_errno = 0; 2734 info.si_errno = 0;
2842 info.si_code = SI_USER; 2735 info.si_code = SI_USER;
2843 info.si_pid = task_tgid_vnr(current); 2736 info.si_pid = task_tgid_vnr(current);
2844 info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 2737 info.si_uid = current_uid();
2845 2738
2846 return kill_something_info(sig, &info, pid); 2739 return kill_something_info(sig, &info, pid);
2847} 2740}
@@ -2884,7 +2777,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2884 info.si_errno = 0; 2777 info.si_errno = 0;
2885 info.si_code = SI_TKILL; 2778 info.si_code = SI_TKILL;
2886 info.si_pid = task_tgid_vnr(current); 2779 info.si_pid = task_tgid_vnr(current);
2887 info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 2780 info.si_uid = current_uid();
2888 2781
2889 return do_send_specific(tgid, pid, sig, &info); 2782 return do_send_specific(tgid, pid, sig, &info);
2890} 2783}
@@ -3092,79 +2985,6 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3092out: 2985out:
3093 return error; 2986 return error;
3094} 2987}
3095#ifdef CONFIG_GENERIC_SIGALTSTACK
3096SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3097{
3098 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3099}
3100#endif
3101
3102int restore_altstack(const stack_t __user *uss)
3103{
3104 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
3105 /* squash all but EFAULT for now */
3106 return err == -EFAULT ? err : 0;
3107}
3108
3109int __save_altstack(stack_t __user *uss, unsigned long sp)
3110{
3111 struct task_struct *t = current;
3112 return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
3113 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3114 __put_user(t->sas_ss_size, &uss->ss_size);
3115}
3116
3117#ifdef CONFIG_COMPAT
3118#ifdef CONFIG_GENERIC_SIGALTSTACK
3119asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
3120 compat_stack_t __user *uoss_ptr)
3121{
3122 stack_t uss, uoss;
3123 int ret;
3124 mm_segment_t seg;
3125
3126 if (uss_ptr) {
3127 compat_stack_t uss32;
3128
3129 memset(&uss, 0, sizeof(stack_t));
3130 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3131 return -EFAULT;
3132 uss.ss_sp = compat_ptr(uss32.ss_sp);
3133 uss.ss_flags = uss32.ss_flags;
3134 uss.ss_size = uss32.ss_size;
3135 }
3136 seg = get_fs();
3137 set_fs(KERNEL_DS);
3138 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3139 (stack_t __force __user *) &uoss,
3140 compat_user_stack_pointer());
3141 set_fs(seg);
3142 if (ret >= 0 && uoss_ptr) {
3143 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
3144 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
3145 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
3146 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
3147 ret = -EFAULT;
3148 }
3149 return ret;
3150}
3151
3152int compat_restore_altstack(const compat_stack_t __user *uss)
3153{
3154 int err = compat_sys_sigaltstack(uss, NULL);
3155 /* squash all but -EFAULT for now */
3156 return err == -EFAULT ? err : 0;
3157}
3158
3159int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3160{
3161 struct task_struct *t = current;
3162 return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
3163 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3164 __put_user(t->sas_ss_size, &uss->ss_size);
3165}
3166#endif
3167#endif
3168 2988
3169#ifdef __ARCH_WANT_SYS_SIGPENDING 2989#ifdef __ARCH_WANT_SYS_SIGPENDING
3170 2990
@@ -3201,6 +3021,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3201 if (nset) { 3021 if (nset) {
3202 if (copy_from_user(&new_set, nset, sizeof(*nset))) 3022 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3203 return -EFAULT; 3023 return -EFAULT;
3024 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
3204 3025
3205 new_blocked = current->blocked; 3026 new_blocked = current->blocked;
3206 3027
@@ -3282,7 +3103,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3282 int old = current->blocked.sig[0]; 3103 int old = current->blocked.sig[0];
3283 sigset_t newset; 3104 sigset_t newset;
3284 3105
3285 siginitset(&newset, newmask); 3106 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
3286 set_current_blocked(&newset); 3107 set_current_blocked(&newset);
3287 3108
3288 return old; 3109 return old;
@@ -3321,17 +3142,6 @@ SYSCALL_DEFINE0(pause)
3321 3142
3322#endif 3143#endif
3323 3144
3324int sigsuspend(sigset_t *set)
3325{
3326 current->saved_sigmask = current->blocked;
3327 set_current_blocked(set);
3328
3329 current->state = TASK_INTERRUPTIBLE;
3330 schedule();
3331 set_restore_sigmask();
3332 return -ERESTARTNOHAND;
3333}
3334
3335#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 3145#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3336/** 3146/**
3337 * sys_rt_sigsuspend - replace the signal mask for a value with the 3147 * sys_rt_sigsuspend - replace the signal mask for a value with the
@@ -3349,7 +3159,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3349 3159
3350 if (copy_from_user(&newset, unewset, sizeof(newset))) 3160 if (copy_from_user(&newset, unewset, sizeof(newset)))
3351 return -EFAULT; 3161 return -EFAULT;
3352 return sigsuspend(&newset); 3162 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3163
3164 current->saved_sigmask = current->blocked;
3165 set_current_blocked(&newset);
3166
3167 current->state = TASK_INTERRUPTIBLE;
3168 schedule();
3169 set_restore_sigmask();
3170 return -ERESTARTNOHAND;
3353} 3171}
3354#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3172#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
3355 3173
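
The right-hand column strips SIGKILL and SIGSTOP from the requested mask before blocking (the sigdelsetmask()/sigmask() lines above); the kernel never lets userspace block those two signals. A minimal userspace sketch (not part of this patch; plain POSIX calls only) illustrating that behaviour:

/*
 * Sketch only: blocking SIGKILL/SIGSTOP in the sigsuspend() mask is
 * silently ignored, so a caught signal such as SIGUSR1 still wakes us.
 */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_usr1(int sig) { (void)sig; }

int main(void)
{
    sigset_t wait_mask;

    signal(SIGUSR1, on_usr1);

    sigemptyset(&wait_mask);
    sigaddset(&wait_mask, SIGKILL);   /* ignored by the kernel */
    sigaddset(&wait_mask, SIGSTOP);   /* ignored by the kernel */

    printf("pid %d waiting for SIGUSR1\n", (int)getpid());
    sigsuspend(&wait_mask);           /* returns -1/EINTR once a caught signal arrives */
    puts("woken by a caught signal");
    return 0;
}
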
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f..fb67dfa8394 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,15 +6,13 @@
6#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
7#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/export.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15 15
16#include "smpboot.h"
17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 16#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct { 17static struct {
20 struct list_head queue; 18 struct list_head queue;
@@ -581,6 +579,26 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
581 return 0; 579 return 0;
582} 580}
583EXPORT_SYMBOL(smp_call_function); 581EXPORT_SYMBOL(smp_call_function);
582
583void ipi_call_lock(void)
584{
585 raw_spin_lock(&call_function.lock);
586}
587
588void ipi_call_unlock(void)
589{
590 raw_spin_unlock(&call_function.lock);
591}
592
593void ipi_call_lock_irq(void)
594{
595 raw_spin_lock_irq(&call_function.lock);
596}
597
598void ipi_call_unlock_irq(void)
599{
600 raw_spin_unlock_irq(&call_function.lock);
601}
584#endif /* USE_GENERIC_SMP_HELPERS */ 602#endif /* USE_GENERIC_SMP_HELPERS */
585 603
586/* Setup configured maximum number of CPUs to activate */ 604/* Setup configured maximum number of CPUs to activate */
@@ -651,8 +669,6 @@ void __init smp_init(void)
651{ 669{
652 unsigned int cpu; 670 unsigned int cpu;
653 671
654 idle_threads_init();
655
656 /* FIXME: This should be done in userspace --RR */ 672 /* FIXME: This should be done in userspace --RR */
657 for_each_present_cpu(cpu) { 673 for_each_present_cpu(cpu) {
658 if (num_online_cpus() >= setup_max_cpus) 674 if (num_online_cpus() >= setup_max_cpus)
@@ -685,116 +701,3 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
685 return ret; 701 return ret;
686} 702}
687EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
688
689/**
690 * on_each_cpu_mask(): Run a function on processors specified by
691 * cpumask, which may include the local processor.
692 * @mask: The set of cpus to run on (only runs on online subset).
693 * @func: The function to run. This must be fast and non-blocking.
694 * @info: An arbitrary pointer to pass to the function.
695 * @wait: If true, wait (atomically) until function has completed
696 * on other CPUs.
697 *
698 * If @wait is true, then returns once @func has returned.
699 *
700 * You must not call this function with disabled interrupts or
701 * from a hardware interrupt handler or from a bottom half handler.
702 */
703void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
704 void *info, bool wait)
705{
706 int cpu = get_cpu();
707
708 smp_call_function_many(mask, func, info, wait);
709 if (cpumask_test_cpu(cpu, mask)) {
710 local_irq_disable();
711 func(info);
712 local_irq_enable();
713 }
714 put_cpu();
715}
716EXPORT_SYMBOL(on_each_cpu_mask);
717
718/*
719 * on_each_cpu_cond(): Call a function on each processor for which
720 * the supplied function cond_func returns true, optionally waiting
721 * for all the required CPUs to finish. This may include the local
722 * processor.
723 * @cond_func: A callback function that is passed a cpu id and
 724 * the info parameter. The function is called
 725 * with preemption disabled. The function should
 726 * return a boolean value indicating whether to IPI
727 * the specified CPU.
728 * @func: The function to run on all applicable CPUs.
729 * This must be fast and non-blocking.
730 * @info: An arbitrary pointer to pass to both functions.
731 * @wait: If true, wait (atomically) until function has
732 * completed on other CPUs.
733 * @gfp_flags: GFP flags to use when allocating the cpumask
734 * used internally by the function.
735 *
 736 * The function might sleep if the GFP flags indicate a non
737 * atomic allocation is allowed.
738 *
739 * Preemption is disabled to protect against CPUs going offline but not online.
740 * CPUs going online during the call will not be seen or sent an IPI.
741 *
742 * You must not call this function with disabled interrupts or
743 * from a hardware interrupt handler or from a bottom half handler.
744 */
745void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
746 smp_call_func_t func, void *info, bool wait,
747 gfp_t gfp_flags)
748{
749 cpumask_var_t cpus;
750 int cpu, ret;
751
752 might_sleep_if(gfp_flags & __GFP_WAIT);
753
754 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
755 preempt_disable();
756 for_each_online_cpu(cpu)
757 if (cond_func(cpu, info))
758 cpumask_set_cpu(cpu, cpus);
759 on_each_cpu_mask(cpus, func, info, wait);
760 preempt_enable();
761 free_cpumask_var(cpus);
762 } else {
763 /*
764 * No free cpumask, bother. No matter, we'll
765 * just have to IPI them one by one.
766 */
767 preempt_disable();
768 for_each_online_cpu(cpu)
769 if (cond_func(cpu, info)) {
770 ret = smp_call_function_single(cpu, func,
771 info, wait);
772 WARN_ON_ONCE(!ret);
773 }
774 preempt_enable();
775 }
776}
777EXPORT_SYMBOL(on_each_cpu_cond);
778
779static void do_nothing(void *unused)
780{
781}
782
783/**
784 * kick_all_cpus_sync - Force all cpus out of idle
785 *
786 * Used to synchronize the update of pm_idle function pointer. It's
787 * called after the pointer is updated and returns after the dummy
788 * callback function has been executed on all cpus. The execution of
789 * the function can only happen on the remote cpus after they have
790 * left the idle function which had been called via pm_idle function
791 * pointer. So it's guaranteed that nothing uses the previous pointer
792 * anymore.
793 */
794void kick_all_cpus_sync(void)
795{
796 /* Make sure the change is visible before we kick the cpus */
797 smp_mb();
798 smp_call_function(do_nothing, NULL, 1);
799}
800EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
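
The smp.c hunks above remove the on_each_cpu_mask(), on_each_cpu_cond() and kick_all_cpus_sync() helpers whose kernel-doc is quoted in the deleted lines. A minimal caller sketch of the removed on_each_cpu_mask() interface (not part of this patch; flush_local_cache() is a made-up callback name):

/*
 * Sketch only: run a fast, non-blocking callback on a set of CPUs and
 * wait for it to finish everywhere, using the interface removed above.
 */
#include <linux/cpumask.h>
#include <linux/smp.h>

static void flush_local_cache(void *info)
{
    /* runs on every selected CPU, in IPI context on the remote ones */
}

static void flush_caches_on(const struct cpumask *cpus)
{
    /* wait == true: return only after every targeted CPU ran the callback */
    on_each_cpu_mask(cpus, flush_local_cache, NULL, true);
}
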
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
deleted file mode 100644
index d6c5fc05424..00000000000
--- a/kernel/smpboot.c
+++ /dev/null
@@ -1,300 +0,0 @@
1/*
2 * Common SMP CPU bringup/teardown functions
3 */
4#include <linux/cpu.h>
5#include <linux/err.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/list.h>
9#include <linux/slab.h>
10#include <linux/sched.h>
11#include <linux/export.h>
12#include <linux/percpu.h>
13#include <linux/kthread.h>
14#include <linux/smpboot.h>
15
16#include "smpboot.h"
17
18#ifdef CONFIG_SMP
19
20#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
21/*
22 * For the hotplug case we keep the task structs around and reuse
23 * them.
24 */
25static DEFINE_PER_CPU(struct task_struct *, idle_threads);
26
27struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
28{
29 struct task_struct *tsk = per_cpu(idle_threads, cpu);
30
31 if (!tsk)
32 return ERR_PTR(-ENOMEM);
33 init_idle(tsk, cpu);
34 return tsk;
35}
36
37void __init idle_thread_set_boot_cpu(void)
38{
39 per_cpu(idle_threads, smp_processor_id()) = current;
40}
41
42/**
43 * idle_init - Initialize the idle thread for a cpu
44 * @cpu: The cpu for which the idle thread should be initialized
45 *
46 * Creates the thread if it does not exist.
47 */
48static inline void idle_init(unsigned int cpu)
49{
50 struct task_struct *tsk = per_cpu(idle_threads, cpu);
51
52 if (!tsk) {
53 tsk = fork_idle(cpu);
54 if (IS_ERR(tsk))
55 pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
56 else
57 per_cpu(idle_threads, cpu) = tsk;
58 }
59}
60
61/**
62 * idle_threads_init - Initialize idle threads for all cpus
63 */
64void __init idle_threads_init(void)
65{
66 unsigned int cpu, boot_cpu;
67
68 boot_cpu = smp_processor_id();
69
70 for_each_possible_cpu(cpu) {
71 if (cpu != boot_cpu)
72 idle_init(cpu);
73 }
74}
75#endif
76
77#endif /* #ifdef CONFIG_SMP */
78
79static LIST_HEAD(hotplug_threads);
80static DEFINE_MUTEX(smpboot_threads_lock);
81
82struct smpboot_thread_data {
83 unsigned int cpu;
84 unsigned int status;
85 struct smp_hotplug_thread *ht;
86};
87
88enum {
89 HP_THREAD_NONE = 0,
90 HP_THREAD_ACTIVE,
91 HP_THREAD_PARKED,
92};
93
94/**
95 * smpboot_thread_fn - percpu hotplug thread loop function
96 * @data: thread data pointer
97 *
98 * Checks for thread stop and park conditions. Calls the necessary
99 * setup, cleanup, park and unpark functions for the registered
100 * thread.
101 *
102 * Returns 1 when the thread should exit, 0 otherwise.
103 */
104static int smpboot_thread_fn(void *data)
105{
106 struct smpboot_thread_data *td = data;
107 struct smp_hotplug_thread *ht = td->ht;
108
109 while (1) {
110 set_current_state(TASK_INTERRUPTIBLE);
111 preempt_disable();
112 if (kthread_should_stop()) {
113 set_current_state(TASK_RUNNING);
114 preempt_enable();
115 if (ht->cleanup)
116 ht->cleanup(td->cpu, cpu_online(td->cpu));
117 kfree(td);
118 return 0;
119 }
120
121 if (kthread_should_park()) {
122 __set_current_state(TASK_RUNNING);
123 preempt_enable();
124 if (ht->park && td->status == HP_THREAD_ACTIVE) {
125 BUG_ON(td->cpu != smp_processor_id());
126 ht->park(td->cpu);
127 td->status = HP_THREAD_PARKED;
128 }
129 kthread_parkme();
130 /* We might have been woken for stop */
131 continue;
132 }
133
134 BUG_ON(td->cpu != smp_processor_id());
135
136 /* Check for state change setup */
137 switch (td->status) {
138 case HP_THREAD_NONE:
139 preempt_enable();
140 if (ht->setup)
141 ht->setup(td->cpu);
142 td->status = HP_THREAD_ACTIVE;
143 preempt_disable();
144 break;
145 case HP_THREAD_PARKED:
146 preempt_enable();
147 if (ht->unpark)
148 ht->unpark(td->cpu);
149 td->status = HP_THREAD_ACTIVE;
150 preempt_disable();
151 break;
152 }
153
154 if (!ht->thread_should_run(td->cpu)) {
155 preempt_enable();
156 schedule();
157 } else {
158 set_current_state(TASK_RUNNING);
159 preempt_enable();
160 ht->thread_fn(td->cpu);
161 }
162 }
163}
164
165static int
166__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
167{
168 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
169 struct smpboot_thread_data *td;
170
171 if (tsk)
172 return 0;
173
174 td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
175 if (!td)
176 return -ENOMEM;
177 td->cpu = cpu;
178 td->ht = ht;
179
180 tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
181 ht->thread_comm);
182 if (IS_ERR(tsk)) {
183 kfree(td);
184 return PTR_ERR(tsk);
185 }
186
187 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk;
189 return 0;
190}
191
192int smpboot_create_threads(unsigned int cpu)
193{
194 struct smp_hotplug_thread *cur;
195 int ret = 0;
196
197 mutex_lock(&smpboot_threads_lock);
198 list_for_each_entry(cur, &hotplug_threads, list) {
199 ret = __smpboot_create_thread(cur, cpu);
200 if (ret)
201 break;
202 }
203 mutex_unlock(&smpboot_threads_lock);
204 return ret;
205}
206
207static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
208{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210
211 kthread_unpark(tsk);
212}
213
214void smpboot_unpark_threads(unsigned int cpu)
215{
216 struct smp_hotplug_thread *cur;
217
218 mutex_lock(&smpboot_threads_lock);
219 list_for_each_entry(cur, &hotplug_threads, list)
220 smpboot_unpark_thread(cur, cpu);
221 mutex_unlock(&smpboot_threads_lock);
222}
223
224static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227
228 if (tsk)
229 kthread_park(tsk);
230}
231
232void smpboot_park_threads(unsigned int cpu)
233{
234 struct smp_hotplug_thread *cur;
235
236 mutex_lock(&smpboot_threads_lock);
237 list_for_each_entry_reverse(cur, &hotplug_threads, list)
238 smpboot_park_thread(cur, cpu);
239 mutex_unlock(&smpboot_threads_lock);
240}
241
242static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
243{
244 unsigned int cpu;
245
 246 /* We also need to destroy the parked threads of offline cpus */
247 for_each_possible_cpu(cpu) {
248 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
249
250 if (tsk) {
251 kthread_stop(tsk);
252 put_task_struct(tsk);
253 *per_cpu_ptr(ht->store, cpu) = NULL;
254 }
255 }
256}
257
258/**
259 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
260 * @plug_thread: Hotplug thread descriptor
261 *
262 * Creates and starts the threads on all online cpus.
263 */
264int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
265{
266 unsigned int cpu;
267 int ret = 0;
268
269 mutex_lock(&smpboot_threads_lock);
270 for_each_online_cpu(cpu) {
271 ret = __smpboot_create_thread(plug_thread, cpu);
272 if (ret) {
273 smpboot_destroy_threads(plug_thread);
274 goto out;
275 }
276 smpboot_unpark_thread(plug_thread, cpu);
277 }
278 list_add(&plug_thread->list, &hotplug_threads);
279out:
280 mutex_unlock(&smpboot_threads_lock);
281 return ret;
282}
283EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
284
285/**
286 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
287 * @plug_thread: Hotplug thread descriptor
288 *
289 * Stops all threads on all possible cpus.
290 */
291void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
292{
293 get_online_cpus();
294 mutex_lock(&smpboot_threads_lock);
295 list_del(&plug_thread->list);
296 smpboot_destroy_threads(plug_thread);
297 mutex_unlock(&smpboot_threads_lock);
298 put_online_cpus();
299}
300EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
deleted file mode 100644
index 72415a0eb95..00000000000
--- a/kernel/smpboot.h
+++ /dev/null
@@ -1,20 +0,0 @@
1#ifndef SMPBOOT_H
2#define SMPBOOT_H
3
4struct task_struct;
5
6#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
7struct task_struct *idle_thread_get(unsigned int cpu);
8void idle_thread_set_boot_cpu(void);
9void idle_threads_init(void);
10#else
11static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
12static inline void idle_thread_set_boot_cpu(void) { }
13static inline void idle_threads_init(void) { }
14#endif
15
16int smpboot_create_threads(unsigned int cpu);
17void smpboot_park_threads(unsigned int cpu);
18void smpboot_unpark_threads(unsigned int cpu);
19
20#endif
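
The deleted kernel/smpboot.c and kernel/smpboot.h provided the smp_hotplug_thread registration API used by the softirq_threads conversion reverted further down. A minimal sketch of that registration pattern (not part of this patch; the field names follow the softirq_threads user below, and all demo_* names are made up):

/*
 * Sketch only: register a per-cpu hotplug thread with the API served by
 * the deleted kernel/smpboot.c.
 */
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, demo_thread);

static int demo_should_run(unsigned int cpu)
{
    return 0;    /* nonzero when there is per-cpu work pending */
}

static void demo_fn(unsigned int cpu)
{
    /* handle one batch of per-cpu work; preemption is enabled here */
}

static struct smp_hotplug_thread demo_threads = {
    .store             = &demo_thread,
    .thread_should_run = demo_should_run,
    .thread_fn         = demo_fn,
    .thread_comm       = "demo/%u",
};

static int __init demo_init(void)
{
    /* creates, parks and unparks the per-cpu thread as CPUs come and go */
    return smpboot_register_percpu_thread(&demo_threads);
}
early_initcall(demo_init);
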
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe78..fca82c32042 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13#include <linux/export.h> 13#include <linux/module.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/init.h> 16#include <linux/init.h>
@@ -23,7 +23,6 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smpboot.h>
27#include <linux/tick.h> 26#include <linux/tick.h>
28 27
29#define CREATE_TRACE_POINTS 28#define CREATE_TRACE_POINTS
@@ -211,17 +210,9 @@ asmlinkage void __do_softirq(void)
211 __u32 pending; 210 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
213 int cpu; 212 int cpu;
214 unsigned long old_flags = current->flags;
215
216 /*
 217 * Mask out PF_MEMALLOC, as the current task context is borrowed for the
 218 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
219 * again if the socket is related to swap
220 */
221 current->flags &= ~PF_MEMALLOC;
222 213
223 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
224 vtime_account_irq_enter(current); 215 account_system_vtime(current);
225 216
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 218 SOFTIRQ_OFFSET);
@@ -272,9 +263,8 @@ restart:
272 263
273 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
274 265
275 vtime_account_irq_exit(current); 266 account_system_vtime(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 267 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 268}
279 269
280#ifndef __ARCH_HAS_DO_SOFTIRQ 270#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -307,7 +297,7 @@ void irq_enter(void)
307 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
308 298
309 rcu_irq_enter(); 299 rcu_irq_enter();
310 if (is_idle_task(current) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
311 /* 301 /*
312 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
313 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
@@ -320,40 +310,50 @@ void irq_enter(void)
320 __irq_enter(); 310 __irq_enter();
321} 311}
322 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
323static inline void invoke_softirq(void) 314static inline void invoke_softirq(void)
324{ 315{
325 if (!force_irqthreads) { 316 if (!force_irqthreads)
326#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
327 __do_softirq(); 317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
328#else 325#else
326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq(); 329 do_softirq();
330#endif 330 else {
331 } else {
332 __local_bh_disable((unsigned long)__builtin_return_address(0), 331 __local_bh_disable((unsigned long)__builtin_return_address(0),
333 SOFTIRQ_OFFSET); 332 SOFTIRQ_OFFSET);
334 wakeup_softirqd(); 333 wakeup_softirqd();
335 __local_bh_enable(SOFTIRQ_OFFSET); 334 __local_bh_enable(SOFTIRQ_OFFSET);
336 } 335 }
337} 336}
337#endif
338 338
339/* 339/*
340 * Exit an interrupt context. Process softirqs if needed and possible: 340 * Exit an interrupt context. Process softirqs if needed and possible:
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 vtime_account_irq_exit(current); 344 account_system_vtime(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
350#ifdef CONFIG_NO_HZ 351#ifdef CONFIG_NO_HZ
351 /* Make sure that timer wheel updates are propagated */ 352 /* Make sure that timer wheel updates are propagated */
352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
353 tick_nohz_irq_exit(); 354 tick_nohz_stop_sched_tick(0);
354#endif 355#endif
355 rcu_irq_exit(); 356 preempt_enable_no_resched();
356 sched_preempt_enable_no_resched();
357} 357}
358 358
359/* 359/*
@@ -385,12 +385,6 @@ void raise_softirq(unsigned int nr)
385 local_irq_restore(flags); 385 local_irq_restore(flags);
386} 386}
387 387
388void __raise_softirq_irqoff(unsigned int nr)
389{
390 trace_softirq_raise(nr);
391 or_softirq_pending(1UL << nr);
392}
393
394void open_softirq(int nr, void (*action)(struct softirq_action *)) 388void open_softirq(int nr, void (*action)(struct softirq_action *))
395{ 389{
396 softirq_vec[nr].action = action; 390 softirq_vec[nr].action = action;
@@ -743,22 +737,51 @@ void __init softirq_init(void)
743 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 737 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
744} 738}
745 739
746static int ksoftirqd_should_run(unsigned int cpu) 740static int run_ksoftirqd(void * __bind_cpu)
747{ 741{
748 return local_softirq_pending(); 742 set_current_state(TASK_INTERRUPTIBLE);
749}
750 743
751static void run_ksoftirqd(unsigned int cpu) 744 while (!kthread_should_stop()) {
752{ 745 preempt_disable();
753 local_irq_disable(); 746 if (!local_softirq_pending()) {
754 if (local_softirq_pending()) { 747 preempt_enable_no_resched();
755 __do_softirq(); 748 schedule();
756 rcu_note_context_switch(cpu); 749 preempt_disable();
757 local_irq_enable(); 750 }
758 cond_resched(); 751
759 return; 752 __set_current_state(TASK_RUNNING);
753
754 while (local_softirq_pending()) {
755 /* Preempt disable stops cpu going offline.
756 If already offline, we'll be on wrong CPU:
757 don't process */
758 if (cpu_is_offline((long)__bind_cpu))
759 goto wait_to_die;
760 local_irq_disable();
761 if (local_softirq_pending())
762 __do_softirq();
763 local_irq_enable();
764 preempt_enable_no_resched();
765 cond_resched();
766 preempt_disable();
767 rcu_note_context_switch((long)__bind_cpu);
768 }
769 preempt_enable();
770 set_current_state(TASK_INTERRUPTIBLE);
760 } 771 }
761 local_irq_enable(); 772 __set_current_state(TASK_RUNNING);
773 return 0;
774
775wait_to_die:
776 preempt_enable();
777 /* Wait for kthread_stop */
778 set_current_state(TASK_INTERRUPTIBLE);
779 while (!kthread_should_stop()) {
780 schedule();
781 set_current_state(TASK_INTERRUPTIBLE);
782 }
783 __set_current_state(TASK_RUNNING);
784 return 0;
762} 785}
763 786
764#ifdef CONFIG_HOTPLUG_CPU 787#ifdef CONFIG_HOTPLUG_CPU
@@ -824,14 +847,50 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
824 unsigned long action, 847 unsigned long action,
825 void *hcpu) 848 void *hcpu)
826{ 849{
850 int hotcpu = (unsigned long)hcpu;
851 struct task_struct *p;
852
827 switch (action) { 853 switch (action) {
854 case CPU_UP_PREPARE:
855 case CPU_UP_PREPARE_FROZEN:
856 p = kthread_create_on_node(run_ksoftirqd,
857 hcpu,
858 cpu_to_node(hotcpu),
859 "ksoftirqd/%d", hotcpu);
860 if (IS_ERR(p)) {
861 printk("ksoftirqd for %i failed\n", hotcpu);
862 return notifier_from_errno(PTR_ERR(p));
863 }
864 kthread_bind(p, hotcpu);
865 per_cpu(ksoftirqd, hotcpu) = p;
866 break;
867 case CPU_ONLINE:
868 case CPU_ONLINE_FROZEN:
869 wake_up_process(per_cpu(ksoftirqd, hotcpu));
870 break;
828#ifdef CONFIG_HOTPLUG_CPU 871#ifdef CONFIG_HOTPLUG_CPU
872 case CPU_UP_CANCELED:
873 case CPU_UP_CANCELED_FROZEN:
874 if (!per_cpu(ksoftirqd, hotcpu))
875 break;
876 /* Unbind so it can run. Fall thru. */
877 kthread_bind(per_cpu(ksoftirqd, hotcpu),
878 cpumask_any(cpu_online_mask));
829 case CPU_DEAD: 879 case CPU_DEAD:
830 case CPU_DEAD_FROZEN: 880 case CPU_DEAD_FROZEN: {
831 takeover_tasklets((unsigned long)hcpu); 881 static const struct sched_param param = {
882 .sched_priority = MAX_RT_PRIO-1
883 };
884
885 p = per_cpu(ksoftirqd, hotcpu);
886 per_cpu(ksoftirqd, hotcpu) = NULL;
887 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
888 kthread_stop(p);
889 takeover_tasklets(hotcpu);
832 break; 890 break;
833#endif /* CONFIG_HOTPLUG_CPU */
834 } 891 }
892#endif /* CONFIG_HOTPLUG_CPU */
893 }
835 return NOTIFY_OK; 894 return NOTIFY_OK;
836} 895}
837 896
@@ -839,19 +898,14 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
839 .notifier_call = cpu_callback 898 .notifier_call = cpu_callback
840}; 899};
841 900
842static struct smp_hotplug_thread softirq_threads = {
843 .store = &ksoftirqd,
844 .thread_should_run = ksoftirqd_should_run,
845 .thread_fn = run_ksoftirqd,
846 .thread_comm = "ksoftirqd/%u",
847};
848
849static __init int spawn_ksoftirqd(void) 901static __init int spawn_ksoftirqd(void)
850{ 902{
851 register_cpu_notifier(&cpu_nfb); 903 void *cpu = (void *)(long)smp_processor_id();
852 904 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
853 BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
854 905
906 BUG_ON(err != NOTIFY_OK);
907 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
908 register_cpu_notifier(&cpu_nfb);
855 return 0; 909 return 0;
856} 910}
857early_initcall(spawn_ksoftirqd); 911early_initcall(spawn_ksoftirqd);
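
With the smpboot conversion gone, the right-hand column takes ksoftirqd back to the classic per-CPU kthread plus CPU-hotplug-notifier scheme. A bare skeleton of that notifier pattern (not part of this patch; all demo_* names are made up):

/*
 * Sketch only: the pre-smpboot hotplug-notifier shape that the restored
 * cpu_callback()/spawn_ksoftirqd() code above follows.
 */
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int demo_cpu_callback(struct notifier_block *nfb,
                             unsigned long action, void *hcpu)
{
    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_UP_PREPARE:
        /* allocate or create per-cpu resources for cpu (unsigned long)hcpu */
        break;
    case CPU_ONLINE:
        /* start using them */
        break;
    case CPU_DEAD:
        /* tear them down again */
        break;
    }
    return NOTIFY_OK;
}

static struct notifier_block demo_cpu_nfb = {
    .notifier_call = demo_cpu_callback,
};

static int __init demo_hotplug_init(void)
{
    register_cpu_notifier(&demo_cpu_nfb);
    return 0;
}
early_initcall(demo_hotplug_init);
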
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5cdd8065a3c..be6517fb9c1 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/export.h> 22#include <linux/module.h>
23 23
24/* 24/*
25 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc..73ce23feaea 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,17 +16,15 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 * 19 *
21 * Author: Paul McKenney <paulmck@us.ibm.com> 20 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 21 *
24 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 23 * Documentation/RCU/ *.txt
26 * 24 *
27 */ 25 */
28 26
29#include <linux/export.h> 27#include <linux/module.h>
30#include <linux/mutex.h> 28#include <linux/mutex.h>
31#include <linux/percpu.h> 29#include <linux/percpu.h>
32#include <linux/preempt.h> 30#include <linux/preempt.h>
@@ -36,78 +34,10 @@
36#include <linux/delay.h> 34#include <linux/delay.h>
37#include <linux/srcu.h> 35#include <linux/srcu.h>
38 36
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
43/*
44 * Initialize an rcu_batch structure to empty.
45 */
46static inline void rcu_batch_init(struct rcu_batch *b)
47{
48 b->head = NULL;
49 b->tail = &b->head;
50}
51
52/*
53 * Enqueue a callback onto the tail of the specified rcu_batch structure.
54 */
55static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
56{
57 *b->tail = head;
58 b->tail = &head->next;
59}
60
61/*
62 * Is the specified rcu_batch structure empty?
63 */
64static inline bool rcu_batch_empty(struct rcu_batch *b)
65{
66 return b->tail == &b->head;
67}
68
69/*
70 * Remove the callback at the head of the specified rcu_batch structure
71 * and return a pointer to it, or return NULL if the structure is empty.
72 */
73static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
74{
75 struct rcu_head *head;
76
77 if (rcu_batch_empty(b))
78 return NULL;
79
80 head = b->head;
81 b->head = head->next;
82 if (b->tail == &head->next)
83 rcu_batch_init(b);
84
85 return head;
86}
87
88/*
89 * Move all callbacks from the rcu_batch structure specified by "from" to
90 * the structure specified by "to".
91 */
92static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
93{
94 if (!rcu_batch_empty(from)) {
95 *to->tail = from->head;
96 to->tail = from->tail;
97 rcu_batch_init(from);
98 }
99}
100
101static int init_srcu_struct_fields(struct srcu_struct *sp) 37static int init_srcu_struct_fields(struct srcu_struct *sp)
102{ 38{
103 sp->completed = 0; 39 sp->completed = 0;
104 spin_lock_init(&sp->queue_lock); 40 mutex_init(&sp->mutex);
105 sp->running = false;
106 rcu_batch_init(&sp->batch_queue);
107 rcu_batch_init(&sp->batch_check0);
108 rcu_batch_init(&sp->batch_check1);
109 rcu_batch_init(&sp->batch_done);
110 INIT_DELAYED_WORK(&sp->work, process_srcu);
111 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); 41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
112 return sp->per_cpu_ref ? 0 : -ENOMEM; 42 return sp->per_cpu_ref ? 0 : -ENOMEM;
113} 43}
@@ -143,116 +73,21 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
143#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
144 74
145/* 75/*
146 * Returns approximate total of the readers' ->seq[] values for the 76 * srcu_readers_active_idx -- returns approximate number of readers
147 * rank of per-CPU counters specified by idx. 77 * active on the specified rank of per-CPU counters.
148 */ 78 */
149static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
150{
151 int cpu;
152 unsigned long sum = 0;
153 unsigned long t;
154
155 for_each_possible_cpu(cpu) {
156 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
157 sum += t;
158 }
159 return sum;
160}
161 79
162/* 80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
163 * Returns approximate number of readers active on the specified rank
164 * of the per-CPU ->c[] counters.
165 */
166static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
167{ 81{
168 int cpu; 82 int cpu;
169 unsigned long sum = 0; 83 int sum;
170 unsigned long t;
171 84
172 for_each_possible_cpu(cpu) { 85 sum = 0;
173 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); 86 for_each_possible_cpu(cpu)
174 sum += t; 87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
175 }
176 return sum; 88 return sum;
177} 89}
178 90
179/*
180 * Return true if the number of pre-existing readers is determined to
181 * be stably zero. An example unstable zero can occur if the call
182 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
183 * but due to task migration, sees the corresponding __srcu_read_unlock()
184 * decrement. This can happen because srcu_readers_active_idx() takes
185 * time to sum the array, and might in fact be interrupted or preempted
186 * partway through the summation.
187 */
188static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
189{
190 unsigned long seq;
191
192 seq = srcu_readers_seq_idx(sp, idx);
193
194 /*
195 * The following smp_mb() A pairs with the smp_mb() B located in
196 * __srcu_read_lock(). This pairing ensures that if an
197 * __srcu_read_lock() increments its counter after the summation
198 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
199 * critical section will see any changes made prior to the start
200 * of the current SRCU grace period.
201 *
202 * Also, if the above call to srcu_readers_seq_idx() saw the
203 * increment of ->seq[], then the call to srcu_readers_active_idx()
204 * must see the increment of ->c[].
205 */
206 smp_mb(); /* A */
207
208 /*
209 * Note that srcu_readers_active_idx() can incorrectly return
210 * zero even though there is a pre-existing reader throughout.
211 * To see this, suppose that task A is in a very long SRCU
212 * read-side critical section that started on CPU 0, and that
213 * no other reader exists, so that the sum of the counters
214 * is equal to one. Then suppose that task B starts executing
215 * srcu_readers_active_idx(), summing up to CPU 1, and then that
216 * task C starts reading on CPU 0, so that its increment is not
217 * summed, but finishes reading on CPU 2, so that its decrement
218 * -is- summed. Then when task B completes its sum, it will
219 * incorrectly get zero, despite the fact that task A has been
220 * in its SRCU read-side critical section the whole time.
221 *
222 * We therefore do a validation step should srcu_readers_active_idx()
223 * return zero.
224 */
225 if (srcu_readers_active_idx(sp, idx) != 0)
226 return false;
227
228 /*
229 * The remainder of this function is the validation step.
230 * The following smp_mb() D pairs with the smp_mb() C in
231 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
232 * by srcu_readers_active_idx() above, then any destructive
233 * operation performed after the grace period will happen after
234 * the corresponding SRCU read-side critical section.
235 *
236 * Note that there can be at most NR_CPUS worth of readers using
237 * the old index, which is not enough to overflow even a 32-bit
238 * integer. (Yes, this does mean that systems having more than
239 * a billion or so CPUs need to be 64-bit systems.) Therefore,
240 * the sum of the ->seq[] counters cannot possibly overflow.
241 * Therefore, the only way that the return values of the two
242 * calls to srcu_readers_seq_idx() can be equal is if there were
243 * no increments of the corresponding rank of ->seq[] counts
244 * in the interim. But the missed-increment scenario laid out
245 * above includes an increment of the ->seq[] counter by
246 * the corresponding __srcu_read_lock(). Therefore, if this
247 * scenario occurs, the return values from the two calls to
248 * srcu_readers_seq_idx() will differ, and thus the validation
249 * step below suffices.
250 */
251 smp_mb(); /* D */
252
253 return srcu_readers_seq_idx(sp, idx) == seq;
254}
255
256/** 91/**
257 * srcu_readers_active - returns approximate number of readers. 92 * srcu_readers_active - returns approximate number of readers.
258 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 93 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
@@ -263,14 +98,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
263 */ 98 */
264static int srcu_readers_active(struct srcu_struct *sp) 99static int srcu_readers_active(struct srcu_struct *sp)
265{ 100{
266 int cpu; 101 return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
267 unsigned long sum = 0;
268
269 for_each_possible_cpu(cpu) {
270 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
271 sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
272 }
273 return sum;
274} 102}
275 103
276/** 104/**
@@ -303,11 +131,10 @@ int __srcu_read_lock(struct srcu_struct *sp)
303 int idx; 131 int idx;
304 132
305 preempt_disable(); 133 preempt_disable();
306 idx = rcu_dereference_index_check(sp->completed, 134 idx = sp->completed & 0x1;
307 rcu_read_lock_sched_held()) & 0x1; 135 barrier(); /* ensure compiler looks -once- at sp->completed. */
308 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
309 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 137 srcu_barrier(); /* ensure compiler won't misorder critical section. */
310 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
311 preempt_enable(); 138 preempt_enable();
312 return idx; 139 return idx;
313} 140}
@@ -322,8 +149,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
322void __srcu_read_unlock(struct srcu_struct *sp, int idx) 149void __srcu_read_unlock(struct srcu_struct *sp, int idx)
323{ 150{
324 preempt_disable(); 151 preempt_disable();
325 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 152 srcu_barrier(); /* ensure compiler won't misorder critical section. */
326 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
327 preempt_enable(); 154 preempt_enable();
328} 155}
329EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -336,119 +163,100 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
336 * we repeatedly block for 1-millisecond time periods. This approach 163 * we repeatedly block for 1-millisecond time periods. This approach
337 * has done well in testing, so there is no need for a config parameter. 164 * has done well in testing, so there is no need for a config parameter.
338 */ 165 */
339#define SRCU_RETRY_CHECK_DELAY 5 166#define SYNCHRONIZE_SRCU_READER_DELAY 10
340#define SYNCHRONIZE_SRCU_TRYCOUNT 2
341#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
342 167
343/* 168/*
344 * @@@ Wait until all pre-existing readers complete. Such readers 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
345 * will have used the index specified by "idx".
 346 * The caller should ensure that ->completed is not changed while checking,
 347 * and that idx = (->completed & 1) ^ 1.
348 */ 170 */
349static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
350{ 172{
351 for (;;) { 173 int idx;
352 if (srcu_readers_active_idx_check(sp, idx))
353 return true;
354 if (--trycount <= 0)
355 return false;
356 udelay(SRCU_RETRY_CHECK_DELAY);
357 }
358}
359 174
360/* 175 idx = sp->completed;
361 * Increment the ->completed counter so that future SRCU readers will 176 mutex_lock(&sp->mutex);
362 * use the other rank of the ->c[] and ->seq[] arrays. This allows
363 * us to wait for pre-existing readers in a starvation-free manner.
364 */
365static void srcu_flip(struct srcu_struct *sp)
366{
367 sp->completed++;
368}
369 177
370/* 178 /*
371 * Enqueue an SRCU callback on the specified srcu_struct structure, 179 * Check to see if someone else did the work for us while we were
372 * initiating grace-period processing if it is not already running. 180 * waiting to acquire the lock. We need -two- advances of
373 */ 181 * the counter, not just one. If there was but one, we might have
374void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 182 * shown up -after- our helper's first synchronize_sched(), thus
375 void (*func)(struct rcu_head *head)) 183 * having failed to prevent CPU-reordering races with concurrent
376{ 184 * srcu_read_unlock()s on other CPUs (see comment below). So we
377 unsigned long flags; 185 * either (1) wait for two or (2) supply the second ourselves.
378 186 */
379 head->next = NULL; 187
380 head->func = func; 188 if ((sp->completed - idx) >= 2) {
381 spin_lock_irqsave(&sp->queue_lock, flags); 189 mutex_unlock(&sp->mutex);
382 rcu_batch_queue(&sp->batch_queue, head); 190 return;
383 if (!sp->running) {
384 sp->running = true;
385 schedule_delayed_work(&sp->work, 0);
386 } 191 }
387 spin_unlock_irqrestore(&sp->queue_lock, flags);
388}
389EXPORT_SYMBOL_GPL(call_srcu);
390 192
391struct rcu_synchronize { 193 sync_func(); /* Force memory barrier on all CPUs. */
392 struct rcu_head head;
393 struct completion completion;
394};
395 194
396/* 195 /*
397 * Awaken the corresponding synchronize_srcu() instance now that a 196 * The preceding synchronize_sched() ensures that any CPU that
398 * grace period has elapsed. 197 * sees the new value of sp->completed will also see any preceding
399 */ 198 * changes to data structures made by this CPU. This prevents
400static void wakeme_after_rcu(struct rcu_head *head) 199 * some other CPU from reordering the accesses in its SRCU
401{ 200 * read-side critical section to precede the corresponding
402 struct rcu_synchronize *rcu; 201 * srcu_read_lock() -- ensuring that such references will in
202 * fact be protected.
203 *
204 * So it is now safe to do the flip.
205 */
403 206
404 rcu = container_of(head, struct rcu_synchronize, head); 207 idx = sp->completed & 0x1;
405 complete(&rcu->completion); 208 sp->completed++;
406}
407 209
408static void srcu_advance_batches(struct srcu_struct *sp, int trycount); 210 sync_func(); /* Force memory barrier on all CPUs. */
409static void srcu_reschedule(struct srcu_struct *sp);
410 211
411/* 212 /*
412 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 213 * At this point, because of the preceding synchronize_sched(),
413 */ 214 * all srcu_read_lock() calls using the old counters have completed.
414static void __synchronize_srcu(struct srcu_struct *sp, int trycount) 215 * Their corresponding critical sections might well be still
415{ 216 * executing, but the srcu_read_lock() primitives themselves
416 struct rcu_synchronize rcu; 217 * will have finished executing. We initially give readers
417 struct rcu_head *head = &rcu.head; 218 * an arbitrarily chosen 10 microseconds to get out of their
418 bool done = false; 219 * SRCU read-side critical sections, then loop waiting 1/HZ
419 220 * seconds per iteration. The 10-microsecond value has done
420 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 221 * very well in testing.
421 !lock_is_held(&rcu_bh_lock_map) && 222 */
422 !lock_is_held(&rcu_lock_map) && 223
423 !lock_is_held(&rcu_sched_lock_map), 224 if (srcu_readers_active_idx(sp, idx))
424 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
425 226 while (srcu_readers_active_idx(sp, idx))
426 init_completion(&rcu.completion); 227 schedule_timeout_interruptible(1);
427 228
428 head->next = NULL; 229 sync_func(); /* Force memory barrier on all CPUs. */
429 head->func = wakeme_after_rcu;
430 spin_lock_irq(&sp->queue_lock);
431 if (!sp->running) {
432 /* steal the processing owner */
433 sp->running = true;
434 rcu_batch_queue(&sp->batch_check0, head);
435 spin_unlock_irq(&sp->queue_lock);
436
437 srcu_advance_batches(sp, trycount);
438 if (!rcu_batch_empty(&sp->batch_done)) {
439 BUG_ON(sp->batch_done.head != head);
440 rcu_batch_dequeue(&sp->batch_done);
441 done = true;
442 }
443 /* give the processing owner to work_struct */
444 srcu_reschedule(sp);
445 } else {
446 rcu_batch_queue(&sp->batch_queue, head);
447 spin_unlock_irq(&sp->queue_lock);
448 }
449 230
450 if (!done) 231 /*
451 wait_for_completion(&rcu.completion); 232 * The preceding synchronize_sched() forces all srcu_read_unlock()
233 * primitives that were executing concurrently with the preceding
234 * for_each_possible_cpu() loop to have completed by this point.
235 * More importantly, it also forces the corresponding SRCU read-side
236 * critical sections to have also completed, and the corresponding
237 * references to SRCU-protected data items to be dropped.
238 *
239 * Note:
240 *
241 * Despite what you might think at first glance, the
242 * preceding synchronize_sched() -must- be within the
243 * critical section ended by the following mutex_unlock().
244 * Otherwise, a task taking the early exit can race
245 * with a srcu_read_unlock(), which might have executed
246 * just before the preceding srcu_readers_active() check,
247 * and whose CPU might have reordered the srcu_read_unlock()
248 * with the preceding critical section. In this case, there
249 * is nothing preventing the synchronize_sched() task that is
250 * taking the early exit from freeing a data structure that
251 * is still being referenced (out of order) by the task
252 * doing the srcu_read_unlock().
253 *
254 * Alternatively, the comparison with "2" on the early exit
255 * could be changed to "3", but this increases synchronize_srcu()
256 * latency for bulk loads. So the current code is preferred.
257 */
258
259 mutex_unlock(&sp->mutex);
452} 260}
453 261
454/** 262/**
@@ -467,190 +275,41 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
467 */ 275 */
468void synchronize_srcu(struct srcu_struct *sp) 276void synchronize_srcu(struct srcu_struct *sp)
469{ 277{
470 __synchronize_srcu(sp, rcu_expedited 278 __synchronize_srcu(sp, synchronize_sched);
471 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
472 : SYNCHRONIZE_SRCU_TRYCOUNT);
473} 279}
474EXPORT_SYMBOL_GPL(synchronize_srcu); 280EXPORT_SYMBOL_GPL(synchronize_srcu);
475 281
476/** 282/**
477 * synchronize_srcu_expedited - Brute-force SRCU grace period 283 * synchronize_srcu_expedited - like synchronize_srcu, but less patient
478 * @sp: srcu_struct with which to synchronize. 284 * @sp: srcu_struct with which to synchronize.
479 * 285 *
480 * Wait for an SRCU grace period to elapse, but be more aggressive about 286 * Flip the completed counter, and wait for the old count to drain to zero.
481 * spinning rather than blocking when waiting. 287 * As with classic RCU, the updater must use some separate means of
288 * synchronizing concurrent updates. Can block; must be called from
289 * process context.
482 * 290 *
483 * Note that it is illegal to call this function while holding any lock 291 * Note that it is illegal to call synchronize_srcu_expedited()
484 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 292 * from the corresponding SRCU read-side critical section; doing so
485 * synchronize_srcu_expedited() from the corresponding SRCU read-side 293 * will result in deadlock. However, it is perfectly legal to call
486 * critical section; doing so will result in deadlock. However, it is 294 * synchronize_srcu_expedited() on one srcu_struct from some other
487 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 295 * srcu_struct's read-side critical section.
488 * from some other srcu_struct's read-side critical section, as long as
489 * the resulting graph of srcu_structs is acyclic.
490 */ 296 */
491void synchronize_srcu_expedited(struct srcu_struct *sp) 297void synchronize_srcu_expedited(struct srcu_struct *sp)
492{ 298{
493 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); 299 __synchronize_srcu(sp, synchronize_sched_expedited);
494} 300}
495EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 301EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
496 302
497/** 303/**
498 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
499 */
500void srcu_barrier(struct srcu_struct *sp)
501{
502 synchronize_srcu(sp);
503}
504EXPORT_SYMBOL_GPL(srcu_barrier);
505
506/**
507 * srcu_batches_completed - return batches completed. 304 * srcu_batches_completed - return batches completed.
508 * @sp: srcu_struct on which to report batch completion. 305 * @sp: srcu_struct on which to report batch completion.
509 * 306 *
510 * Report the number of batches, correlated with, but not necessarily 307 * Report the number of batches, correlated with, but not necessarily
511 * precisely the same as, the number of grace periods that have elapsed. 308 * precisely the same as, the number of grace periods that have elapsed.
512 */ 309 */
310
513long srcu_batches_completed(struct srcu_struct *sp) 311long srcu_batches_completed(struct srcu_struct *sp)
514{ 312{
515 return sp->completed; 313 return sp->completed;
516} 314}
517EXPORT_SYMBOL_GPL(srcu_batches_completed); 315EXPORT_SYMBOL_GPL(srcu_batches_completed);
518
519#define SRCU_CALLBACK_BATCH 10
520#define SRCU_INTERVAL 1
521
522/*
523 * Move any new SRCU callbacks to the first stage of the SRCU grace
524 * period pipeline.
525 */
526static void srcu_collect_new(struct srcu_struct *sp)
527{
528 if (!rcu_batch_empty(&sp->batch_queue)) {
529 spin_lock_irq(&sp->queue_lock);
530 rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
531 spin_unlock_irq(&sp->queue_lock);
532 }
533}
534
535/*
536 * Core SRCU state machine. Advance callbacks from ->batch_check0 to
537 * ->batch_check1 and then to ->batch_done as readers drain.
538 */
539static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
540{
541 int idx = 1 ^ (sp->completed & 1);
542
543 /*
544 * Because readers might be delayed for an extended period after
545 * fetching ->completed for their index, at any point in time there
546 * might well be readers using both idx=0 and idx=1. We therefore
547 * need to wait for readers to clear from both index values before
548 * invoking a callback.
549 */
550
551 if (rcu_batch_empty(&sp->batch_check0) &&
552 rcu_batch_empty(&sp->batch_check1))
553 return; /* no callbacks need to be advanced */
554
555 if (!try_check_zero(sp, idx, trycount))
556 return; /* failed to advance, will try after SRCU_INTERVAL */
557
558 /*
 559 * The callbacks in ->batch_check1 already did their first zero
 560 * check and flip back when they were enqueued on
561 * ->batch_check0 in a previous invocation of srcu_advance_batches().
562 * (Presumably try_check_zero() returned false during that
563 * invocation, leaving the callbacks stranded on ->batch_check1.)
564 * They are therefore ready to invoke, so move them to ->batch_done.
565 */
566 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
567
568 if (rcu_batch_empty(&sp->batch_check0))
569 return; /* no callbacks need to be advanced */
570 srcu_flip(sp);
571
572 /*
573 * The callbacks in ->batch_check0 just finished their
 574 * first zero check and flip, so move them to ->batch_check1
575 * for future checking on the other idx.
576 */
577 rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
578
579 /*
580 * SRCU read-side critical sections are normally short, so check
581 * at least twice in quick succession after a flip.
582 */
583 trycount = trycount < 2 ? 2 : trycount;
584 if (!try_check_zero(sp, idx^1, trycount))
585 return; /* failed to advance, will try after SRCU_INTERVAL */
586
587 /*
588 * The callbacks in ->batch_check1 have now waited for all
589 * pre-existing readers using both idx values. They are therefore
590 * ready to invoke, so move them to ->batch_done.
591 */
592 rcu_batch_move(&sp->batch_done, &sp->batch_check1);
593}
594
595/*
596 * Invoke a limited number of SRCU callbacks that have passed through
597 * their grace period. If there are more to do, SRCU will reschedule
598 * the workqueue.
599 */
600static void srcu_invoke_callbacks(struct srcu_struct *sp)
601{
602 int i;
603 struct rcu_head *head;
604
605 for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
606 head = rcu_batch_dequeue(&sp->batch_done);
607 if (!head)
608 break;
609 local_bh_disable();
610 head->func(head);
611 local_bh_enable();
612 }
613}
614
615/*
616 * Finished one round of SRCU grace period. Start another if there are
617 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
618 */
619static void srcu_reschedule(struct srcu_struct *sp)
620{
621 bool pending = true;
622
623 if (rcu_batch_empty(&sp->batch_done) &&
624 rcu_batch_empty(&sp->batch_check1) &&
625 rcu_batch_empty(&sp->batch_check0) &&
626 rcu_batch_empty(&sp->batch_queue)) {
627 spin_lock_irq(&sp->queue_lock);
628 if (rcu_batch_empty(&sp->batch_done) &&
629 rcu_batch_empty(&sp->batch_check1) &&
630 rcu_batch_empty(&sp->batch_check0) &&
631 rcu_batch_empty(&sp->batch_queue)) {
632 sp->running = false;
633 pending = false;
634 }
635 spin_unlock_irq(&sp->queue_lock);
636 }
637
638 if (pending)
639 schedule_delayed_work(&sp->work, SRCU_INTERVAL);
640}
641
642/*
643 * This is the work-queue function that handles SRCU grace periods.
644 */
645void process_srcu(struct work_struct *work)
646{
647 struct srcu_struct *sp;
648
649 sp = container_of(work, struct srcu_struct, work.work);
650
651 srcu_collect_new(sp);
652 srcu_advance_batches(sp, 1);
653 srcu_invoke_callbacks(sp);
654 srcu_reschedule(sp);
655}
656EXPORT_SYMBOL_GPL(process_srcu);
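
Both the removed call_srcu()-based implementation and the restored mutex-based one serve the same read-side primitives, __srcu_read_lock() and __srcu_read_unlock(), shown earlier in this file. A minimal reader/updater pairing as a caller would write it (not part of this patch; demo_* names are made up, and demo_srcu is assumed to have been initialized with init_srcu_struct()):

/*
 * Sketch only: basic SRCU usage. Readers are cheap and may sleep;
 * the updater waits for all pre-existing readers before freeing.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct demo_data {
    int value;
};

static struct srcu_struct demo_srcu;
static struct demo_data __rcu *demo_ptr;

static int demo_read(void)
{
    struct demo_data *p;
    int idx, val = -1;

    idx = srcu_read_lock(&demo_srcu);    /* returns the index to hand back */
    p = srcu_dereference(demo_ptr, &demo_srcu);
    if (p)
        val = p->value;
    srcu_read_unlock(&demo_srcu, idx);
    return val;
}

static void demo_update(struct demo_data *newp)
{
    struct demo_data *old = rcu_dereference_protected(demo_ptr, 1);

    rcu_assign_pointer(demo_ptr, newp);
    synchronize_srcu(&demo_srcu);        /* wait for all pre-existing readers */
    kfree(old);
}
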
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 00fe55cc5a8..d20c6983aad 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/export.h> 10#include <linux/module.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
13 13
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e96571..ba5070ce576 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/export.h> 15#include <linux/module.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
@@ -41,7 +41,6 @@ struct cpu_stopper {
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static bool stop_machine_initialized = false;
45 44
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 46{
@@ -387,8 +386,6 @@ static int __init cpu_stop_init(void)
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 386 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier); 387 register_cpu_notifier(&cpu_stop_cpu_notifier);
389 388
390 stop_machine_initialized = true;
391
392 return 0; 389 return 0;
393} 390}
394early_initcall(cpu_stop_init); 391early_initcall(cpu_stop_init);
@@ -488,25 +485,6 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
488 .num_threads = num_online_cpus(), 485 .num_threads = num_online_cpus(),
489 .active_cpus = cpus }; 486 .active_cpus = cpus };
490 487
491 if (!stop_machine_initialized) {
492 /*
493 * Handle the case where stop_machine() is called
494 * early in boot before stop_machine() has been
495 * initialized.
496 */
497 unsigned long flags;
498 int ret;
499
500 WARN_ON_ONCE(smdata.num_threads != 1);
501
502 local_irq_save(flags);
503 hard_irq_disable();
504 ret = (*fn)(data);
505 local_irq_restore(flags);
506
507 return ret;
508 }
509
510 /* Set the initial state and stop all online cpus. */ 488 /* Set the initial state and stop all online cpus. */
511 set_state(&smdata, STOPMACHINE_PREPARE); 489 set_state(&smdata, STOPMACHINE_PREPARE);
512 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 490 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
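
The hunk above strips the early-boot fallback that let __stop_machine() run the callback directly before cpu_stop_init() has registered the stopper threads. For context, a typical caller looks roughly like this (not part of this patch; demo_* names are made up):

/*
 * Sketch only: a stop_machine() user that needs the whole system quiescent
 * while it mutates shared state.
 */
#include <linux/stop_machine.h>

static int demo_patch_step(void *data)
{
    /*
     * Runs while every other online CPU spins with interrupts disabled,
     * so nothing can observe the intermediate state.
     */
    return 0;
}

static int demo_apply_patch(void)
{
    /* NULL cpumask: the callback runs on one CPU, the rest just spin */
    return stop_machine(demo_patch_step, NULL, NULL);
}
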
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b3769042..1dbbe695a5e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/export.h> 7#include <linux/module.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
@@ -12,7 +12,6 @@
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/kmod.h>
16#include <linux/perf_event.h> 15#include <linux/perf_event.h>
17#include <linux/resource.h> 16#include <linux/resource.h>
18#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -36,8 +35,6 @@
36#include <linux/personality.h> 35#include <linux/personality.h>
37#include <linux/ptrace.h> 36#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 37#include <linux/fs_struct.h>
39#include <linux/file.h>
40#include <linux/mount.h>
41#include <linux/gfp.h> 38#include <linux/gfp.h>
42#include <linux/syscore_ops.h> 39#include <linux/syscore_ops.h>
43#include <linux/version.h> 40#include <linux/version.h>
@@ -95,8 +92,10 @@
95int overflowuid = DEFAULT_OVERFLOWUID; 92int overflowuid = DEFAULT_OVERFLOWUID;
96int overflowgid = DEFAULT_OVERFLOWGID; 93int overflowgid = DEFAULT_OVERFLOWGID;
97 94
95#ifdef CONFIG_UID16
98EXPORT_SYMBOL(overflowuid); 96EXPORT_SYMBOL(overflowuid);
99EXPORT_SYMBOL(overflowgid); 97EXPORT_SYMBOL(overflowgid);
98#endif
100 99
101/* 100/*
102 * the same as above, but for filesystems which can only store a 16-bit 101 * the same as above, but for filesystems which can only store a 16-bit
@@ -133,10 +132,11 @@ static bool set_one_prio_perm(struct task_struct *p)
133{ 132{
134 const struct cred *cred = current_cred(), *pcred = __task_cred(p); 133 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
135 134
136 if (uid_eq(pcred->uid, cred->euid) || 135 if (pcred->user->user_ns == cred->user->user_ns &&
137 uid_eq(pcred->euid, cred->euid)) 136 (pcred->uid == cred->euid ||
137 pcred->euid == cred->euid))
138 return true; 138 return true;
139 if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) 139 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
140 return true; 140 return true;
141 return false; 141 return false;
142} 142}
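
This hunk trades the namespace-aware kuid_t comparisons (uid_eq()) back for raw uid_t '==' checks plus an explicit user_ns comparison. A minimal sketch of the kuid_t idiom on the left-hand side, with same_effective_owner() as an illustrative helper:

#include <linux/types.h>
#include <linux/cred.h>
#include <linux/uidgid.h>	/* kuid_t, uid_eq() */

/* Illustrative helper: namespace-aware comparison of effective uids.
 * A kuid_t already identifies the owning user namespace, so a single
 * uid_eq() replaces the "same user_ns && same uid_t" pair of checks. */
static bool same_effective_owner(const struct cred *a, const struct cred *b)
{
	return uid_eq(a->euid, b->euid);
}
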
@@ -176,7 +176,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
176 const struct cred *cred = current_cred(); 176 const struct cred *cred = current_cred();
177 int error = -EINVAL; 177 int error = -EINVAL;
178 struct pid *pgrp; 178 struct pid *pgrp;
179 kuid_t uid;
180 179
181 if (which > PRIO_USER || which < PRIO_PROCESS) 180 if (which > PRIO_USER || which < PRIO_PROCESS)
182 goto out; 181 goto out;
@@ -209,19 +208,18 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
209 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 208 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
210 break; 209 break;
211 case PRIO_USER: 210 case PRIO_USER:
212 uid = make_kuid(cred->user_ns, who); 211 user = (struct user_struct *) cred->user;
213 user = cred->user;
214 if (!who) 212 if (!who)
215 uid = cred->uid; 213 who = cred->uid;
216 else if (!uid_eq(uid, cred->uid) && 214 else if ((who != cred->uid) &&
217 !(user = find_user(uid))) 215 !(user = find_user(who)))
218 goto out_unlock; /* No processes for this user */ 216 goto out_unlock; /* No processes for this user */
219 217
220 do_each_thread(g, p) { 218 do_each_thread(g, p) {
221 if (uid_eq(task_uid(p), uid)) 219 if (__task_cred(p)->uid == who)
222 error = set_one_prio(p, niceval, error); 220 error = set_one_prio(p, niceval, error);
223 } while_each_thread(g, p); 221 } while_each_thread(g, p);
224 if (!uid_eq(uid, cred->uid)) 222 if (who != cred->uid)
225 free_uid(user); /* For find_user() */ 223 free_uid(user); /* For find_user() */
226 break; 224 break;
227 } 225 }
@@ -245,7 +243,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
245 const struct cred *cred = current_cred(); 243 const struct cred *cred = current_cred();
246 long niceval, retval = -ESRCH; 244 long niceval, retval = -ESRCH;
247 struct pid *pgrp; 245 struct pid *pgrp;
248 kuid_t uid;
249 246
250 if (which > PRIO_USER || which < PRIO_PROCESS) 247 if (which > PRIO_USER || which < PRIO_PROCESS)
251 return -EINVAL; 248 return -EINVAL;
@@ -276,22 +273,21 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
276 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 273 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
277 break; 274 break;
278 case PRIO_USER: 275 case PRIO_USER:
279 uid = make_kuid(cred->user_ns, who); 276 user = (struct user_struct *) cred->user;
280 user = cred->user;
281 if (!who) 277 if (!who)
282 uid = cred->uid; 278 who = cred->uid;
283 else if (!uid_eq(uid, cred->uid) && 279 else if ((who != cred->uid) &&
284 !(user = find_user(uid))) 280 !(user = find_user(who)))
285 goto out_unlock; /* No processes for this user */ 281 goto out_unlock; /* No processes for this user */
286 282
287 do_each_thread(g, p) { 283 do_each_thread(g, p) {
288 if (uid_eq(task_uid(p), uid)) { 284 if (__task_cred(p)->uid == who) {
289 niceval = 20 - task_nice(p); 285 niceval = 20 - task_nice(p);
290 if (niceval > retval) 286 if (niceval > retval)
291 retval = niceval; 287 retval = niceval;
292 } 288 }
293 } while_each_thread(g, p); 289 } while_each_thread(g, p);
294 if (!uid_eq(uid, cred->uid)) 290 if (who != cred->uid)
295 free_uid(user); /* for find_user() */ 291 free_uid(user); /* for find_user() */
296 break; 292 break;
297 } 293 }
@@ -368,7 +364,6 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
368void kernel_restart(char *cmd) 364void kernel_restart(char *cmd)
369{ 365{
370 kernel_restart_prepare(cmd); 366 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus();
372 if (!cmd) 367 if (!cmd)
373 printk(KERN_EMERG "Restarting system.\n"); 368 printk(KERN_EMERG "Restarting system.\n");
374 else 369 else
@@ -448,15 +443,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
448 magic2 != LINUX_REBOOT_MAGIC2C)) 443 magic2 != LINUX_REBOOT_MAGIC2C))
449 return -EINVAL; 444 return -EINVAL;
450 445
451 /*
452 * If pid namespaces are enabled and the current task is in a child
453 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit().
455 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
457 if (ret)
458 return ret;
459
460 /* Instead of trying to make the power_off code look like 446 /* Instead of trying to make the power_off code look like
461 * halt when pm_power_off is not set do it the easy way. 447 * halt when pm_power_off is not set do it the easy way.
462 */ 448 */
@@ -557,19 +543,9 @@ void ctrl_alt_del(void)
557 */ 543 */
558SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 544SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
559{ 545{
560 struct user_namespace *ns = current_user_ns();
561 const struct cred *old; 546 const struct cred *old;
562 struct cred *new; 547 struct cred *new;
563 int retval; 548 int retval;
564 kgid_t krgid, kegid;
565
566 krgid = make_kgid(ns, rgid);
567 kegid = make_kgid(ns, egid);
568
569 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
570 return -EINVAL;
571 if ((egid != (gid_t) -1) && !gid_valid(kegid))
572 return -EINVAL;
573 549
574 new = prepare_creds(); 550 new = prepare_creds();
575 if (!new) 551 if (!new)
@@ -578,25 +554,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
578 554
579 retval = -EPERM; 555 retval = -EPERM;
580 if (rgid != (gid_t) -1) { 556 if (rgid != (gid_t) -1) {
581 if (gid_eq(old->gid, krgid) || 557 if (old->gid == rgid ||
582 gid_eq(old->egid, krgid) || 558 old->egid == rgid ||
583 nsown_capable(CAP_SETGID)) 559 nsown_capable(CAP_SETGID))
584 new->gid = krgid; 560 new->gid = rgid;
585 else 561 else
586 goto error; 562 goto error;
587 } 563 }
588 if (egid != (gid_t) -1) { 564 if (egid != (gid_t) -1) {
589 if (gid_eq(old->gid, kegid) || 565 if (old->gid == egid ||
590 gid_eq(old->egid, kegid) || 566 old->egid == egid ||
591 gid_eq(old->sgid, kegid) || 567 old->sgid == egid ||
592 nsown_capable(CAP_SETGID)) 568 nsown_capable(CAP_SETGID))
593 new->egid = kegid; 569 new->egid = egid;
594 else 570 else
595 goto error; 571 goto error;
596 } 572 }
597 573
598 if (rgid != (gid_t) -1 || 574 if (rgid != (gid_t) -1 ||
599 (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) 575 (egid != (gid_t) -1 && egid != old->gid))
600 new->sgid = new->egid; 576 new->sgid = new->egid;
601 new->fsgid = new->egid; 577 new->fsgid = new->egid;
602 578
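
The lines removed from setregid() are the up-front translation and validation of the userspace gid values. Roughly, the pattern being dropped looks like this (check_gid_arg() is an illustrative helper, not a kernel function):

#include <linux/errno.h>
#include <linux/cred.h>		/* current_user_ns() */
#include <linux/uidgid.h>	/* make_kgid(), gid_valid() */

/* Map the userspace gid into the caller's namespace and reject ids that
 * do not map, before any credentials are modified. */
static int check_gid_arg(gid_t gid, kgid_t *out)
{
	kgid_t kgid = make_kgid(current_user_ns(), gid);

	if (gid != (gid_t) -1 && !gid_valid(kgid))
		return -EINVAL;
	*out = kgid;
	return 0;
}
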
@@ -614,15 +590,9 @@ error:
614 */ 590 */
615SYSCALL_DEFINE1(setgid, gid_t, gid) 591SYSCALL_DEFINE1(setgid, gid_t, gid)
616{ 592{
617 struct user_namespace *ns = current_user_ns();
618 const struct cred *old; 593 const struct cred *old;
619 struct cred *new; 594 struct cred *new;
620 int retval; 595 int retval;
621 kgid_t kgid;
622
623 kgid = make_kgid(ns, gid);
624 if (!gid_valid(kgid))
625 return -EINVAL;
626 596
627 new = prepare_creds(); 597 new = prepare_creds();
628 if (!new) 598 if (!new)
@@ -631,9 +601,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
631 601
632 retval = -EPERM; 602 retval = -EPERM;
633 if (nsown_capable(CAP_SETGID)) 603 if (nsown_capable(CAP_SETGID))
634 new->gid = new->egid = new->sgid = new->fsgid = kgid; 604 new->gid = new->egid = new->sgid = new->fsgid = gid;
635 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 605 else if (gid == old->gid || gid == old->sgid)
636 new->egid = new->fsgid = kgid; 606 new->egid = new->fsgid = gid;
637 else 607 else
638 goto error; 608 goto error;
639 609
@@ -651,7 +621,7 @@ static int set_user(struct cred *new)
651{ 621{
652 struct user_struct *new_user; 622 struct user_struct *new_user;
653 623
654 new_user = alloc_uid(new->uid); 624 new_user = alloc_uid(current_user_ns(), new->uid);
655 if (!new_user) 625 if (!new_user)
656 return -EAGAIN; 626 return -EAGAIN;
657 627
@@ -690,19 +660,9 @@ static int set_user(struct cred *new)
690 */ 660 */
691SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 661SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
692{ 662{
693 struct user_namespace *ns = current_user_ns();
694 const struct cred *old; 663 const struct cred *old;
695 struct cred *new; 664 struct cred *new;
696 int retval; 665 int retval;
697 kuid_t kruid, keuid;
698
699 kruid = make_kuid(ns, ruid);
700 keuid = make_kuid(ns, euid);
701
702 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
703 return -EINVAL;
704 if ((euid != (uid_t) -1) && !uid_valid(keuid))
705 return -EINVAL;
706 666
707 new = prepare_creds(); 667 new = prepare_creds();
708 if (!new) 668 if (!new)
@@ -711,29 +671,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
711 671
712 retval = -EPERM; 672 retval = -EPERM;
713 if (ruid != (uid_t) -1) { 673 if (ruid != (uid_t) -1) {
714 new->uid = kruid; 674 new->uid = ruid;
715 if (!uid_eq(old->uid, kruid) && 675 if (old->uid != ruid &&
716 !uid_eq(old->euid, kruid) && 676 old->euid != ruid &&
717 !nsown_capable(CAP_SETUID)) 677 !nsown_capable(CAP_SETUID))
718 goto error; 678 goto error;
719 } 679 }
720 680
721 if (euid != (uid_t) -1) { 681 if (euid != (uid_t) -1) {
722 new->euid = keuid; 682 new->euid = euid;
723 if (!uid_eq(old->uid, keuid) && 683 if (old->uid != euid &&
724 !uid_eq(old->euid, keuid) && 684 old->euid != euid &&
725 !uid_eq(old->suid, keuid) && 685 old->suid != euid &&
726 !nsown_capable(CAP_SETUID)) 686 !nsown_capable(CAP_SETUID))
727 goto error; 687 goto error;
728 } 688 }
729 689
730 if (!uid_eq(new->uid, old->uid)) { 690 if (new->uid != old->uid) {
731 retval = set_user(new); 691 retval = set_user(new);
732 if (retval < 0) 692 if (retval < 0)
733 goto error; 693 goto error;
734 } 694 }
735 if (ruid != (uid_t) -1 || 695 if (ruid != (uid_t) -1 ||
736 (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) 696 (euid != (uid_t) -1 && euid != old->uid))
737 new->suid = new->euid; 697 new->suid = new->euid;
738 new->fsuid = new->euid; 698 new->fsuid = new->euid;
739 699
@@ -761,15 +721,9 @@ error:
761 */ 721 */
762SYSCALL_DEFINE1(setuid, uid_t, uid) 722SYSCALL_DEFINE1(setuid, uid_t, uid)
763{ 723{
764 struct user_namespace *ns = current_user_ns();
765 const struct cred *old; 724 const struct cred *old;
766 struct cred *new; 725 struct cred *new;
767 int retval; 726 int retval;
768 kuid_t kuid;
769
770 kuid = make_kuid(ns, uid);
771 if (!uid_valid(kuid))
772 return -EINVAL;
773 727
774 new = prepare_creds(); 728 new = prepare_creds();
775 if (!new) 729 if (!new)
@@ -778,17 +732,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
778 732
779 retval = -EPERM; 733 retval = -EPERM;
780 if (nsown_capable(CAP_SETUID)) { 734 if (nsown_capable(CAP_SETUID)) {
781 new->suid = new->uid = kuid; 735 new->suid = new->uid = uid;
782 if (!uid_eq(kuid, old->uid)) { 736 if (uid != old->uid) {
783 retval = set_user(new); 737 retval = set_user(new);
784 if (retval < 0) 738 if (retval < 0)
785 goto error; 739 goto error;
786 } 740 }
787 } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { 741 } else if (uid != old->uid && uid != new->suid) {
788 goto error; 742 goto error;
789 } 743 }
790 744
791 new->fsuid = new->euid = kuid; 745 new->fsuid = new->euid = uid;
792 746
793 retval = security_task_fix_setuid(new, old, LSM_SETID_ID); 747 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
794 if (retval < 0) 748 if (retval < 0)
@@ -808,24 +762,9 @@ error:
808 */ 762 */
809SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 763SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
810{ 764{
811 struct user_namespace *ns = current_user_ns();
812 const struct cred *old; 765 const struct cred *old;
813 struct cred *new; 766 struct cred *new;
814 int retval; 767 int retval;
815 kuid_t kruid, keuid, ksuid;
816
817 kruid = make_kuid(ns, ruid);
818 keuid = make_kuid(ns, euid);
819 ksuid = make_kuid(ns, suid);
820
821 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
822 return -EINVAL;
823
824 if ((euid != (uid_t) -1) && !uid_valid(keuid))
825 return -EINVAL;
826
827 if ((suid != (uid_t) -1) && !uid_valid(ksuid))
828 return -EINVAL;
829 768
830 new = prepare_creds(); 769 new = prepare_creds();
831 if (!new) 770 if (!new)
@@ -835,29 +774,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
835 774
836 retval = -EPERM; 775 retval = -EPERM;
837 if (!nsown_capable(CAP_SETUID)) { 776 if (!nsown_capable(CAP_SETUID)) {
838 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 777 if (ruid != (uid_t) -1 && ruid != old->uid &&
839 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 778 ruid != old->euid && ruid != old->suid)
840 goto error; 779 goto error;
841 if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && 780 if (euid != (uid_t) -1 && euid != old->uid &&
842 !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) 781 euid != old->euid && euid != old->suid)
843 goto error; 782 goto error;
844 if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && 783 if (suid != (uid_t) -1 && suid != old->uid &&
845 !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) 784 suid != old->euid && suid != old->suid)
846 goto error; 785 goto error;
847 } 786 }
848 787
849 if (ruid != (uid_t) -1) { 788 if (ruid != (uid_t) -1) {
850 new->uid = kruid; 789 new->uid = ruid;
851 if (!uid_eq(kruid, old->uid)) { 790 if (ruid != old->uid) {
852 retval = set_user(new); 791 retval = set_user(new);
853 if (retval < 0) 792 if (retval < 0)
854 goto error; 793 goto error;
855 } 794 }
856 } 795 }
857 if (euid != (uid_t) -1) 796 if (euid != (uid_t) -1)
858 new->euid = keuid; 797 new->euid = euid;
859 if (suid != (uid_t) -1) 798 if (suid != (uid_t) -1)
860 new->suid = ksuid; 799 new->suid = suid;
861 new->fsuid = new->euid; 800 new->fsuid = new->euid;
862 801
863 retval = security_task_fix_setuid(new, old, LSM_SETID_RES); 802 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@ -871,19 +810,14 @@ error:
871 return retval; 810 return retval;
872} 811}
873 812
874SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 813SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
875{ 814{
876 const struct cred *cred = current_cred(); 815 const struct cred *cred = current_cred();
877 int retval; 816 int retval;
878 uid_t ruid, euid, suid;
879
880 ruid = from_kuid_munged(cred->user_ns, cred->uid);
881 euid = from_kuid_munged(cred->user_ns, cred->euid);
882 suid = from_kuid_munged(cred->user_ns, cred->suid);
883 817
884 if (!(retval = put_user(ruid, ruidp)) && 818 if (!(retval = put_user(cred->uid, ruid)) &&
885 !(retval = put_user(euid, euidp))) 819 !(retval = put_user(cred->euid, euid)))
886 retval = put_user(suid, suidp); 820 retval = put_user(cred->suid, suid);
887 821
888 return retval; 822 return retval;
889} 823}
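
getresuid() loses the from_kuid_munged() step and goes back to copying the credential fields out directly. The conversion idiom on the left-hand side looks roughly like this (put_current_uid() is an illustrative helper):

#include <linux/cred.h>
#include <linux/uidgid.h>	/* from_kuid_munged() */
#include <linux/uaccess.h>	/* put_user() */

/* Convert a kuid_t back into the caller's user namespace before handing
 * it to userspace; ids with no mapping degrade to the overflow uid
 * instead of leaking a raw kernel-internal value. */
static int put_current_uid(uid_t __user *p)
{
	const struct cred *cred = current_cred();

	return put_user(from_kuid_munged(cred->user_ns, cred->uid), p);
}
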
@@ -893,22 +827,9 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
893 */ 827 */
894SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 828SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
895{ 829{
896 struct user_namespace *ns = current_user_ns();
897 const struct cred *old; 830 const struct cred *old;
898 struct cred *new; 831 struct cred *new;
899 int retval; 832 int retval;
900 kgid_t krgid, kegid, ksgid;
901
902 krgid = make_kgid(ns, rgid);
903 kegid = make_kgid(ns, egid);
904 ksgid = make_kgid(ns, sgid);
905
906 if ((rgid != (gid_t) -1) && !gid_valid(krgid))
907 return -EINVAL;
908 if ((egid != (gid_t) -1) && !gid_valid(kegid))
909 return -EINVAL;
910 if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
911 return -EINVAL;
912 833
913 new = prepare_creds(); 834 new = prepare_creds();
914 if (!new) 835 if (!new)
@@ -917,23 +838,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
917 838
918 retval = -EPERM; 839 retval = -EPERM;
919 if (!nsown_capable(CAP_SETGID)) { 840 if (!nsown_capable(CAP_SETGID)) {
920 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 841 if (rgid != (gid_t) -1 && rgid != old->gid &&
921 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 842 rgid != old->egid && rgid != old->sgid)
922 goto error; 843 goto error;
923 if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && 844 if (egid != (gid_t) -1 && egid != old->gid &&
924 !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) 845 egid != old->egid && egid != old->sgid)
925 goto error; 846 goto error;
926 if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && 847 if (sgid != (gid_t) -1 && sgid != old->gid &&
927 !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) 848 sgid != old->egid && sgid != old->sgid)
928 goto error; 849 goto error;
929 } 850 }
930 851
931 if (rgid != (gid_t) -1) 852 if (rgid != (gid_t) -1)
932 new->gid = krgid; 853 new->gid = rgid;
933 if (egid != (gid_t) -1) 854 if (egid != (gid_t) -1)
934 new->egid = kegid; 855 new->egid = egid;
935 if (sgid != (gid_t) -1) 856 if (sgid != (gid_t) -1)
936 new->sgid = ksgid; 857 new->sgid = sgid;
937 new->fsgid = new->egid; 858 new->fsgid = new->egid;
938 859
939 return commit_creds(new); 860 return commit_creds(new);
@@ -943,19 +864,14 @@ error:
943 return retval; 864 return retval;
944} 865}
945 866
946SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 867SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
947{ 868{
948 const struct cred *cred = current_cred(); 869 const struct cred *cred = current_cred();
949 int retval; 870 int retval;
950 gid_t rgid, egid, sgid;
951
952 rgid = from_kgid_munged(cred->user_ns, cred->gid);
953 egid = from_kgid_munged(cred->user_ns, cred->egid);
954 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
955 871
956 if (!(retval = put_user(rgid, rgidp)) && 872 if (!(retval = put_user(cred->gid, rgid)) &&
957 !(retval = put_user(egid, egidp))) 873 !(retval = put_user(cred->egid, egid)))
958 retval = put_user(sgid, sgidp); 874 retval = put_user(cred->sgid, sgid);
959 875
960 return retval; 876 return retval;
961} 877}
@@ -972,24 +888,18 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
972 const struct cred *old; 888 const struct cred *old;
973 struct cred *new; 889 struct cred *new;
974 uid_t old_fsuid; 890 uid_t old_fsuid;
975 kuid_t kuid;
976
977 old = current_cred();
978 old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
979
980 kuid = make_kuid(old->user_ns, uid);
981 if (!uid_valid(kuid))
982 return old_fsuid;
983 891
984 new = prepare_creds(); 892 new = prepare_creds();
985 if (!new) 893 if (!new)
986 return old_fsuid; 894 return current_fsuid();
895 old = current_cred();
896 old_fsuid = old->fsuid;
987 897
988 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 898 if (uid == old->uid || uid == old->euid ||
989 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 899 uid == old->suid || uid == old->fsuid ||
990 nsown_capable(CAP_SETUID)) { 900 nsown_capable(CAP_SETUID)) {
991 if (!uid_eq(kuid, old->fsuid)) { 901 if (uid != old_fsuid) {
992 new->fsuid = kuid; 902 new->fsuid = uid;
993 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 903 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
994 goto change_okay; 904 goto change_okay;
995 } 905 }
@@ -1011,24 +921,18 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
1011 const struct cred *old; 921 const struct cred *old;
1012 struct cred *new; 922 struct cred *new;
1013 gid_t old_fsgid; 923 gid_t old_fsgid;
1014 kgid_t kgid;
1015
1016 old = current_cred();
1017 old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
1018
1019 kgid = make_kgid(old->user_ns, gid);
1020 if (!gid_valid(kgid))
1021 return old_fsgid;
1022 924
1023 new = prepare_creds(); 925 new = prepare_creds();
1024 if (!new) 926 if (!new)
1025 return old_fsgid; 927 return current_fsgid();
928 old = current_cred();
929 old_fsgid = old->fsgid;
1026 930
1027 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 931 if (gid == old->gid || gid == old->egid ||
1028 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 932 gid == old->sgid || gid == old->fsgid ||
1029 nsown_capable(CAP_SETGID)) { 933 nsown_capable(CAP_SETGID)) {
1030 if (!gid_eq(kgid, old->fsgid)) { 934 if (gid != old_fsgid) {
1031 new->fsgid = kgid; 935 new->fsgid = gid;
1032 goto change_okay; 936 goto change_okay;
1033 } 937 }
1034 } 938 }
@@ -1046,7 +950,7 @@ void do_sys_times(struct tms *tms)
1046 cputime_t tgutime, tgstime, cutime, cstime; 950 cputime_t tgutime, tgstime, cutime, cstime;
1047 951
1048 spin_lock_irq(&current->sighand->siglock); 952 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 953 thread_group_times(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 954 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 955 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 956 spin_unlock_irq(&current->sighand->siglock);
@@ -1265,16 +1169,15 @@ DECLARE_RWSEM(uts_sem);
1265 * Work around broken programs that cannot handle "Linux 3.0". 1169 * Work around broken programs that cannot handle "Linux 3.0".
1266 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 1170 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1267 */ 1171 */
1268static int override_release(char __user *release, size_t len) 1172static int override_release(char __user *release, int len)
1269{ 1173{
1270 int ret = 0; 1174 int ret = 0;
1175 char buf[65];
1271 1176
1272 if (current->personality & UNAME26) { 1177 if (current->personality & UNAME26) {
1273 const char *rest = UTS_RELEASE; 1178 char *rest = UTS_RELEASE;
1274 char buf[65] = { 0 };
1275 int ndots = 0; 1179 int ndots = 0;
1276 unsigned v; 1180 unsigned v;
1277 size_t copy;
1278 1181
1279 while (*rest) { 1182 while (*rest) {
1280 if (*rest == '.' && ++ndots >= 3) 1183 if (*rest == '.' && ++ndots >= 3)
@@ -1284,9 +1187,8 @@ static int override_release(char __user *release, size_t len)
1284 rest++; 1187 rest++;
1285 } 1188 }
1286 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; 1189 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1287 copy = clamp_t(size_t, len, 1, sizeof(buf)); 1190 snprintf(buf, len, "2.6.%u%s", v, rest);
1288 copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); 1191 ret = copy_to_user(release, buf, len);
1289 ret = copy_to_user(release, buf, copy + 1);
1290 } 1192 }
1291 return ret; 1193 return ret;
1292} 1194}
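
The left-hand side of this hunk bounds the copy with clamp_t() and scnprintf() instead of handing the caller-supplied length straight to snprintf() and copy_to_user(). A condensed sketch of that bounded-copy idiom (copy_fake_release() is an illustrative name):

#include <linux/errno.h>
#include <linux/kernel.h>	/* clamp_t(), scnprintf() */
#include <linux/uaccess.h>	/* copy_to_user() */

/* Clamp the caller-supplied length to the local buffer, let scnprintf()
 * report how many characters were actually stored, and copy only that
 * many bytes plus the terminating NUL. */
static int copy_fake_release(char __user *dst, size_t len)
{
	char buf[65] = { 0 };
	size_t copy;

	copy = clamp_t(size_t, len, 1, sizeof(buf));
	copy = scnprintf(buf, copy, "2.6.%u", 40u);
	return copy_to_user(dst, buf, copy + 1) ? -EFAULT : 0;
}
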
@@ -1383,7 +1285,6 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1383 memcpy(u->nodename, tmp, len); 1285 memcpy(u->nodename, tmp, len);
1384 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1286 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1385 errno = 0; 1287 errno = 0;
1386 uts_proc_notify(UTS_PROC_HOSTNAME);
1387 } 1288 }
1388 up_write(&uts_sem); 1289 up_write(&uts_sem);
1389 return errno; 1290 return errno;
@@ -1434,7 +1335,6 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1434 memcpy(u->domainname, tmp, len); 1335 memcpy(u->domainname, tmp, len);
1435 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1336 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1436 errno = 0; 1337 errno = 0;
1437 uts_proc_notify(UTS_PROC_DOMAINNAME);
1438 } 1338 }
1439 up_write(&uts_sem); 1339 up_write(&uts_sem);
1440 return errno; 1340 return errno;
@@ -1586,14 +1486,15 @@ static int check_prlimit_permission(struct task_struct *task)
1586 return 0; 1486 return 0;
1587 1487
1588 tcred = __task_cred(task); 1488 tcred = __task_cred(task);
1589 if (uid_eq(cred->uid, tcred->euid) && 1489 if (cred->user->user_ns == tcred->user->user_ns &&
1590 uid_eq(cred->uid, tcred->suid) && 1490 (cred->uid == tcred->euid &&
1591 uid_eq(cred->uid, tcred->uid) && 1491 cred->uid == tcred->suid &&
1592 gid_eq(cred->gid, tcred->egid) && 1492 cred->uid == tcred->uid &&
1593 gid_eq(cred->gid, tcred->sgid) && 1493 cred->gid == tcred->egid &&
1594 gid_eq(cred->gid, tcred->gid)) 1494 cred->gid == tcred->sgid &&
1495 cred->gid == tcred->gid))
1595 return 0; 1496 return 0;
1596 if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) 1497 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1597 return 0; 1498 return 0;
1598 1499
1599 return -EPERM; 1500 return -EPERM;
@@ -1701,10 +1602,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1701 unsigned long maxrss = 0; 1602 unsigned long maxrss = 0;
1702 1603
1703 memset((char *) r, 0, sizeof *r); 1604 memset((char *) r, 0, sizeof *r);
1704 utime = stime = 0; 1605 utime = stime = cputime_zero;
1705 1606
1706 if (who == RUSAGE_THREAD) { 1607 if (who == RUSAGE_THREAD) {
1707 task_cputime_adjusted(current, &utime, &stime); 1608 task_times(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1609 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1610 maxrss = p->signal->maxrss;
1710 goto out; 1611 goto out;
@@ -1730,9 +1631,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1730 break; 1631 break;
1731 1632
1732 case RUSAGE_SELF: 1633 case RUSAGE_SELF:
1733 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1634 thread_group_times(p, &tgutime, &tgstime);
1734 utime += tgutime; 1635 utime = cputime_add(utime, tgutime);
1735 stime += tgstime; 1636 stime = cputime_add(stime, tgstime);
1736 r->ru_nvcsw += p->signal->nvcsw; 1637 r->ru_nvcsw += p->signal->nvcsw;
1737 r->ru_nivcsw += p->signal->nivcsw; 1638 r->ru_nivcsw += p->signal->nivcsw;
1738 r->ru_minflt += p->signal->min_flt; 1639 r->ru_minflt += p->signal->min_flt;
@@ -1788,217 +1689,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788 return mask; 1689 return mask;
1789} 1690}
1790 1691
1791#ifdef CONFIG_CHECKPOINT_RESTORE
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{
1794 struct fd exe;
1795 struct dentry *dentry;
1796 int err;
1797
1798 exe = fdget(fd);
1799 if (!exe.file)
1800 return -EBADF;
1801
1802 dentry = exe.file->f_path.dentry;
1803
1804 /*
1805 * Because the original mm->exe_file points to executable file, make
1806 * sure that this one is executable as well, to avoid breaking an
1807 * overall picture.
1808 */
1809 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit;
1813
1814 err = inode_permission(dentry->d_inode, MAY_EXEC);
1815 if (err)
1816 goto exit;
1817
1818 down_write(&mm->mmap_sem);
1819
1820 /*
1821 * Forbid mm->exe_file change if old file still mapped.
1822 */
1823 err = -EBUSY;
1824 if (mm->exe_file) {
1825 struct vm_area_struct *vma;
1826
1827 for (vma = mm->mmap; vma; vma = vma->vm_next)
1828 if (vma->vm_file &&
1829 path_equal(&vma->vm_file->f_path,
1830 &mm->exe_file->f_path))
1831 goto exit_unlock;
1832 }
1833
1834 /*
1835 * The symlink can be changed only once, just to disallow arbitrary
1836 * transitions malicious software might bring in. This means one
1837 * could make a snapshot over all processes running and monitor
1838 * /proc/pid/exe changes to notice unusual activity if needed.
1839 */
1840 err = -EPERM;
1841 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1842 goto exit_unlock;
1843
1844 err = 0;
1845 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1846exit_unlock:
1847 up_write(&mm->mmap_sem);
1848
1849exit:
1850 fdput(exe);
1851 return err;
1852}
1853
1854static int prctl_set_mm(int opt, unsigned long addr,
1855 unsigned long arg4, unsigned long arg5)
1856{
1857 unsigned long rlim = rlimit(RLIMIT_DATA);
1858 struct mm_struct *mm = current->mm;
1859 struct vm_area_struct *vma;
1860 int error;
1861
1862 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
1863 return -EINVAL;
1864
1865 if (!capable(CAP_SYS_RESOURCE))
1866 return -EPERM;
1867
1868 if (opt == PR_SET_MM_EXE_FILE)
1869 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1870
1871 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1872 return -EINVAL;
1873
1874 error = -EINVAL;
1875
1876 down_read(&mm->mmap_sem);
1877 vma = find_vma(mm, addr);
1878
1879 switch (opt) {
1880 case PR_SET_MM_START_CODE:
1881 mm->start_code = addr;
1882 break;
1883 case PR_SET_MM_END_CODE:
1884 mm->end_code = addr;
1885 break;
1886 case PR_SET_MM_START_DATA:
1887 mm->start_data = addr;
1888 break;
1889 case PR_SET_MM_END_DATA:
1890 mm->end_data = addr;
1891 break;
1892
1893 case PR_SET_MM_START_BRK:
1894 if (addr <= mm->end_data)
1895 goto out;
1896
1897 if (rlim < RLIM_INFINITY &&
1898 (mm->brk - addr) +
1899 (mm->end_data - mm->start_data) > rlim)
1900 goto out;
1901
1902 mm->start_brk = addr;
1903 break;
1904
1905 case PR_SET_MM_BRK:
1906 if (addr <= mm->end_data)
1907 goto out;
1908
1909 if (rlim < RLIM_INFINITY &&
1910 (addr - mm->start_brk) +
1911 (mm->end_data - mm->start_data) > rlim)
1912 goto out;
1913
1914 mm->brk = addr;
1915 break;
1916
1917 /*
1918 * If command line arguments and environment
1919 * are placed somewhere else on stack, we can
1920 * set them up here, ARG_START/END to setup
 1921 * command line arguments and ENV_START/END
1922 * for environment.
1923 */
1924 case PR_SET_MM_START_STACK:
1925 case PR_SET_MM_ARG_START:
1926 case PR_SET_MM_ARG_END:
1927 case PR_SET_MM_ENV_START:
1928 case PR_SET_MM_ENV_END:
1929 if (!vma) {
1930 error = -EFAULT;
1931 goto out;
1932 }
1933 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START)
1936 mm->arg_start = addr;
1937 else if (opt == PR_SET_MM_ARG_END)
1938 mm->arg_end = addr;
1939 else if (opt == PR_SET_MM_ENV_START)
1940 mm->env_start = addr;
1941 else if (opt == PR_SET_MM_ENV_END)
1942 mm->env_end = addr;
1943 break;
1944
1945 /*
1946 * This doesn't move auxiliary vector itself
1947 * since it's pinned to mm_struct, but allow
1948 * to fill vector with new values. It's up
1949 * to a caller to provide sane values here
1950 * otherwise user space tools which use this
1951 * vector might be unhappy.
1952 */
1953 case PR_SET_MM_AUXV: {
1954 unsigned long user_auxv[AT_VECTOR_SIZE];
1955
1956 if (arg4 > sizeof(user_auxv))
1957 goto out;
1958 up_read(&mm->mmap_sem);
1959
1960 if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
1961 return -EFAULT;
1962
1963 /* Make sure the last entry is always AT_NULL */
1964 user_auxv[AT_VECTOR_SIZE - 2] = 0;
1965 user_auxv[AT_VECTOR_SIZE - 1] = 0;
1966
1967 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1968
1969 task_lock(current);
1970 memcpy(mm->saved_auxv, user_auxv, arg4);
1971 task_unlock(current);
1972
1973 return 0;
1974 }
1975 default:
1976 goto out;
1977 }
1978
1979 error = 0;
1980out:
1981 up_read(&mm->mmap_sem);
1982 return error;
1983}
1984
1985static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1986{
1987 return put_user(me->clear_child_tid, tid_addr);
1988}
1989
1990#else /* CONFIG_CHECKPOINT_RESTORE */
1991static int prctl_set_mm(int opt, unsigned long addr,
1992 unsigned long arg4, unsigned long arg5)
1993{
1994 return -EINVAL;
1995}
1996static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1997{
1998 return -EINVAL;
1999}
2000#endif
2001
2002SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1692SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2003 unsigned long, arg4, unsigned long, arg5) 1693 unsigned long, arg4, unsigned long, arg5)
2004{ 1694{
@@ -2018,6 +1708,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2018 break; 1708 break;
2019 } 1709 }
2020 me->pdeath_signal = arg2; 1710 me->pdeath_signal = arg2;
1711 error = 0;
2021 break; 1712 break;
2022 case PR_GET_PDEATHSIG: 1713 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2); 1714 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2031,6 +1722,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2031 break; 1722 break;
2032 } 1723 }
2033 set_dumpable(me->mm, arg2); 1724 set_dumpable(me->mm, arg2);
1725 error = 0;
2034 break; 1726 break;
2035 1727
2036 case PR_SET_UNALIGN: 1728 case PR_SET_UNALIGN:
@@ -2057,32 +1749,35 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2057 case PR_SET_TIMING: 1749 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 1750 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 1751 error = -EINVAL;
1752 else
1753 error = 0;
2060 break; 1754 break;
1755
2061 case PR_SET_NAME: 1756 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 1757 comm[sizeof(me->comm)-1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 1758 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 1759 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 1760 return -EFAULT;
2066 set_task_comm(me, comm); 1761 set_task_comm(me, comm);
2067 proc_comm_connector(me); 1762 return 0;
2068 break;
2069 case PR_GET_NAME: 1763 case PR_GET_NAME:
2070 get_task_comm(comm, me); 1764 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 1765 if (copy_to_user((char __user *)arg2, comm,
2072 sizeof(comm))) 1766 sizeof(comm)))
2073 return -EFAULT; 1767 return -EFAULT;
2074 break; 1768 return 0;
2075 case PR_GET_ENDIAN: 1769 case PR_GET_ENDIAN:
2076 error = GET_ENDIAN(me, arg2); 1770 error = GET_ENDIAN(me, arg2);
2077 break; 1771 break;
2078 case PR_SET_ENDIAN: 1772 case PR_SET_ENDIAN:
2079 error = SET_ENDIAN(me, arg2); 1773 error = SET_ENDIAN(me, arg2);
2080 break; 1774 break;
1775
2081 case PR_GET_SECCOMP: 1776 case PR_GET_SECCOMP:
2082 error = prctl_get_seccomp(); 1777 error = prctl_get_seccomp();
2083 break; 1778 break;
2084 case PR_SET_SECCOMP: 1779 case PR_SET_SECCOMP:
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 1780 error = prctl_set_seccomp(arg2);
2086 break; 1781 break;
2087 case PR_GET_TSC: 1782 case PR_GET_TSC:
2088 error = GET_TSC_CTL(arg2); 1783 error = GET_TSC_CTL(arg2);
@@ -2105,6 +1800,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2105 current->default_timer_slack_ns; 1800 current->default_timer_slack_ns;
2106 else 1801 else
2107 current->timer_slack_ns = arg2; 1802 current->timer_slack_ns = arg2;
1803 error = 0;
2108 break; 1804 break;
2109 case PR_MCE_KILL: 1805 case PR_MCE_KILL:
2110 if (arg4 | arg5) 1806 if (arg4 | arg5)
@@ -2130,6 +1826,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2130 default: 1826 default:
2131 return -EINVAL; 1827 return -EINVAL;
2132 } 1828 }
1829 error = 0;
2133 break; 1830 break;
2134 case PR_MCE_KILL_GET: 1831 case PR_MCE_KILL_GET:
2135 if (arg2 | arg3 | arg4 | arg5) 1832 if (arg2 | arg3 | arg4 | arg5)
@@ -2140,29 +1837,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2140 else 1837 else
2141 error = PR_MCE_KILL_DEFAULT; 1838 error = PR_MCE_KILL_DEFAULT;
2142 break; 1839 break;
2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break;
2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break;
2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2;
2151 break;
2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2);
2155 break;
2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL;
2159
2160 current->no_new_privs = 1;
2161 break;
2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0;
2166 default: 1840 default:
2167 error = -EINVAL; 1841 error = -EINVAL;
2168 break; 1842 break;
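
The PR_SET_NAME / PR_GET_NAME cases touched above are exercised from userspace with plain prctl() calls; a small standalone example (the task name is arbitrary):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	char comm[16] = "";	/* task names are at most 15 chars plus NUL */

	prctl(PR_SET_NAME, "demo-task", 0, 0, 0);
	prctl(PR_GET_NAME, comm, 0, 0, 0);
	printf("comm=%s\n", comm);
	return 0;
}
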
@@ -2189,52 +1863,49 @@ static void argv_cleanup(struct subprocess_info *info)
2189 argv_free(info->argv); 1863 argv_free(info->argv);
2190} 1864}
2191 1865
2192static int __orderly_poweroff(void) 1866/**
1867 * orderly_poweroff - Trigger an orderly system poweroff
1868 * @force: force poweroff if command execution fails
1869 *
1870 * This may be called from any context to trigger a system shutdown.
1871 * If the orderly shutdown fails, it will force an immediate shutdown.
1872 */
1873int orderly_poweroff(bool force)
2193{ 1874{
2194 int argc; 1875 int argc;
2195 char **argv; 1876 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2196 static char *envp[] = { 1877 static char *envp[] = {
2197 "HOME=/", 1878 "HOME=/",
2198 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 1879 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2199 NULL 1880 NULL
2200 }; 1881 };
2201 int ret; 1882 int ret = -ENOMEM;
1883 struct subprocess_info *info;
2202 1884
2203 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2204 if (argv == NULL) { 1885 if (argv == NULL) {
2205 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 1886 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2206 __func__, poweroff_cmd); 1887 __func__, poweroff_cmd);
2207 return -ENOMEM; 1888 goto out;
2208 } 1889 }
2209 1890
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 1891 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
2211 NULL, argv_cleanup, NULL); 1892 if (info == NULL) {
2212 if (ret == -ENOMEM)
2213 argv_free(argv); 1893 argv_free(argv);
1894 goto out;
1895 }
2214 1896
2215 return ret; 1897 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
2216}
2217 1898
2218/** 1899 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2219 * orderly_poweroff - Trigger an orderly system poweroff
2220 * @force: force poweroff if command execution fails
2221 *
2222 * This may be called from any context to trigger a system shutdown.
2223 * If the orderly shutdown fails, it will force an immediate shutdown.
2224 */
2225int orderly_poweroff(bool force)
2226{
2227 int ret = __orderly_poweroff();
2228 1900
1901 out:
2229 if (ret && force) { 1902 if (ret && force) {
2230 printk(KERN_WARNING "Failed to start orderly shutdown: " 1903 printk(KERN_WARNING "Failed to start orderly shutdown: "
2231 "forcing the issue\n"); 1904 "forcing the issue\n");
2232 1905
2233 /* 1906 /* I guess this should try to kick off some daemon to
2234 * I guess this should try to kick off some daemon to sync and 1907 sync and poweroff asap. Or not even bother syncing
2235 * poweroff asap. Or not even bother syncing if we're doing an 1908 if we're doing an emergency shutdown? */
2236 * emergency shutdown?
2237 */
2238 emergency_sync(); 1909 emergency_sync();
2239 kernel_power_off(); 1910 kernel_power_off();
2240 } 1911 }
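
The right-hand side of this hunk restores the three-step usermode-helper API: set up the request, attach any cleanup callbacks, then execute without waiting. A condensed sketch using the same calls, with run_poweroff_cmd() and the argv/envp contents as illustrative values:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kmod.h>

static int run_poweroff_cmd(void)
{
	static char *argv[] = { "/sbin/poweroff", NULL };
	static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
	if (!info)
		return -ENOMEM;

	/* No init/cleanup callbacks needed for static argv/envp. */
	call_usermodehelper_setfns(info, NULL, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_NO_WAIT);
}
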
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce1..a9a5de07c4f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,7 +25,6 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
29cond_syscall(sys_delete_module); 28cond_syscall(sys_delete_module);
30cond_syscall(sys_socketpair); 29cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 30cond_syscall(sys_bind);
@@ -146,10 +145,6 @@ cond_syscall(sys_io_submit);
146cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
147cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
148cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
149cond_syscall(sys_process_vm_readv);
150cond_syscall(sys_process_vm_writev);
151cond_syscall(compat_sys_process_vm_readv);
152cond_syscall(compat_sys_process_vm_writev);
153 148
154/* arch-specific weak syscall entries */ 149/* arch-specific weak syscall entries */
155cond_syscall(sys_pciconfig_read); 150cond_syscall(sys_pciconfig_read);
@@ -204,6 +199,3 @@ cond_syscall(sys_fanotify_mark);
204cond_syscall(sys_name_to_handle_at); 199cond_syscall(sys_name_to_handle_at);
205cond_syscall(sys_open_by_handle_at); 200cond_syscall(sys_open_by_handle_at);
206cond_syscall(compat_sys_open_by_handle_at); 201cond_syscall(compat_sys_open_by_handle_at);
207
208/* compare kernel pointers */
209cond_syscall(sys_kcmp);
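
cond_syscall() marks a syscall slot as optional: if the real implementation is configured out, the symbol still resolves, to sys_ni_syscall(), which simply returns -ENOSYS. A sketch, assuming the conventional weak-alias definition from <linux/linkage.h> of this era (sys_example_call is hypothetical):

/* Assumed definition, shown only to make the mechanism concrete. */
#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")

cond_syscall(sys_example_call);		/* resolves to sys_ni_syscall if undefined */
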
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878db491..fd15163f360 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,14 +23,12 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
27#include <linux/signal.h> 26#include <linux/signal.h>
28#include <linux/printk.h> 27#include <linux/printk.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
30#include <linux/security.h> 29#include <linux/security.h>
31#include <linux/ctype.h> 30#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 31#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
34#include <linux/fs.h> 32#include <linux/fs.h>
35#include <linux/init.h> 33#include <linux/init.h>
36#include <linux/kernel.h> 34#include <linux/kernel.h>
@@ -59,8 +57,6 @@
59#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
60#include <linux/oom.h> 58#include <linux/oom.h>
61#include <linux/kmod.h> 59#include <linux/kmod.h>
62#include <linux/capability.h>
63#include <linux/binfmts.h>
64 60
65#include <asm/uaccess.h> 61#include <asm/uaccess.h>
66#include <asm/processor.h> 62#include <asm/processor.h>
@@ -70,9 +66,6 @@
70#include <asm/stacktrace.h> 66#include <asm/stacktrace.h>
71#include <asm/io.h> 67#include <asm/io.h>
72#endif 68#endif
73#ifdef CONFIG_SPARC
74#include <asm/setup.h>
75#endif
76#ifdef CONFIG_BSD_PROCESS_ACCT 69#ifdef CONFIG_BSD_PROCESS_ACCT
77#include <linux/acct.h> 70#include <linux/acct.h>
78#endif 71#endif
@@ -97,14 +90,13 @@
97extern int sysctl_overcommit_memory; 90extern int sysctl_overcommit_memory;
98extern int sysctl_overcommit_ratio; 91extern int sysctl_overcommit_ratio;
99extern int max_threads; 92extern int max_threads;
100extern int suid_dumpable;
101#ifdef CONFIG_COREDUMP
102extern int core_uses_pid; 93extern int core_uses_pid;
94extern int suid_dumpable;
103extern char core_pattern[]; 95extern char core_pattern[];
104extern unsigned int core_pipe_limit; 96extern unsigned int core_pipe_limit;
105#endif
106extern int pid_max; 97extern int pid_max;
107extern int min_free_kbytes; 98extern int min_free_kbytes;
99extern int min_free_order_shift;
108extern int pid_max_min, pid_max_max; 100extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches; 101extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 102extern int percpu_pagelist_fraction;
@@ -143,12 +135,12 @@ static int minolduid;
143static int min_percpu_pagelist_fract = 8; 135static int min_percpu_pagelist_fract = 8;
144 136
145static int ngroups_max = NGROUPS_MAX; 137static int ngroups_max = NGROUPS_MAX;
146static const int cap_last_cap = CAP_LAST_CAP;
147 138
148#ifdef CONFIG_INOTIFY_USER 139#ifdef CONFIG_INOTIFY_USER
149#include <linux/inotify.h> 140#include <linux/inotify.h>
150#endif 141#endif
151#ifdef CONFIG_SPARC 142#ifdef CONFIG_SPARC
143#include <asm/system.h>
152#endif 144#endif
153 145
154#ifdef CONFIG_SPARC64 146#ifdef CONFIG_SPARC64
@@ -160,6 +152,14 @@ extern int pwrsw_enabled;
160extern int unaligned_enabled; 152extern int unaligned_enabled;
161#endif 153#endif
162 154
155#ifdef CONFIG_S390
156#ifdef CONFIG_MATHEMU
157extern int sysctl_ieee_emulation_warnings;
158#endif
159extern int sysctl_userprocess_debug;
160extern int spin_retry;
161#endif
162
163#ifdef CONFIG_IA64 163#ifdef CONFIG_IA64
164extern int no_unaligned_warning; 164extern int no_unaligned_warning;
165extern int unaligned_dump_stack; 165extern int unaligned_dump_stack;
@@ -173,17 +173,10 @@ static int proc_taint(struct ctl_table *table, int write,
173#endif 173#endif
174 174
175#ifdef CONFIG_PRINTK 175#ifdef CONFIG_PRINTK
176static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, 176static int proc_dmesg_restrict(struct ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, loff_t *ppos); 177 void __user *buffer, size_t *lenp, loff_t *ppos);
178#endif 178#endif
179 179
180static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182#ifdef CONFIG_COREDUMP
183static int proc_dostring_coredump(struct ctl_table *table, int write,
184 void __user *buffer, size_t *lenp, loff_t *ppos);
185#endif
186
187#ifdef CONFIG_MAGIC_SYSRQ 180#ifdef CONFIG_MAGIC_SYSRQ
188/* Note: sysrq code uses it's own private copy */ 181/* Note: sysrq code uses it's own private copy */
189static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 182static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -206,6 +199,20 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
206 199
207#endif 200#endif
208 201
202static struct ctl_table root_table[];
203static struct ctl_table_root sysctl_table_root;
204static struct ctl_table_header root_table_header = {
205 {{.count = 1,
206 .ctl_table = root_table,
207 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
208 .root = &sysctl_table_root,
209 .set = &sysctl_table_root.default_set,
210};
211static struct ctl_table_root sysctl_table_root = {
212 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
213 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
214};
215
209static struct ctl_table kern_table[]; 216static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 217static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 218static struct ctl_table fs_table[];
@@ -222,7 +229,7 @@ int sysctl_legacy_va_layout;
222 229
223/* The default sysctl tables: */ 230/* The default sysctl tables: */
224 231
225static struct ctl_table sysctl_base_table[] = { 232static struct ctl_table root_table[] = {
226 { 233 {
227 .procname = "kernel", 234 .procname = "kernel",
228 .mode = 0555, 235 .mode = 0555,
@@ -256,11 +263,9 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
257static int min_wakeup_granularity_ns; /* 0 usecs */ 264static int min_wakeup_granularity_ns; /* 0 usecs */
258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 265static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
259#ifdef CONFIG_SMP
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 266static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 267static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262#endif /* CONFIG_SMP */ 268#endif
263#endif /* CONFIG_SCHED_DEBUG */
264 269
265#ifdef CONFIG_COMPACTION 270#ifdef CONFIG_COMPACTION
266static int min_extfrag_threshold; 271static int min_extfrag_threshold;
@@ -303,7 +308,6 @@ static struct ctl_table kern_table[] = {
303 .extra1 = &min_wakeup_granularity_ns, 308 .extra1 = &min_wakeup_granularity_ns,
304 .extra2 = &max_wakeup_granularity_ns, 309 .extra2 = &max_wakeup_granularity_ns,
305 }, 310 },
306#ifdef CONFIG_SMP
307 { 311 {
308 .procname = "sched_tunable_scaling", 312 .procname = "sched_tunable_scaling",
309 .data = &sysctl_sched_tunable_scaling, 313 .data = &sysctl_sched_tunable_scaling,
@@ -314,7 +318,7 @@ static struct ctl_table kern_table[] = {
314 .extra2 = &max_sched_tunable_scaling, 318 .extra2 = &max_sched_tunable_scaling,
315 }, 319 },
316 { 320 {
317 .procname = "sched_migration_cost_ns", 321 .procname = "sched_migration_cost",
318 .data = &sysctl_sched_migration_cost, 322 .data = &sysctl_sched_migration_cost,
319 .maxlen = sizeof(unsigned int), 323 .maxlen = sizeof(unsigned int),
320 .mode = 0644, 324 .mode = 0644,
@@ -328,14 +332,14 @@ static struct ctl_table kern_table[] = {
328 .proc_handler = proc_dointvec, 332 .proc_handler = proc_dointvec,
329 }, 333 },
330 { 334 {
331 .procname = "sched_time_avg_ms", 335 .procname = "sched_time_avg",
332 .data = &sysctl_sched_time_avg, 336 .data = &sysctl_sched_time_avg,
333 .maxlen = sizeof(unsigned int), 337 .maxlen = sizeof(unsigned int),
334 .mode = 0644, 338 .mode = 0644,
335 .proc_handler = proc_dointvec, 339 .proc_handler = proc_dointvec,
336 }, 340 },
337 { 341 {
338 .procname = "sched_shares_window_ns", 342 .procname = "sched_shares_window",
339 .data = &sysctl_sched_shares_window, 343 .data = &sysctl_sched_shares_window,
340 .maxlen = sizeof(unsigned int), 344 .maxlen = sizeof(unsigned int),
341 .mode = 0644, 345 .mode = 0644,
@@ -350,45 +354,7 @@ static struct ctl_table kern_table[] = {
350 .extra1 = &zero, 354 .extra1 = &zero,
351 .extra2 = &one, 355 .extra2 = &one,
352 }, 356 },
353#endif /* CONFIG_SMP */ 357#endif
354#ifdef CONFIG_NUMA_BALANCING
355 {
356 .procname = "numa_balancing_scan_delay_ms",
357 .data = &sysctl_numa_balancing_scan_delay,
358 .maxlen = sizeof(unsigned int),
359 .mode = 0644,
360 .proc_handler = proc_dointvec,
361 },
362 {
363 .procname = "numa_balancing_scan_period_min_ms",
364 .data = &sysctl_numa_balancing_scan_period_min,
365 .maxlen = sizeof(unsigned int),
366 .mode = 0644,
367 .proc_handler = proc_dointvec,
368 },
369 {
370 .procname = "numa_balancing_scan_period_reset",
371 .data = &sysctl_numa_balancing_scan_period_reset,
372 .maxlen = sizeof(unsigned int),
373 .mode = 0644,
374 .proc_handler = proc_dointvec,
375 },
376 {
377 .procname = "numa_balancing_scan_period_max_ms",
378 .data = &sysctl_numa_balancing_scan_period_max,
379 .maxlen = sizeof(unsigned int),
380 .mode = 0644,
381 .proc_handler = proc_dointvec,
382 },
383 {
384 .procname = "numa_balancing_scan_size_mb",
385 .data = &sysctl_numa_balancing_scan_size,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec,
389 },
390#endif /* CONFIG_NUMA_BALANCING */
391#endif /* CONFIG_SCHED_DEBUG */
392 { 358 {
393 .procname = "sched_rt_period_us", 359 .procname = "sched_rt_period_us",
394 .data = &sysctl_sched_rt_period, 360 .data = &sysctl_sched_rt_period,
@@ -414,16 +380,6 @@ static struct ctl_table kern_table[] = {
414 .extra2 = &one, 380 .extra2 = &one,
415 }, 381 },
416#endif 382#endif
417#ifdef CONFIG_CFS_BANDWIDTH
418 {
419 .procname = "sched_cfs_bandwidth_slice_us",
420 .data = &sysctl_sched_cfs_bandwidth_slice,
421 .maxlen = sizeof(unsigned int),
422 .mode = 0644,
423 .proc_handler = proc_dointvec_minmax,
424 .extra1 = &one,
425 },
426#endif
427#ifdef CONFIG_PROVE_LOCKING 383#ifdef CONFIG_PROVE_LOCKING
428 { 384 {
429 .procname = "prove_locking", 385 .procname = "prove_locking",
@@ -449,7 +405,6 @@ static struct ctl_table kern_table[] = {
449 .mode = 0644, 405 .mode = 0644,
450 .proc_handler = proc_dointvec, 406 .proc_handler = proc_dointvec,
451 }, 407 },
452#ifdef CONFIG_COREDUMP
453 { 408 {
454 .procname = "core_uses_pid", 409 .procname = "core_uses_pid",
455 .data = &core_uses_pid, 410 .data = &core_uses_pid,
@@ -462,7 +417,7 @@ static struct ctl_table kern_table[] = {
462 .data = core_pattern, 417 .data = core_pattern,
463 .maxlen = CORENAME_MAX_SIZE, 418 .maxlen = CORENAME_MAX_SIZE,
464 .mode = 0644, 419 .mode = 0644,
465 .proc_handler = proc_dostring_coredump, 420 .proc_handler = proc_dostring,
466 }, 421 },
467 { 422 {
468 .procname = "core_pipe_limit", 423 .procname = "core_pipe_limit",
@@ -471,7 +426,6 @@ static struct ctl_table kern_table[] = {
471 .mode = 0644, 426 .mode = 0644,
472 .proc_handler = proc_dointvec, 427 .proc_handler = proc_dointvec,
473 }, 428 },
474#endif
475#ifdef CONFIG_PROC_SYSCTL 429#ifdef CONFIG_PROC_SYSCTL
476 { 430 {
477 .procname = "tainted", 431 .procname = "tainted",
@@ -606,7 +560,7 @@ static struct ctl_table kern_table[] = {
606 .extra2 = &one, 560 .extra2 = &one,
607 }, 561 },
608#endif 562#endif
609 563#ifdef CONFIG_HOTPLUG
610 { 564 {
611 .procname = "hotplug", 565 .procname = "hotplug",
612 .data = &uevent_helper, 566 .data = &uevent_helper,
@@ -614,7 +568,7 @@ static struct ctl_table kern_table[] = {
614 .mode = 0644, 568 .mode = 0644,
615 .proc_handler = proc_dostring, 569 .proc_handler = proc_dostring,
616 }, 570 },
617 571#endif
618#ifdef CONFIG_CHR_DEV_SG 572#ifdef CONFIG_CHR_DEV_SG
619 { 573 {
620 .procname = "sg-big-buff", 574 .procname = "sg-big-buff",
@@ -756,7 +710,7 @@ static struct ctl_table kern_table[] = {
756 .data = &dmesg_restrict, 710 .data = &dmesg_restrict,
757 .maxlen = sizeof(int), 711 .maxlen = sizeof(int),
758 .mode = 0644, 712 .mode = 0644,
759 .proc_handler = proc_dointvec_minmax_sysadmin, 713 .proc_handler = proc_dointvec_minmax,
760 .extra1 = &zero, 714 .extra1 = &zero,
761 .extra2 = &one, 715 .extra2 = &one,
762 }, 716 },
@@ -765,7 +719,7 @@ static struct ctl_table kern_table[] = {
765 .data = &kptr_restrict, 719 .data = &kptr_restrict,
766 .maxlen = sizeof(int), 720 .maxlen = sizeof(int),
767 .mode = 0644, 721 .mode = 0644,
768 .proc_handler = proc_dointvec_minmax_sysadmin, 722 .proc_handler = proc_dmesg_restrict,
769 .extra1 = &zero, 723 .extra1 = &zero,
770 .extra2 = &two, 724 .extra2 = &two,
771 }, 725 },
@@ -777,13 +731,6 @@ static struct ctl_table kern_table[] = {
777 .mode = 0444, 731 .mode = 0444,
778 .proc_handler = proc_dointvec, 732 .proc_handler = proc_dointvec,
779 }, 733 },
780 {
781 .procname = "cap_last_cap",
782 .data = (void *)&cap_last_cap,
783 .maxlen = sizeof(int),
784 .mode = 0444,
785 .proc_handler = proc_dointvec,
786 },
787#if defined(CONFIG_LOCKUP_DETECTOR) 734#if defined(CONFIG_LOCKUP_DETECTOR)
788 { 735 {
789 .procname = "watchdog", 736 .procname = "watchdog",
@@ -846,15 +793,6 @@ static struct ctl_table kern_table[] = {
846 .mode = 0644, 793 .mode = 0644,
847 .proc_handler = proc_dointvec, 794 .proc_handler = proc_dointvec,
848 }, 795 },
849#ifdef CONFIG_DEBUG_STACKOVERFLOW
850 {
851 .procname = "panic_on_stackoverflow",
852 .data = &sysctl_panic_on_stackoverflow,
853 .maxlen = sizeof(int),
854 .mode = 0644,
855 .proc_handler = proc_dointvec,
856 },
857#endif
858 { 796 {
859 .procname = "bootloader_type", 797 .procname = "bootloader_type",
860 .data = &bootloader_type, 798 .data = &bootloader_type,
@@ -1148,9 +1086,11 @@ static struct ctl_table vm_table[] = {
1148 .extra1 = &zero, 1086 .extra1 = &zero,
1149 }, 1087 },
1150 { 1088 {
1151 .procname = "nr_pdflush_threads", 1089 .procname = "nr_pdflush_threads",
1152 .mode = 0444 /* read-only */, 1090 .data = &nr_pdflush_threads,
1153 .proc_handler = pdflush_proc_obsolete, 1091 .maxlen = sizeof nr_pdflush_threads,
1092 .mode = 0444 /* read-only*/,
1093 .proc_handler = proc_dointvec,
1154 }, 1094 },
1155 { 1095 {
1156 .procname = "swappiness", 1096 .procname = "swappiness",
@@ -1250,6 +1190,13 @@ static struct ctl_table vm_table[] = {
1250 .extra1 = &zero, 1190 .extra1 = &zero,
1251 }, 1191 },
1252 { 1192 {
1193 .procname = "min_free_order_shift",
1194 .data = &min_free_order_shift,
1195 .maxlen = sizeof(min_free_order_shift),
1196 .mode = 0644,
1197 .proc_handler = &proc_dointvec
1198 },
1199 {
1253 .procname = "percpu_pagelist_fraction", 1200 .procname = "percpu_pagelist_fraction",
1254 .data = &percpu_pagelist_fraction, 1201 .data = &percpu_pagelist_fraction,
1255 .maxlen = sizeof(percpu_pagelist_fraction), 1202 .maxlen = sizeof(percpu_pagelist_fraction),
@@ -1545,29 +1492,11 @@ static struct ctl_table fs_table[] = {
1545#endif 1492#endif
1546#endif 1493#endif
1547 { 1494 {
1548 .procname = "protected_symlinks",
1549 .data = &sysctl_protected_symlinks,
1550 .maxlen = sizeof(int),
1551 .mode = 0600,
1552 .proc_handler = proc_dointvec_minmax,
1553 .extra1 = &zero,
1554 .extra2 = &one,
1555 },
1556 {
1557 .procname = "protected_hardlinks",
1558 .data = &sysctl_protected_hardlinks,
1559 .maxlen = sizeof(int),
1560 .mode = 0600,
1561 .proc_handler = proc_dointvec_minmax,
1562 .extra1 = &zero,
1563 .extra2 = &one,
1564 },
1565 {
1566 .procname = "suid_dumpable", 1495 .procname = "suid_dumpable",
1567 .data = &suid_dumpable, 1496 .data = &suid_dumpable,
1568 .maxlen = sizeof(int), 1497 .maxlen = sizeof(int),
1569 .mode = 0644, 1498 .mode = 0644,
1570 .proc_handler = proc_dointvec_minmax_coredump, 1499 .proc_handler = proc_dointvec_minmax,
1571 .extra1 = &zero, 1500 .extra1 = &zero,
1572 .extra2 = &two, 1501 .extra2 = &two,
1573 }, 1502 },
@@ -1590,7 +1519,8 @@ static struct ctl_table fs_table[] = {
1590}; 1519};
1591 1520
1592static struct ctl_table debug_table[] = { 1521static struct ctl_table debug_table[] = {
1593#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE 1522#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1523 defined(CONFIG_S390) || defined(CONFIG_TILE)
1594 { 1524 {
1595 .procname = "exception-trace", 1525 .procname = "exception-trace",
1596 .data = &show_unhandled_signals, 1526 .data = &show_unhandled_signals,
@@ -1617,15 +1547,490 @@ static struct ctl_table dev_table[] = {
1617 { } 1547 { }
1618}; 1548};
1619 1549
1620int __init sysctl_init(void) 1550static DEFINE_SPINLOCK(sysctl_lock);
1551
1552/* called under sysctl_lock */
1553static int use_table(struct ctl_table_header *p)
1554{
1555 if (unlikely(p->unregistering))
1556 return 0;
1557 p->used++;
1558 return 1;
1559}
1560
1561/* called under sysctl_lock */
1562static void unuse_table(struct ctl_table_header *p)
1563{
1564 if (!--p->used)
1565 if (unlikely(p->unregistering))
1566 complete(p->unregistering);
1567}
1568
1569/* called under sysctl_lock, will reacquire if has to wait */
1570static void start_unregistering(struct ctl_table_header *p)
1571{
1572 /*
1573 * if p->used is 0, nobody will ever touch that entry again;
1574 * we'll eliminate all paths to it before dropping sysctl_lock
1575 */
1576 if (unlikely(p->used)) {
1577 struct completion wait;
1578 init_completion(&wait);
1579 p->unregistering = &wait;
1580 spin_unlock(&sysctl_lock);
1581 wait_for_completion(&wait);
1582 spin_lock(&sysctl_lock);
1583 } else {
1584 /* anything non-NULL; we'll never dereference it */
1585 p->unregistering = ERR_PTR(-EINVAL);
1586 }
1587 /*
1588 * do not remove from the list until nobody holds it; walking the
1589 * list in do_sysctl() relies on that.
1590 */
1591 list_del_init(&p->ctl_entry);
1592}
1593
1594void sysctl_head_get(struct ctl_table_header *head)
1595{
1596 spin_lock(&sysctl_lock);
1597 head->count++;
1598 spin_unlock(&sysctl_lock);
1599}
1600
1601void sysctl_head_put(struct ctl_table_header *head)
1602{
1603 spin_lock(&sysctl_lock);
1604 if (!--head->count)
1605 kfree_rcu(head, rcu);
1606 spin_unlock(&sysctl_lock);
1607}
1608
1609struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1610{
1611 if (!head)
1612 BUG();
1613 spin_lock(&sysctl_lock);
1614 if (!use_table(head))
1615 head = ERR_PTR(-ENOENT);
1616 spin_unlock(&sysctl_lock);
1617 return head;
1618}
1619
1620void sysctl_head_finish(struct ctl_table_header *head)
1621{
1622 if (!head)
1623 return;
1624 spin_lock(&sysctl_lock);
1625 unuse_table(head);
1626 spin_unlock(&sysctl_lock);
1627}
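
use_table()/unuse_table() keep a per-header use count, and start_unregistering() parks the unregistering thread on a completion until that count drops to zero, so a table is never torn down under a concurrent reader. A condensed sketch of the calling pattern the proc side is expected to follow (the surrounding lookup code is not part of this hunk):

    struct ctl_table_header *head = sysctl_head_grab(h);   /* bumps ->used or fails        */

    if (IS_ERR(head))
            return PTR_ERR(head);                          /* header already unregistering */

    /* ->ctl_table may be dereferenced safely here; unregistration waits for us */

    sysctl_head_finish(head);                              /* drops ->used, may complete() */
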
1628
1629static struct ctl_table_set *
1630lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1631{
1632 struct ctl_table_set *set = &root->default_set;
1633 if (root->lookup)
1634 set = root->lookup(root, namespaces);
1635 return set;
1636}
1637
1638static struct list_head *
1639lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1640{
1641 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1642 return &set->list;
1643}
1644
1645struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1646 struct ctl_table_header *prev)
1647{
1648 struct ctl_table_root *root;
1649 struct list_head *header_list;
1650 struct ctl_table_header *head;
1651 struct list_head *tmp;
1652
1653 spin_lock(&sysctl_lock);
1654 if (prev) {
1655 head = prev;
1656 tmp = &prev->ctl_entry;
1657 unuse_table(prev);
1658 goto next;
1659 }
1660 tmp = &root_table_header.ctl_entry;
1661 for (;;) {
1662 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1663
1664 if (!use_table(head))
1665 goto next;
1666 spin_unlock(&sysctl_lock);
1667 return head;
1668 next:
1669 root = head->root;
1670 tmp = tmp->next;
1671 header_list = lookup_header_list(root, namespaces);
1672 if (tmp != header_list)
1673 continue;
1674
1675 do {
1676 root = list_entry(root->root_list.next,
1677 struct ctl_table_root, root_list);
1678 if (root == &sysctl_table_root)
1679 goto out;
1680 header_list = lookup_header_list(root, namespaces);
1681 } while (list_empty(header_list));
1682 tmp = header_list->next;
1683 }
1684out:
1685 spin_unlock(&sysctl_lock);
1686 return NULL;
1687}
1688
1689struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1690{
1691 return __sysctl_head_next(current->nsproxy, prev);
1692}
1693
1694void register_sysctl_root(struct ctl_table_root *root)
1695{
1696 spin_lock(&sysctl_lock);
1697 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1698 spin_unlock(&sysctl_lock);
1699}
1700
1701/*
1702 * sysctl_perm does NOT grant the superuser all rights automatically, because
1703 * some sysctl variables are readonly even to root.
1704 */
1705
1706static int test_perm(int mode, int op)
1707{
1708 if (!current_euid())
1709 mode >>= 6;
1710 else if (in_egroup_p(0))
1711 mode >>= 3;
1712 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1713 return 0;
1714 return -EACCES;
1715}
1716
1717int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1718{
1719 int mode;
1720
1721 if (root->permissions)
1722 mode = root->permissions(root, current->nsproxy, table);
1723 else
1724 mode = table->mode;
1725
1726 return test_perm(mode, op);
1727}
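
test_perm() reuses the familiar rwx triplet from the mode field: euid 0 is checked against the owner bits (mode >> 6), membership in gid 0 against the group bits (mode >> 3), and everyone else against the "other" bits. A standalone, compilable sketch of the same arithmetic, with the MAY_* values as defined in the kernel headers, to make the masking concrete:

    #include <stdio.h>

    #define MAY_EXEC  0x01
    #define MAY_WRITE 0x02
    #define MAY_READ  0x04

    static int test_perm(int mode, int op, int euid_is_root, int in_gid0)
    {
            if (euid_is_root)
                    mode >>= 6;                     /* owner bits */
            else if (in_gid0)
                    mode >>= 3;                     /* group bits */
            if ((op & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
                    return 0;
            return -13;                             /* -EACCES */
    }

    int main(void)
    {
            /* a 0644 entry: root may write, an unprivileged user may only read */
            printf("%d\n", test_perm(0644, MAY_WRITE, 1, 0));   /* 0   */
            printf("%d\n", test_perm(0644, MAY_WRITE, 0, 0));   /* -13 */
            return 0;
    }
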
1728
1729static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1621{ 1730{
1622 struct ctl_table_header *hdr; 1731 for (; table->procname; table++) {
1732 table->parent = parent;
1733 if (table->child)
1734 sysctl_set_parent(table, table->child);
1735 }
1736}
1623 1737
1624 hdr = register_sysctl_table(sysctl_base_table); 1738static __init int sysctl_init(void)
1625 kmemleak_not_leak(hdr); 1739{
1740 sysctl_set_parent(NULL, root_table);
1741#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1742 sysctl_check_table(current->nsproxy, root_table);
1743#endif
1626 return 0; 1744 return 0;
1627} 1745}
1628 1746
1747core_initcall(sysctl_init);
1748
1749static struct ctl_table *is_branch_in(struct ctl_table *branch,
1750 struct ctl_table *table)
1751{
1752 struct ctl_table *p;
1753 const char *s = branch->procname;
1754
1755 /* branch should have named subdirectory as its first element */
1756 if (!s || !branch->child)
1757 return NULL;
1758
1759 /* ... and nothing else */
1760 if (branch[1].procname)
1761 return NULL;
1762
1763 /* table should contain subdirectory with the same name */
1764 for (p = table; p->procname; p++) {
1765 if (!p->child)
1766 continue;
1767 if (p->procname && strcmp(p->procname, s) == 0)
1768 return p;
1769 }
1770 return NULL;
1771}
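
is_branch_in() only accepts a very specific shape: a table consisting of exactly one named directory entry (a procname plus a child pointer) followed by the sentinel, which it then tries to match against a subdirectory of the same name in the other table. Two illustrative tables, with invented names, showing what passes and what is rejected:

    static struct ctl_table net_children[] = { { } };   /* contents irrelevant here */

    /* accepted: one subdirectory entry, then the sentinel */
    static struct ctl_table branch_ok[] = {
            { .procname = "net", .mode = 0555, .child = net_children },
            { }
    };

    /* rejected: branch[1].procname is non-NULL, so the "... and nothing else" check fails */
    static struct ctl_table branch_rejected[] = {
            { .procname = "net",  .mode = 0555, .child = net_children },
            { .procname = "misc", .mode = 0644, .proc_handler = proc_dointvec },
            { }
    };
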
1772
1773/* see if attaching q to p would be an improvement */
1774static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1775{
1776 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1777 struct ctl_table *next;
1778 int is_better = 0;
1779 int not_in_parent = !p->attached_by;
1780
1781 while ((next = is_branch_in(by, to)) != NULL) {
1782 if (by == q->attached_by)
1783 is_better = 1;
1784 if (to == p->attached_by)
1785 not_in_parent = 1;
1786 by = by->child;
1787 to = next->child;
1788 }
1789
1790 if (is_better && not_in_parent) {
1791 q->attached_by = by;
1792 q->attached_to = to;
1793 q->parent = p;
1794 }
1795}
1796
1797/**
1798 * __register_sysctl_paths - register a sysctl hierarchy
1799 * @root: List of sysctl headers to register on
1800 * @namespaces: Data to compute which lists of sysctl entries are visible
1801 * @path: The path to the directory the sysctl table is in.
1802 * @table: the top-level table structure
1803 *
1804 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1805 * array. A completely 0 filled entry terminates the table.
1806 *
1807 * The members of the &struct ctl_table structure are used as follows:
1808 *
1809 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1810 * enter a sysctl file
1811 *
1812 * data - a pointer to data for use by proc_handler
1813 *
1814 * maxlen - the maximum size in bytes of the data
1815 *
1816 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1817 *
1818 * child - a pointer to the child sysctl table if this entry is a directory, or
1819 * %NULL.
1820 *
1821 * proc_handler - the text handler routine (described below)
1822 *
1823 * de - for internal use by the sysctl routines
1824 *
1825 * extra1, extra2 - extra pointers usable by the proc handler routines
1826 *
1827 * Leaf nodes in the sysctl tree will be represented by a single file
1828 * under /proc; non-leaf nodes will be represented by directories.
1829 *
1830 * sysctl(2) can automatically manage read and write requests through
1831 * the sysctl table. The data and maxlen fields of the ctl_table
1832 * struct enable minimal validation of the values being written to be
1833 * performed, and the mode field allows minimal authentication.
1834 *
1835 * There must be a proc_handler routine for any terminal nodes
1836 * mirrored under /proc/sys (non-terminals are handled by a built-in
1837 * directory handler). Several default handlers are available to
1838 * cover common cases -
1839 *
1840 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1841 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1842 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1843 *
1844 * It is the handler's job to read the input buffer from user memory
1845 * and process it. The handler should return 0 on success.
1846 *
1847 * This routine returns %NULL on a failure to register, and a pointer
1848 * to the table header on success.
1849 */
1850struct ctl_table_header *__register_sysctl_paths(
1851 struct ctl_table_root *root,
1852 struct nsproxy *namespaces,
1853 const struct ctl_path *path, struct ctl_table *table)
1854{
1855 struct ctl_table_header *header;
1856 struct ctl_table *new, **prevp;
1857 unsigned int n, npath;
1858 struct ctl_table_set *set;
1859
1860 /* Count the path components */
1861 for (npath = 0; path[npath].procname; ++npath)
1862 ;
1863
1864 /*
1865 * For each path component, allocate a 2-element ctl_table array.
1866 * The first array element will be filled with the sysctl entry
1867 * for this, the second will be the sentinel (procname == 0).
1868 *
1869 * We allocate everything in one go so that we don't have to
1870 * worry about freeing additional memory in unregister_sysctl_table.
1871 */
1872 header = kzalloc(sizeof(struct ctl_table_header) +
1873 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1874 if (!header)
1875 return NULL;
1876
1877 new = (struct ctl_table *) (header + 1);
1878
1879 /* Now connect the dots */
1880 prevp = &header->ctl_table;
1881 for (n = 0; n < npath; ++n, ++path) {
1882 /* Copy the procname */
1883 new->procname = path->procname;
1884 new->mode = 0555;
1885
1886 *prevp = new;
1887 prevp = &new->child;
1888
1889 new += 2;
1890 }
1891 *prevp = table;
1892 header->ctl_table_arg = table;
1893
1894 INIT_LIST_HEAD(&header->ctl_entry);
1895 header->used = 0;
1896 header->unregistering = NULL;
1897 header->root = root;
1898 sysctl_set_parent(NULL, header->ctl_table);
1899 header->count = 1;
1900#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1901 if (sysctl_check_table(namespaces, header->ctl_table)) {
1902 kfree(header);
1903 return NULL;
1904 }
1905#endif
1906 spin_lock(&sysctl_lock);
1907 header->set = lookup_header_set(root, namespaces);
1908 header->attached_by = header->ctl_table;
1909 header->attached_to = root_table;
1910 header->parent = &root_table_header;
1911 for (set = header->set; set; set = set->parent) {
1912 struct ctl_table_header *p;
1913 list_for_each_entry(p, &set->list, ctl_entry) {
1914 if (p->unregistering)
1915 continue;
1916 try_attach(p, header);
1917 }
1918 }
1919 header->parent->count++;
1920 list_add_tail(&header->ctl_entry, &header->set->list);
1921 spin_unlock(&sysctl_lock);
1922
1923 return header;
1924}
1925
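
The kernel-doc above describes how each ctl_table field is consumed. A minimal sketch of a registration built on that description, using invented names and bounds: register_sysctl_paths() builds the intermediate "kernel/example" directories from the path array, and unregister_sysctl_table() tears everything down again.

    static int example_value;
    static int ex_min, ex_max = 100;
    static struct ctl_table_header *example_header;

    static struct ctl_table example_table[] = {
            {
                    .procname     = "value",
                    .data         = &example_value,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = proc_dointvec_minmax,
                    .extra1       = &ex_min,
                    .extra2       = &ex_max,
            },
            { }     /* completely zero-filled sentinel terminates the table */
    };

    static const struct ctl_path example_path[] = {
            { .procname = "kernel" },
            { .procname = "example" },
            { }
    };

    /* init:  example_header = register_sysctl_paths(example_path, example_table); */
    /* exit:  unregister_sysctl_table(example_header);                             */
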
1926/**
1927 * register_sysctl_table_path - register a sysctl table hierarchy
1928 * @path: The path to the directory the sysctl table is in.
1929 * @table: the top-level table structure
1930 *
1931 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1932 * array. A completely 0 filled entry terminates the table.
1933 *
1934 * See __register_sysctl_paths for more details.
1935 */
1936struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1937 struct ctl_table *table)
1938{
1939 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1940 path, table);
1941}
1942
1943/**
1944 * register_sysctl_table - register a sysctl table hierarchy
1945 * @table: the top-level table structure
1946 *
1947 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1948 * array. A completely 0 filled entry terminates the table.
1949 *
1950 * See register_sysctl_paths for more details.
1951 */
1952struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1953{
1954 static const struct ctl_path null_path[] = { {} };
1955
1956 return register_sysctl_paths(null_path, table);
1957}
1958
1959/**
1960 * unregister_sysctl_table - unregister a sysctl table hierarchy
1961 * @header: the header returned from register_sysctl_table
1962 *
1963 * Unregisters the sysctl table and all children. proc entries may not
1964 * actually be removed until they are no longer used by anyone.
1965 */
1966void unregister_sysctl_table(struct ctl_table_header * header)
1967{
1968 might_sleep();
1969
1970 if (header == NULL)
1971 return;
1972
1973 spin_lock(&sysctl_lock);
1974 start_unregistering(header);
1975 if (!--header->parent->count) {
1976 WARN_ON(1);
1977 kfree_rcu(header->parent, rcu);
1978 }
1979 if (!--header->count)
1980 kfree_rcu(header, rcu);
1981 spin_unlock(&sysctl_lock);
1982}
1983
1984int sysctl_is_seen(struct ctl_table_header *p)
1985{
1986 struct ctl_table_set *set = p->set;
1987 int res;
1988 spin_lock(&sysctl_lock);
1989 if (p->unregistering)
1990 res = 0;
1991 else if (!set->is_seen)
1992 res = 1;
1993 else
1994 res = set->is_seen(set);
1995 spin_unlock(&sysctl_lock);
1996 return res;
1997}
1998
1999void setup_sysctl_set(struct ctl_table_set *p,
2000 struct ctl_table_set *parent,
2001 int (*is_seen)(struct ctl_table_set *))
2002{
2003 INIT_LIST_HEAD(&p->list);
2004 p->parent = parent ? parent : &sysctl_table_root.default_set;
2005 p->is_seen = is_seen;
2006}
2007
2008#else /* !CONFIG_SYSCTL */
2009struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2010{
2011 return NULL;
2012}
2013
2014struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2015 struct ctl_table *table)
2016{
2017 return NULL;
2018}
2019
2020void unregister_sysctl_table(struct ctl_table_header * table)
2021{
2022}
2023
2024void setup_sysctl_set(struct ctl_table_set *p,
2025 struct ctl_table_set *parent,
2026 int (*is_seen)(struct ctl_table_set *))
2027{
2028}
2029
2030void sysctl_head_put(struct ctl_table_header *head)
2031{
2032}
2033
1629#endif /* CONFIG_SYSCTL */ 2034#endif /* CONFIG_SYSCTL */
1630 2035
1631/* 2036/*
@@ -2014,7 +2419,7 @@ static int proc_taint(struct ctl_table *table, int write,
2014} 2419}
2015 2420
2016#ifdef CONFIG_PRINTK 2421#ifdef CONFIG_PRINTK
2017static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, 2422static int proc_dmesg_restrict(struct ctl_table *table, int write,
2018 void __user *buffer, size_t *lenp, loff_t *ppos) 2423 void __user *buffer, size_t *lenp, loff_t *ppos)
2019{ 2424{
2020 if (write && !capable(CAP_SYS_ADMIN)) 2425 if (write && !capable(CAP_SYS_ADMIN))
@@ -2080,38 +2485,6 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2080 do_proc_dointvec_minmax_conv, &param); 2485 do_proc_dointvec_minmax_conv, &param);
2081} 2486}
2082 2487
2083static void validate_coredump_safety(void)
2084{
2085#ifdef CONFIG_COREDUMP
2086 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2087 core_pattern[0] != '/' && core_pattern[0] != '|') {
2088 printk(KERN_WARNING "Unsafe core_pattern used with "\
2089 "suid_dumpable=2. Pipe handler or fully qualified "\
2090 "core dump path required.\n");
2091 }
2092#endif
2093}
2094
2095static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2096 void __user *buffer, size_t *lenp, loff_t *ppos)
2097{
2098 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2099 if (!error)
2100 validate_coredump_safety();
2101 return error;
2102}
2103
2104#ifdef CONFIG_COREDUMP
2105static int proc_dostring_coredump(struct ctl_table *table, int write,
2106 void __user *buffer, size_t *lenp, loff_t *ppos)
2107{
2108 int error = proc_dostring(table, write, buffer, lenp, ppos);
2109 if (!error)
2110 validate_coredump_safety();
2111 return error;
2112}
2113#endif
2114
2115static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2488static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2116 void __user *buffer, 2489 void __user *buffer,
2117 size_t *lenp, loff_t *ppos, 2490 size_t *lenp, loff_t *ppos,
@@ -2499,7 +2872,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2499 } 2872 }
2500 } 2873 }
2501 2874
2502 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); 2875 while (val_a <= val_b)
2876 set_bit(val_a++, tmp_bitmap);
2877
2503 first = 0; 2878 first = 0;
2504 proc_skip_char(&kbuf, &left, '\n'); 2879 proc_skip_char(&kbuf, &left, '\n');
2505 } 2880 }
@@ -2542,7 +2917,8 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2542 if (*ppos) 2917 if (*ppos)
2543 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2918 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2544 else 2919 else
2545 bitmap_copy(bitmap, tmp_bitmap, bitmap_len); 2920 memcpy(bitmap, tmp_bitmap,
2921 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2546 } 2922 }
2547 kfree(tmp_bitmap); 2923 kfree(tmp_bitmap);
2548 *lenp -= left; 2924 *lenp -= left;
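
The two replacements above swap the bitmap helpers for their open-coded equivalents: bitmap_set(tmp, a, b - a + 1) becomes a set_bit() loop over the inclusive range, and bitmap_copy() becomes a memcpy() sized in longs. A standalone userspace sketch of the same two operations over an unsigned-long array (BITS_TO_LONGS is redefined locally so it compiles outside the kernel):

    #include <stdio.h>
    #include <string.h>

    #define BITS_PER_LONG    (8 * sizeof(unsigned long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    static void set_range(unsigned long *map, unsigned long a, unsigned long b)
    {
            while (a <= b) {                          /* mirrors the while loop in the hunk */
                    map[a / BITS_PER_LONG] |= 1UL << (a % BITS_PER_LONG);
                    a++;
            }
    }

    int main(void)
    {
            unsigned long src[BITS_TO_LONGS(128)] = { 0 };
            unsigned long dst[BITS_TO_LONGS(128)];

            set_range(src, 3, 10);                    /* bitmap_set(src, 3, 10 - 3 + 1)      */
            memcpy(dst, src, BITS_TO_LONGS(128) * sizeof(unsigned long));   /* bitmap_copy  */
            printf("%#lx\n", dst[0]);                 /* 0x7f8: bits 3..10 set               */
            return 0;
    }
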
@@ -2620,3 +2996,6 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
2620EXPORT_SYMBOL(proc_dostring); 2996EXPORT_SYMBOL(proc_dostring);
2621EXPORT_SYMBOL(proc_doulongvec_minmax); 2997EXPORT_SYMBOL(proc_doulongvec_minmax);
2622EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2998EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
2999EXPORT_SYMBOL(register_sysctl_table);
3000EXPORT_SYMBOL(register_sysctl_paths);
3001EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a638445050..2ce1b308672 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ 150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ 217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
@@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1344 goto out_putname;
1345 } 1345 }
1346 1346
1347 mnt = task_active_pid_ns(current)->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1350 if (IS_ERR(file))
diff --git a/kernel/task_work.c b/kernel/task_work.c
deleted file mode 100644
index 65bd3c92d6f..00000000000
--- a/kernel/task_work.c
+++ /dev/null
@@ -1,92 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/task_work.h>
3#include <linux/tracehook.h>
4
5static struct callback_head work_exited; /* all we need is ->next == NULL */
6
7int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{
10 struct callback_head *head;
11
12 do {
13 head = ACCESS_ONCE(task->task_works);
14 if (unlikely(head == &work_exited))
15 return -ESRCH;
16 work->next = head;
17 } while (cmpxchg(&task->task_works, head, work) != head);
18
19 if (notify)
20 set_notify_resume(task);
21 return 0;
22}
23
24struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func)
26{
27 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL;
29 unsigned long flags;
30 /*
31 * If cmpxchg() fails we continue without updating pprev.
32 * Either we raced with task_work_add() which added the
33 * new entry before this work, we will find it again. Or
34 * we raced with task_work_run(), *pprev == NULL/exited.
35 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends();
39 if (work->func != func)
40 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work)
42 break;
43 }
44 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
45
46 return work;
47}
48
49void task_work_run(void)
50{
51 struct task_struct *task = current;
52 struct callback_head *work, *head, *next;
53
54 for (;;) {
55 /*
56 * work->func() can do task_work_add(), do not set
57 * work_exited unless the list is empty.
58 */
59 do {
60 work = ACCESS_ONCE(task->task_works);
61 head = !work && (task->flags & PF_EXITING) ?
62 &work_exited : NULL;
63 } while (cmpxchg(&task->task_works, work, head) != work);
64
65 if (!work)
66 break;
67 /*
68 * Synchronize with task_work_cancel(). It can't remove
69 * the first entry == work, cmpxchg(task_works) should
70 * fail, but it can play with *work and other entries.
71 */
72 raw_spin_unlock_wait(&task->pi_lock);
73 smp_mb();
74
75 /* Reverse the list to run the works in fifo order */
76 head = NULL;
77 do {
78 next = work->next;
79 work->next = head;
80 head = work;
81 work = next;
82 } while (work);
83
84 work = head;
85 do {
86 next = work->next;
87 work->func(work);
88 work = next;
89 cond_resched();
90 } while (work);
91 }
92}
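
The deleted kernel/task_work.c keeps per-task callbacks on a lock-free singly linked list: task_work_add() pushes with cmpxchg(), and task_work_run() detaches the whole list atomically, reverses it, and invokes the callbacks in FIFO order when the task returns to user space (or exits). A condensed sketch of how a caller queues work against this API; the payload structure and callback are invented for illustration:

    struct my_payload {
            struct callback_head cb;
            int value;
    };

    static void my_callback(struct callback_head *work)
    {
            struct my_payload *p = container_of(work, struct my_payload, cb);

            /* runs in the target task's context from task_work_run() */
            kfree(p);
    }

    static int queue_cleanup(struct task_struct *task, int value)
    {
            struct my_payload *p = kmalloc(sizeof(*p), GFP_KERNEL);

            if (!p)
                    return -ENOMEM;
            p->value = value;
            p->cb.func = my_callback;                  /* what init_task_work() does     */
            if (task_work_add(task, &p->cb, true)) {   /* true: notify on return to user */
                    kfree(p);                          /* -ESRCH: task already exiting   */
                    return -ESRCH;
            }
            return 0;
    }
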
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 145bb4d3bd4..e66046456f4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,7 +27,6 @@
27#include <linux/cgroup.h> 27#include <linux/cgroup.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/pid_namespace.h>
31#include <net/genetlink.h> 30#include <net/genetlink.h>
32#include <linux/atomic.h> 31#include <linux/atomic.h>
33 32
@@ -175,9 +174,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
175 up_write(&listeners->sem); 174 up_write(&listeners->sem);
176} 175}
177 176
178static void fill_stats(struct user_namespace *user_ns, 177static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
179 struct pid_namespace *pid_ns,
180 struct task_struct *tsk, struct taskstats *stats)
181{ 178{
182 memset(stats, 0, sizeof(*stats)); 179 memset(stats, 0, sizeof(*stats));
183 /* 180 /*
@@ -193,7 +190,7 @@ static void fill_stats(struct user_namespace *user_ns,
193 stats->version = TASKSTATS_VERSION; 190 stats->version = TASKSTATS_VERSION;
194 stats->nvcsw = tsk->nvcsw; 191 stats->nvcsw = tsk->nvcsw;
195 stats->nivcsw = tsk->nivcsw; 192 stats->nivcsw = tsk->nivcsw;
196 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 193 bacct_add_tsk(stats, tsk);
197 194
198 /* fill in extended acct fields */ 195 /* fill in extended acct fields */
199 xacct_add_tsk(stats, tsk); 196 xacct_add_tsk(stats, tsk);
@@ -210,7 +207,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
210 rcu_read_unlock(); 207 rcu_read_unlock();
211 if (!tsk) 208 if (!tsk)
212 return -ESRCH; 209 return -ESRCH;
213 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 210 fill_stats(tsk, stats);
214 put_task_struct(tsk); 211 put_task_struct(tsk);
215 return 0; 212 return 0;
216} 213}
@@ -294,12 +291,6 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
295 return -EINVAL; 292 return -EINVAL;
296 293
297 if (current_user_ns() != &init_user_ns)
298 return -EINVAL;
299
300 if (task_active_pid_ns(current) != &init_pid_ns)
301 return -EINVAL;
302
303 if (isadd == REGISTER) { 294 if (isadd == REGISTER) {
304 for_each_cpu(cpu, mask) { 295 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), 296 s = kmalloc_node(sizeof(struct listener),
@@ -424,15 +415,16 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
424 struct nlattr *na; 415 struct nlattr *na;
425 size_t size; 416 size_t size;
426 u32 fd; 417 u32 fd;
427 struct fd f; 418 struct file *file;
419 int fput_needed;
428 420
429 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 421 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
430 if (!na) 422 if (!na)
431 return -EINVAL; 423 return -EINVAL;
432 424
433 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 425 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
434 f = fdget(fd); 426 file = fget_light(fd, &fput_needed);
435 if (!f.file) 427 if (!file)
436 return 0; 428 return 0;
437 429
438 size = nla_total_size(sizeof(struct cgroupstats)); 430 size = nla_total_size(sizeof(struct cgroupstats));
@@ -444,16 +436,10 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
444 436
445 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
446 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
447 if (na == NULL) {
448 nlmsg_free(rep_skb);
449 rc = -EMSGSIZE;
450 goto err;
451 }
452
453 stats = nla_data(na); 439 stats = nla_data(na);
454 memset(stats, 0, sizeof(*stats)); 440 memset(stats, 0, sizeof(*stats));
455 441
456 rc = cgroupstats_build(stats, f.file->f_dentry); 442 rc = cgroupstats_build(stats, file->f_dentry);
457 if (rc < 0) { 443 if (rc < 0) {
458 nlmsg_free(rep_skb); 444 nlmsg_free(rep_skb);
459 goto err; 445 goto err;
@@ -462,7 +448,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
462 rc = send_reply(rep_skb, info); 448 rc = send_reply(rep_skb, info);
463 449
464err: 450err:
465 fdput(f); 451 fput_light(file, fput_needed);
466 return rc; 452 return rc;
467} 453}
468 454
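
The cgroupstats hunk above trades the struct fd/fdget() wrappers for the older fget_light()/fput_light() pair, which skip the reference-count bump when the file table is not shared. The idiom in isolation, using the same calls as the hunk with the surrounding netlink code elided:

    struct file *file;
    int fput_needed;

    file = fget_light(fd, &fput_needed);   /* no atomic ref if the fd table is unshared */
    if (!file)
            return 0;

    /* ... build the reply from file->f_dentry ... */

    fput_light(file, fput_needed);         /* drops the reference only if one was taken */
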
@@ -476,7 +462,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)
476 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 462 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
477 if (rc < 0) 463 if (rc < 0)
478 goto out; 464 goto out;
479 rc = add_del_listener(info->snd_portid, mask, REGISTER); 465 rc = add_del_listener(info->snd_pid, mask, REGISTER);
480out: 466out:
481 free_cpumask_var(mask); 467 free_cpumask_var(mask);
482 return rc; 468 return rc;
@@ -492,7 +478,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)
492 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 478 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
493 if (rc < 0) 479 if (rc < 0)
494 goto out; 480 goto out;
495 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 481 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
496out: 482out:
497 free_cpumask_var(mask); 483 free_cpumask_var(mask);
498 return rc; 484 return rc;
@@ -640,12 +626,11 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
640 if (rc < 0) 626 if (rc < 0)
641 return; 627 return;
642 628
643 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 629 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
644 task_pid_nr_ns(tsk, &init_pid_ns));
645 if (!stats) 630 if (!stats)
646 goto err; 631 goto err;
647 632
648 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 633 fill_stats(tsk, stats);
649 634
650 /* 635 /*
651 * Doesn't matter if tsk is the leader or the last group member leaving 636 * Doesn't matter if tsk is the leader or the last group member leaving
@@ -653,8 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
653 if (!is_thread_group || !group_dead) 638 if (!is_thread_group || !group_dead)
654 goto send; 639 goto send;
655 640
656 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 641 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
657 task_tgid_nr_ns(tsk, &init_pid_ns));
658 if (!stats) 642 if (!stats)
659 goto err; 643 goto err;
660 644
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd2..d7760621452 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,10 +27,10 @@
27 * with nanosecond accuracy 27 * with nanosecond accuracy
28 */ 28 */
29 29
30#include <linux/export.h> 30#include <linux/module.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/timekeeper_internal.h> 33#include <linux/clocksource.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
@@ -163,6 +163,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
166 sys_tz = *tz; 167 sys_tz = *tz;
167 update_vsyscall_tz(); 168 update_vsyscall_tz();
168 if (firsttime) { 169 if (firsttime) {
@@ -172,7 +173,12 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
172 } 173 }
173 } 174 }
174 if (tv) 175 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
175 return do_settimeofday(tv); 180 return do_settimeofday(tv);
181 }
176 return 0; 182 return 0;
177} 183}
178 184
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db126..f06a8a36564 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,63 +1,6 @@
1# 1#
2# Timer subsystem related configuration options 2# Timer subsystem related configuration options
3# 3#
4
5# Options selectable by arch Kconfig
6
7# Watchdog function for clocksources to detect instabilities
8config CLOCKSOURCE_WATCHDOG
9 bool
10
11# Architecture has extra clocksource data
12config ARCH_CLOCKSOURCE_DATA
13 bool
14
15# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL
17 bool
18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
23# ktime_t scalar 64bit nsec representation
24config KTIME_SCALAR
25 bool
26
27# Old style timekeeping
28config ARCH_USES_GETTIMEOFFSET
29 bool
30
31# The generic clock events infrastructure
32config GENERIC_CLOCKEVENTS
33 bool
34
35# Migration helper. Builds, but does not invoke
36config GENERIC_CLOCKEVENTS_BUILD
37 bool
38 default y
39 depends on GENERIC_CLOCKEVENTS
40
41# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST
43 bool
44 depends on GENERIC_CLOCKEVENTS
45
46# Automatically adjust the min. reprogramming time for
47# clock event device
48config GENERIC_CLOCKEVENTS_MIN_ADJUST
49 bool
50
51# Generic update of CMOS clock
52config GENERIC_CMOS_UPDATE
53 bool
54
55if GENERIC_CLOCKEVENTS
56menu "Timers subsystem"
57
58# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
59# only related to the tick functionality. Oneshot clockevent devices
60# are supported independ of this.
61config TICK_ONESHOT 4config TICK_ONESHOT
62 bool 5 bool
63 6
@@ -79,5 +22,8 @@ config HIGH_RES_TIMERS
79 hardware is not capable then this option only increases 22 hardware is not capable then this option only increases
80 the size of the kernel image. 23 the size of the kernel image.
81 24
82endmenu 25config GENERIC_CLOCKEVENTS_BUILD
83endif 26 bool
27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
29
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ff7d9d2ab50..cae2ad7491b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o #alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b1294..8b70c76910a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,6 +37,7 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
40 ktime_t (*gettime)(void); 41 ktime_t (*gettime)(void);
41 clockid_t base_clockid; 42 clockid_t base_clockid;
42} alarm_bases[ALARM_NUMTYPE]; 43} alarm_bases[ALARM_NUMTYPE];
@@ -45,8 +46,6 @@ static struct alarm_base {
45static ktime_t freezer_delta; 46static ktime_t freezer_delta;
46static DEFINE_SPINLOCK(freezer_delta_lock); 47static DEFINE_SPINLOCK(freezer_delta_lock);
47 48
48static struct wakeup_source *ws;
49
50#ifdef CONFIG_RTC_CLASS 49#ifdef CONFIG_RTC_CLASS
51/* rtc timer and device for setting alarm wakeups at suspend */ 50/* rtc timer and device for setting alarm wakeups at suspend */
52static struct rtc_timer rtctimer; 51static struct rtc_timer rtctimer;
@@ -54,112 +53,108 @@ static struct rtc_device *rtcdev;
54static DEFINE_SPINLOCK(rtcdev_lock); 53static DEFINE_SPINLOCK(rtcdev_lock);
55 54
56/** 55/**
57 * alarmtimer_get_rtcdev - Return selected rtcdevice 56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
58 * 59 *
59 * This function returns the rtc device to use for wakealarms. 60 * This helper function checks to see if the rtc device can wake
60 * If one has not already been chosen, it checks to see if a 61 * from suspend.
61 * functional rtc device is available.
62 */ 62 */
63struct rtc_device *alarmtimer_get_rtcdev(void) 63static int has_wakealarm(struct device *dev, void *name_ptr)
64{ 64{
65 unsigned long flags; 65 struct rtc_device *candidate = to_rtc_device(dev);
66 struct rtc_device *ret;
67 66
68 spin_lock_irqsave(&rtcdev_lock, flags); 67 if (!candidate->ops->set_alarm)
69 ret = rtcdev; 68 return 0;
70 spin_unlock_irqrestore(&rtcdev_lock, flags); 69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71 71
72 return ret; 72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
73} 74}
74 75
75 76/**
76static int alarmtimer_rtc_add_device(struct device *dev, 77 * alarmtimer_get_rtcdev - Return selected rtcdevice
77 struct class_interface *class_intf) 78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
78{ 84{
85 struct device *dev;
86 char *str;
79 unsigned long flags; 87 unsigned long flags;
80 struct rtc_device *rtc = to_rtc_device(dev); 88 struct rtc_device *ret;
81
82 if (rtcdev)
83 return -EBUSY;
84
85 if (!rtc->ops->set_alarm)
86 return -1;
87 if (!device_may_wakeup(rtc->dev.parent))
88 return -1;
89 89
90 spin_lock_irqsave(&rtcdev_lock, flags); 90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) { 91 if (!rtcdev) {
92 rtcdev = rtc; 92 /* Find an rtc device and init the rtc_timer */
93 /* hold a reference so it doesn't go away */ 93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 get_device(dev); 94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
95 } 104 }
105 ret = rtcdev;
96 spin_unlock_irqrestore(&rtcdev_lock, flags); 106 spin_unlock_irqrestore(&rtcdev_lock, flags);
97 return 0;
98}
99
100static inline void alarmtimer_rtc_timer_init(void)
101{
102 rtc_timer_init(&rtctimer, NULL, NULL);
103}
104
105static struct class_interface alarmtimer_rtc_interface = {
106 .add_dev = &alarmtimer_rtc_add_device,
107};
108 107
109static int alarmtimer_rtc_interface_setup(void) 108 return ret;
110{
111 alarmtimer_rtc_interface.class = rtc_class;
112 return class_interface_register(&alarmtimer_rtc_interface);
113}
114static void alarmtimer_rtc_interface_remove(void)
115{
116 class_interface_unregister(&alarmtimer_rtc_interface);
117} 109}
118#else 110#else
119struct rtc_device *alarmtimer_get_rtcdev(void) 111#define alarmtimer_get_rtcdev() (0)
120{ 112#define rtcdev (0)
121 return NULL;
122}
123#define rtcdev (NULL)
124static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
125static inline void alarmtimer_rtc_interface_remove(void) { }
126static inline void alarmtimer_rtc_timer_init(void) { }
127#endif 113#endif
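
With this hunk the RTC used for wake alarms is again discovered lazily: class_find_device() walks rtc_class with has_wakealarm() as the match callback, and the first device that implements set_alarm and may wake the system is opened by name. The selection step on its own, using the same helpers as the hunk (a sketch, not a complete driver):

    static struct rtc_device *pick_wakealarm_rtc(void)
    {
            struct device *dev;
            char *name = NULL;

            /* has_wakealarm() stores dev_name(dev) through the void * cookie */
            dev = class_find_device(rtc_class, NULL, &name, has_wakealarm);
            if (!dev)
                    return NULL;

            put_device(dev);                /* rtc_class_open() takes its own reference */
            return rtc_class_open(name);
    }
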
128 114
115
129/** 116/**
130 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue 117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
131 * @base: pointer to the base where the timer is being run 118 * @base: pointer to the base where the timer is being run
132 * @alarm: pointer to alarm being enqueued. 119 * @alarm: pointer to alarm being enqueued.
133 * 120 *
 134 * Adds alarm to an alarm_base timerqueue 121 * Adds alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
135 * 123 *
136 * Must hold base->lock when calling. 124 * Must hold base->lock when calling.
137 */ 125 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 127{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
143 timerqueue_add(&base->timerqueue, &alarm->node); 128 timerqueue_add(&base->timerqueue, &alarm->node);
144 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
145} 134}
146 135
147/** 136/**
148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue 137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
149 * @base: pointer to the base where the timer is running 138 * @base: pointer to the base where the timer is running
150 * @alarm: pointer to alarm being removed 139 * @alarm: pointer to alarm being removed
151 * 140 *
 152 * Removes alarm from an alarm_base timerqueue 141 * Removes alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
153 * 143 *
154 * Must hold base->lock when calling. 144 * Must hold base->lock when calling.
155 */ 145 */
156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) 146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
157{ 147{
158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
159 return;
160 149
161 timerqueue_del(&base->timerqueue, &alarm->node); 150 timerqueue_del(&base->timerqueue, &alarm->node);
162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
163} 158}
164 159
165 160
@@ -174,23 +169,39 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
174 */ 169 */
175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
176{ 171{
177 struct alarm *alarm = container_of(timer, struct alarm, timer); 172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
178 struct alarm_base *base = &alarm_bases[alarm->type]; 173 struct timerqueue_node *next;
179 unsigned long flags; 174 unsigned long flags;
175 ktime_t now;
180 int ret = HRTIMER_NORESTART; 176 int ret = HRTIMER_NORESTART;
181 int restart = ALARMTIMER_NORESTART;
182 177
183 spin_lock_irqsave(&base->lock, flags); 178 spin_lock_irqsave(&base->lock, flags);
184 alarmtimer_dequeue(base, alarm); 179 now = base->gettime();
185 spin_unlock_irqrestore(&base->lock, flags); 180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
186 183
187 if (alarm->function) 184 if (expired.tv64 > now.tv64)
188 restart = alarm->function(alarm, base->gettime()); 185 break;
189 186
190 spin_lock_irqsave(&base->lock, flags); 187 alarm = container_of(next, struct alarm, node);
191 if (restart != ALARMTIMER_NORESTART) { 188
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires); 189 timerqueue_del(&base->timerqueue, &alarm->node);
193 alarmtimer_enqueue(base, alarm); 190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
194 ret = HRTIMER_RESTART; 205 ret = HRTIMER_RESTART;
195 } 206 }
196 spin_unlock_irqrestore(&base->lock, flags); 207 spin_unlock_irqrestore(&base->lock, flags);
@@ -217,14 +228,13 @@ static int alarmtimer_suspend(struct device *dev)
217 unsigned long flags; 228 unsigned long flags;
218 struct rtc_device *rtc; 229 struct rtc_device *rtc;
219 int i; 230 int i;
220 int ret;
221 231
222 spin_lock_irqsave(&freezer_delta_lock, flags); 232 spin_lock_irqsave(&freezer_delta_lock, flags);
223 min = freezer_delta; 233 min = freezer_delta;
224 freezer_delta = ktime_set(0, 0); 234 freezer_delta = ktime_set(0, 0);
225 spin_unlock_irqrestore(&freezer_delta_lock, flags); 235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
226 236
227 rtc = alarmtimer_get_rtcdev(); 237 rtc = rtcdev;
228 /* If we have no rtcdev, just return */ 238 /* If we have no rtcdev, just return */
229 if (!rtc) 239 if (!rtc)
230 return 0; 240 return 0;
@@ -247,10 +257,8 @@ static int alarmtimer_suspend(struct device *dev)
247 if (min.tv64 == 0) 257 if (min.tv64 == 0)
248 return 0; 258 return 0;
249 259
250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { 260 /* XXX - Should we enforce a minimum sleep time? */
251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); 261 WARN_ON(min.tv64 < NSEC_PER_SEC);
252 return -EBUSY;
253 }
254 262
255 /* Setup an rtc timer to fire that far in the future */ 263 /* Setup an rtc timer to fire that far in the future */
256 rtc_timer_cancel(rtc, &rtctimer); 264 rtc_timer_cancel(rtc, &rtctimer);
@@ -258,11 +266,9 @@ static int alarmtimer_suspend(struct device *dev)
258 now = rtc_tm_to_ktime(tm); 266 now = rtc_tm_to_ktime(tm);
259 now = ktime_add(now, min); 267 now = ktime_add(now, min);
260 268
261 /* Set alarm, if in the past reject suspend briefly to handle */ 269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 270
263 if (ret < 0) 271 return 0;
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
266} 272}
267#else 273#else
268static int alarmtimer_suspend(struct device *dev) 274static int alarmtimer_suspend(struct device *dev)
@@ -293,110 +299,53 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
293 * @function: callback that is run when the alarm fires 299 * @function: callback that is run when the alarm fires
294 */ 300 */
295void alarm_init(struct alarm *alarm, enum alarmtimer_type type, 301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 302 void (*function)(struct alarm *))
297{ 303{
298 timerqueue_init(&alarm->node); 304 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, 305 alarm->period = ktime_set(0, 0);
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
302 alarm->function = function; 306 alarm->function = function;
303 alarm->type = type; 307 alarm->type = type;
304 alarm->state = ALARMTIMER_STATE_INACTIVE; 308 alarm->enabled = 0;
305} 309}
306 310
307/** 311/**
308 * alarm_start - Sets an alarm to fire 312 * alarm_start - Sets an alarm to fire
309 * @alarm: ptr to alarm to set 313 * @alarm: ptr to alarm to set
310 * @start: time to run the alarm 314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
311 */ 316 */
312int alarm_start(struct alarm *alarm, ktime_t start) 317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
313{ 318{
314 struct alarm_base *base = &alarm_bases[alarm->type]; 319 struct alarm_base *base = &alarm_bases[alarm->type];
315 unsigned long flags; 320 unsigned long flags;
316 int ret;
317 321
318 spin_lock_irqsave(&base->lock, flags); 322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
319 alarm->node.expires = start; 325 alarm->node.expires = start;
326 alarm->period = period;
320 alarmtimer_enqueue(base, alarm); 327 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires, 328 alarm->enabled = 1;
322 HRTIMER_MODE_ABS);
323 spin_unlock_irqrestore(&base->lock, flags); 329 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
325} 330}
326 331
327/** 332/**
328 * alarm_try_to_cancel - Tries to cancel an alarm timer 333 * alarm_cancel - Tries to cancel an alarm timer
329 * @alarm: ptr to alarm to be canceled 334 * @alarm: ptr to alarm to be canceled
330 *
331 * Returns 1 if the timer was canceled, 0 if it was not running,
332 * and -1 if the callback was running
333 */ 335 */
334int alarm_try_to_cancel(struct alarm *alarm) 336void alarm_cancel(struct alarm *alarm)
335{ 337{
336 struct alarm_base *base = &alarm_bases[alarm->type]; 338 struct alarm_base *base = &alarm_bases[alarm->type];
337 unsigned long flags; 339 unsigned long flags;
338 int ret;
339 340
340 spin_lock_irqsave(&base->lock, flags); 341 spin_lock_irqsave(&base->lock, flags);
341 ret = hrtimer_try_to_cancel(&alarm->timer); 342 if (alarm->enabled)
342 if (ret >= 0) 343 alarmtimer_remove(base, alarm);
343 alarmtimer_dequeue(base, alarm); 344 alarm->enabled = 0;
344 spin_unlock_irqrestore(&base->lock, flags); 345 spin_unlock_irqrestore(&base->lock, flags);
345 return ret;
346}
347
348
349/**
350 * alarm_cancel - Spins trying to cancel an alarm timer until it is done
351 * @alarm: ptr to alarm to be canceled
352 *
353 * Returns 1 if the timer was canceled, 0 if it was not active.
354 */
355int alarm_cancel(struct alarm *alarm)
356{
357 for (;;) {
358 int ret = alarm_try_to_cancel(alarm);
359 if (ret >= 0)
360 return ret;
361 cpu_relax();
362 }
363}
364
365
366u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
367{
368 u64 overrun = 1;
369 ktime_t delta;
370
371 delta = ktime_sub(now, alarm->node.expires);
372
373 if (delta.tv64 < 0)
374 return 0;
375
376 if (unlikely(delta.tv64 >= interval.tv64)) {
377 s64 incr = ktime_to_ns(interval);
378
379 overrun = ktime_divns(delta, incr);
380
381 alarm->node.expires = ktime_add_ns(alarm->node.expires,
382 incr*overrun);
383
384 if (alarm->node.expires.tv64 > now.tv64)
385 return overrun;
386 /*
387 * This (and the ktime_add() below) is the
388 * correction for exact:
389 */
390 overrun++;
391 }
392
393 alarm->node.expires = ktime_add(alarm->node.expires, interval);
394 return overrun;
395} 346}
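
After this revert the alarm API carries the period inside struct alarm again: alarm_init() takes a plain void callback, alarm_start() takes an absolute expiry plus a recurrence period, and alarm_cancel() simply dequeues the entry (periodic re-arming happens in alarmtimer_fired()). A short usage sketch against that interface; the callback and the one-second/500 ms timing are illustrative:

    static struct alarm my_alarm;

    static void my_alarm_fn(struct alarm *alarm)
    {
            pr_info("alarm fired\n");       /* re-armed by alarmtimer_fired() while period != 0 */
    }

    /* arm: first expiry one second from now, then every 500 ms */
    alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_fn);
    alarm_start(&my_alarm,
                ktime_add(ktime_get_real(), ktime_set(1, 0)),
                ktime_set(0, 500 * NSEC_PER_MSEC));

    /* disarm */
    alarm_cancel(&my_alarm);
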
396 347
397 348
398
399
400/** 349/**
401 * clock2alarm - helper that converts from clockid to alarmtypes 350 * clock2alarm - helper that converts from clockid to alarmtypes
402 * @clockid: clockid. 351 * @clockid: clockid.
@@ -416,21 +365,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
416 * 365 *
417 * Posix timer callback for expired alarm timers. 366 * Posix timer callback for expired alarm timers.
418 */ 367 */
419static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, 368static void alarm_handle_timer(struct alarm *alarm)
420 ktime_t now)
421{ 369{
422 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
423 it.alarm.alarmtimer); 371 it.alarmtimer);
424 if (posix_timer_event(ptr, 0) != 0) 372 if (posix_timer_event(ptr, 0) != 0)
425 ptr->it_overrun++; 373 ptr->it_overrun++;
426
427 /* Re-add periodic timers */
428 if (ptr->it.alarm.interval.tv64) {
429 ptr->it_overrun += alarm_forward(alarm, now,
430 ptr->it.alarm.interval);
431 return ALARMTIMER_RESTART;
432 }
433 return ALARMTIMER_NORESTART;
434} 374}
435 375
436/** 376/**
@@ -487,7 +427,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
487 427
488 type = clock2alarm(new_timer->it_clock); 428 type = clock2alarm(new_timer->it_clock);
489 base = &alarm_bases[type]; 429 base = &alarm_bases[type];
490 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); 430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
491 return 0; 431 return 0;
492} 432}
493 433
@@ -504,9 +444,9 @@ static void alarm_timer_get(struct k_itimer *timr,
504 memset(cur_setting, 0, sizeof(struct itimerspec)); 444 memset(cur_setting, 0, sizeof(struct itimerspec));
505 445
506 cur_setting->it_interval = 446 cur_setting->it_interval =
507 ktime_to_timespec(timr->it.alarm.interval); 447 ktime_to_timespec(timr->it.alarmtimer.period);
508 cur_setting->it_value = 448 cur_setting->it_value =
509 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); 449 ktime_to_timespec(timr->it.alarmtimer.node.expires);
510 return; 450 return;
511} 451}
512 452
@@ -521,9 +461,7 @@ static int alarm_timer_del(struct k_itimer *timr)
521 if (!rtcdev) 461 if (!rtcdev)
522 return -ENOTSUPP; 462 return -ENOTSUPP;
523 463
524 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) 464 alarm_cancel(&timr->it.alarmtimer);
525 return TIMER_RETRY;
526
527 return 0; 465 return 0;
528} 466}
529 467
@@ -543,17 +481,25 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
543 if (!rtcdev) 481 if (!rtcdev)
544 return -ENOTSUPP; 482 return -ENOTSUPP;
545 483
484 /*
485 * XXX HACK! Currently we can DOS a system if the interval
486 * period on alarmtimers is too small. Cap the interval here
487 * to 100us and solve this properly in a future patch! -jstultz
488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
546 if (old_setting) 493 if (old_setting)
547 alarm_timer_get(timr, old_setting); 494 alarm_timer_get(timr, old_setting);
548 495
549 /* If the timer was already set, cancel it */ 496 /* If the timer was already set, cancel it */
550 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) 497 alarm_cancel(&timr->it.alarmtimer);
551 return TIMER_RETRY;
552 498
553 /* start the timer */ 499 /* start the timer */
554 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 500 alarm_start(&timr->it.alarmtimer,
555 alarm_start(&timr->it.alarm.alarmtimer, 501 timespec_to_ktime(new_setting->it_value),
556 timespec_to_ktime(new_setting->it_value)); 502 timespec_to_ktime(new_setting->it_interval));
557 return 0; 503 return 0;
558} 504}
559 505
@@ -563,15 +509,13 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
563 * 509 *
564 * Wakes up the task that set the alarmtimer 510 * Wakes up the task that set the alarmtimer
565 */ 511 */
566static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, 512static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
567 ktime_t now)
568{ 513{
569 struct task_struct *task = (struct task_struct *)alarm->data; 514 struct task_struct *task = (struct task_struct *)alarm->data;
570 515
571 alarm->data = NULL; 516 alarm->data = NULL;
572 if (task) 517 if (task)
573 wake_up_process(task); 518 wake_up_process(task);
574 return ALARMTIMER_NORESTART;
575} 519}
576 520
577/** 521/**
@@ -586,7 +530,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
586 alarm->data = (void *)current; 530 alarm->data = (void *)current;
587 do { 531 do {
588 set_current_state(TASK_INTERRUPTIBLE); 532 set_current_state(TASK_INTERRUPTIBLE);
589 alarm_start(alarm, absexp); 533 alarm_start(alarm, absexp, ktime_set(0, 0));
590 if (likely(alarm->data)) 534 if (likely(alarm->data))
591 schedule(); 535 schedule();
592 536
@@ -747,7 +691,6 @@ static struct platform_driver alarmtimer_driver = {
747 */ 691 */
748static int __init alarmtimer_init(void) 692static int __init alarmtimer_init(void)
749{ 693{
750 struct platform_device *pdev;
751 int error = 0; 694 int error = 0;
752 int i; 695 int i;
753 struct k_clock alarm_clock = { 696 struct k_clock alarm_clock = {
@@ -760,8 +703,6 @@ static int __init alarmtimer_init(void)
760 .nsleep = alarm_timer_nsleep, 703 .nsleep = alarm_timer_nsleep,
761 }; 704 };
762 705
763 alarmtimer_rtc_timer_init();
764
765 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 706 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
766 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 707 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
767 708
@@ -773,28 +714,15 @@ static int __init alarmtimer_init(void)
773 for (i = 0; i < ALARM_NUMTYPE; i++) { 714 for (i = 0; i < ALARM_NUMTYPE; i++) {
774 timerqueue_init_head(&alarm_bases[i].timerqueue); 715 timerqueue_init_head(&alarm_bases[i].timerqueue);
775 spin_lock_init(&alarm_bases[i].lock); 716 spin_lock_init(&alarm_bases[i].lock);
717 hrtimer_init(&alarm_bases[i].timer,
718 alarm_bases[i].base_clockid,
719 HRTIMER_MODE_ABS);
720 alarm_bases[i].timer.function = alarmtimer_fired;
776 } 721 }
777
778 error = alarmtimer_rtc_interface_setup();
779 if (error)
780 return error;
781
782 error = platform_driver_register(&alarmtimer_driver); 722 error = platform_driver_register(&alarmtimer_driver);
783 if (error) 723 platform_device_register_simple("alarmtimer", -1, NULL, 0);
784 goto out_if;
785
786 pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
787 if (IS_ERR(pdev)) {
788 error = PTR_ERR(pdev);
789 goto out_drv;
790 }
791 ws = wakeup_source_register("alarmtimer");
792 return 0;
793 724
794out_drv:
795 platform_driver_unregister(&alarmtimer_driver);
796out_if:
797 alarmtimer_rtc_interface_remove();
798 return error; 725 return error;
799} 726}
800device_initcall(alarmtimer_init); 727device_initcall(alarmtimer_init);
728
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977..e4c699dfa4e 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,6 +17,7 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
20 21
21#include "tick-internal.h" 22#include "tick-internal.h"
22 23
@@ -93,143 +94,42 @@ void clockevents_shutdown(struct clock_event_device *dev)
93 dev->next_event.tv64 = KTIME_MAX; 94 dev->next_event.tv64 = KTIME_MAX;
94} 95}
95 96
96#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
97
98/* Limit min_delta to a jiffie */
99#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
100
101/**
102 * clockevents_increase_min_delta - raise minimum delta of a clock event device
103 * @dev: device to increase the minimum delta
104 *
105 * Returns 0 on success, -ETIME when the minimum delta reached the limit.
106 */
107static int clockevents_increase_min_delta(struct clock_event_device *dev)
108{
109 /* Nothing to do if we already reached the limit */
110 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
111 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
112 dev->next_event.tv64 = KTIME_MAX;
113 return -ETIME;
114 }
115
116 if (dev->min_delta_ns < 5000)
117 dev->min_delta_ns = 5000;
118 else
119 dev->min_delta_ns += dev->min_delta_ns >> 1;
120
121 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
122 dev->min_delta_ns = MIN_DELTA_LIMIT;
123
124 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
125 dev->name ? dev->name : "?",
126 (unsigned long long) dev->min_delta_ns);
127 return 0;
128}
129
130/**
131 * clockevents_program_min_delta - Set clock event device to the minimum delay.
132 * @dev: device to program
133 *
134 * Returns 0 on success, -ETIME when the retry loop failed.
135 */
136static int clockevents_program_min_delta(struct clock_event_device *dev)
137{
138 unsigned long long clc;
139 int64_t delta;
140 int i;
141
142 for (i = 0;;) {
143 delta = dev->min_delta_ns;
144 dev->next_event = ktime_add_ns(ktime_get(), delta);
145
146 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
147 return 0;
148
149 dev->retries++;
150 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
151 if (dev->set_next_event((unsigned long) clc, dev) == 0)
152 return 0;
153
154 if (++i > 2) {
155 /*
156 * We tried 3 times to program the device with the
157 * given min_delta_ns. Try to increase the minimum
158 * delta, if that fails as well get out of here.
159 */
160 if (clockevents_increase_min_delta(dev))
161 return -ETIME;
162 i = 0;
163 }
164 }
165}
166
167#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
168
169/**
170 * clockevents_program_min_delta - Set clock event device to the minimum delay.
171 * @dev: device to program
172 *
173 * Returns 0 on success, -ETIME when the retry loop failed.
174 */
175static int clockevents_program_min_delta(struct clock_event_device *dev)
176{
177 unsigned long long clc;
178 int64_t delta;
179
180 delta = dev->min_delta_ns;
181 dev->next_event = ktime_add_ns(ktime_get(), delta);
182
183 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
184 return 0;
185
186 dev->retries++;
187 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
188 return dev->set_next_event((unsigned long) clc, dev);
189}
190
191#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
192
193/** 97/**
194 * clockevents_program_event - Reprogram the clock event device. 98 * clockevents_program_event - Reprogram the clock event device.
195 * @dev: device to program
196 * @expires: absolute expiry time (monotonic clock) 99 * @expires: absolute expiry time (monotonic clock)
197 * @force: program minimum delay if expires can not be set
198 * 100 *
199 * Returns 0 on success, -ETIME when the event is in the past. 101 * Returns 0 on success, -ETIME when the event is in the past.
200 */ 102 */
201int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, 103int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
202 bool force) 104 ktime_t now)
203{ 105{
204 unsigned long long clc; 106 unsigned long long clc;
205 int64_t delta; 107 int64_t delta;
206 int rc;
207 108
208 if (unlikely(expires.tv64 < 0)) { 109 if (unlikely(expires.tv64 < 0)) {
209 WARN_ON_ONCE(1); 110 WARN_ON_ONCE(1);
210 return -ETIME; 111 return -ETIME;
211 } 112 }
212 113
114 delta = ktime_to_ns(ktime_sub(expires, now));
115
116 if (delta <= 0)
117 return -ETIME;
118
213 dev->next_event = expires; 119 dev->next_event = expires;
214 120
215 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 121 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
216 return 0; 122 return 0;
217 123
218 /* Shortcut for clockevent devices that can deal with ktime. */ 124 if (delta > dev->max_delta_ns)
219 if (dev->features & CLOCK_EVT_FEAT_KTIME) 125 delta = dev->max_delta_ns;
220 return dev->set_next_ktime(expires, dev); 126 if (delta < dev->min_delta_ns)
221 127 delta = dev->min_delta_ns;
222 delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
223 if (delta <= 0)
224 return force ? clockevents_program_min_delta(dev) : -ETIME;
225
226 delta = min(delta, (int64_t) dev->max_delta_ns);
227 delta = max(delta, (int64_t) dev->min_delta_ns);
228 128
229 clc = ((unsigned long long) delta * dev->mult) >> dev->shift; 129 clc = delta * dev->mult;
230 rc = dev->set_next_event((unsigned long) clc, dev); 130 clc >>= dev->shift;
231 131
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 132 return dev->set_next_event((unsigned long) clc, dev);
233} 133}
234 134
235/** 135/**
@@ -297,7 +197,8 @@ void clockevents_register_device(struct clock_event_device *dev)
297} 197}
298EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
299 199
300void clockevents_config(struct clock_event_device *dev, u32 freq) 200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
301{ 202{
302 u64 sec; 203 u64 sec;
303 204
@@ -357,7 +258,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
357 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
358 return 0; 259 return 0;
359 260
360 return clockevents_program_event(dev, dev->next_event, false); 261 return clockevents_program_event(dev, dev->next_event, ktime_get());
361} 262}
362 263
363/* 264/*
@@ -397,30 +298,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
397 local_irq_restore(flags); 298 local_irq_restore(flags);
398} 299}
399 300
400/**
401 * clockevents_suspend - suspend clock devices
402 */
403void clockevents_suspend(void)
404{
405 struct clock_event_device *dev;
406
407 list_for_each_entry_reverse(dev, &clockevent_devices, list)
408 if (dev->suspend)
409 dev->suspend(dev);
410}
411
412/**
413 * clockevents_resume - resume clock devices
414 */
415void clockevents_resume(void)
416{
417 struct clock_event_device *dev;
418
419 list_for_each_entry(dev, &clockevent_devices, list)
420 if (dev->resume)
421 dev->resume(dev);
422}
423
424#ifdef CONFIG_GENERIC_CLOCKEVENTS 301#ifdef CONFIG_GENERIC_CLOCKEVENTS
425/** 302/**
426 * clockevents_notify - notification about relevant events 303 * clockevents_notify - notification about relevant events
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c9583382141..8f77da18fef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -186,7 +186,6 @@ static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static int watchdog_running; 188static int watchdog_running;
189static atomic_t watchdog_reset_pending;
190 189
191static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
192static void __clocksource_change_rating(struct clocksource *cs, int rating); 191static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -248,14 +247,12 @@ static void clocksource_watchdog(unsigned long data)
248 struct clocksource *cs; 247 struct clocksource *cs;
249 cycle_t csnow, wdnow; 248 cycle_t csnow, wdnow;
250 int64_t wd_nsec, cs_nsec; 249 int64_t wd_nsec, cs_nsec;
251 int next_cpu, reset_pending; 250 int next_cpu;
252 251
253 spin_lock(&watchdog_lock); 252 spin_lock(&watchdog_lock);
254 if (!watchdog_running) 253 if (!watchdog_running)
255 goto out; 254 goto out;
256 255
257 reset_pending = atomic_read(&watchdog_reset_pending);
258
259 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
260 257
261 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -271,8 +268,7 @@ static void clocksource_watchdog(unsigned long data)
271 local_irq_enable(); 268 local_irq_enable();
272 269
273 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
275 atomic_read(&watchdog_reset_pending)) {
276 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
277 cs->wd_last = wdnow; 273 cs->wd_last = wdnow;
278 cs->cs_last = csnow; 274 cs->cs_last = csnow;
@@ -287,11 +283,8 @@ static void clocksource_watchdog(unsigned long data)
287 cs->cs_last = csnow; 283 cs->cs_last = csnow;
288 cs->wd_last = wdnow; 284 cs->wd_last = wdnow;
289 285
290 if (atomic_read(&watchdog_reset_pending))
291 continue;
292
293 /* Check the deviation from the watchdog clocksource. */ 286 /* Check the deviation from the watchdog clocksource. */
294 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
295 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
296 continue; 289 continue;
297 } 290 }
@@ -310,13 +303,6 @@ static void clocksource_watchdog(unsigned long data)
310 } 303 }
311 304
312 /* 305 /*
313 * We only clear the watchdog_reset_pending, when we did a
314 * full cycle through all clocksources.
315 */
316 if (reset_pending)
317 atomic_dec(&watchdog_reset_pending);
318
319 /*
320 * Cycle through CPUs to check if the CPUs stay synchronized 306 * Cycle through CPUs to check if the CPUs stay synchronized
321 * to each other. 307 * to each other.
322 */ 308 */
@@ -358,7 +344,23 @@ static inline void clocksource_reset_watchdog(void)
358 344
359static void clocksource_resume_watchdog(void) 345static void clocksource_resume_watchdog(void)
360{ 346{
361 atomic_inc(&watchdog_reset_pending); 347 unsigned long flags;
348
349 /*
350 * We use trylock here to avoid a potential dead lock when
351 * kgdb calls this code after the kernel has been stopped with
352 * watchdog_lock held. When watchdog_lock is held we just
353 * return and accept, that the watchdog might trigger and mark
354 * the monitored clock source (usually TSC) unstable.
355 *
356 * This does not affect the other caller clocksource_resume()
357 * because at this point the kernel is UP, interrupts are
358 * disabled and nothing can hold watchdog_lock.
359 */
360 if (!spin_trylock_irqsave(&watchdog_lock, flags))
361 return;
362 clocksource_reset_watchdog();
363 spin_unlock_irqrestore(&watchdog_lock, flags);
362} 364}
363 365
364static void clocksource_enqueue_watchdog(struct clocksource *cs) 366static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -500,7 +502,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 502{
501 u64 ret; 503 u64 ret;
502 /* 504 /*
503 * We won't try to correct for more than 11% adjustments (110,000 ppm), 505 * We won't try to correct for more then 11% adjustments (110,000 ppm),
504 */ 506 */
505 ret = (u64)cs->mult * 11; 507 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 508 do_div(ret,100);
@@ -647,7 +649,7 @@ static void clocksource_enqueue(struct clocksource *cs)
647 649
648/** 650/**
 649 * __clocksource_updatefreq_scale - Used to update clocksource with new freq 651 * __clocksource_updatefreq_scale - Used to update clocksource with new freq
650 * @cs: clocksource to be registered 652 * @t: clocksource to be registered
651 * @scale: Scale factor multiplied against freq to get clocksource hz 653 * @scale: Scale factor multiplied against freq to get clocksource hz
652 * @freq: clocksource frequency (cycles per second) divided by scale 654 * @freq: clocksource frequency (cycles per second) divided by scale
653 * 655 *
@@ -699,7 +701,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
699 701
700/** 702/**
701 * __clocksource_register_scale - Used to install new clocksources 703 * __clocksource_register_scale - Used to install new clocksources
702 * @cs: clocksource to be registered 704 * @t: clocksource to be registered
703 * @scale: Scale factor multiplied against freq to get clocksource hz 705 * @scale: Scale factor multiplied against freq to get clocksource hz
704 * @freq: clocksource frequency (cycles per second) divided by scale 706 * @freq: clocksource frequency (cycles per second) divided by scale
705 * 707 *
@@ -727,7 +729,7 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
727 729
728/** 730/**
729 * clocksource_register - Used to install new clocksources 731 * clocksource_register - Used to install new clocksources
730 * @cs: clocksource to be registered 732 * @t: clocksource to be registered
731 * 733 *
732 * Returns -EBUSY if registration fails, zero otherwise. 734 * Returns -EBUSY if registration fails, zero otherwise.
733 */ 735 */
@@ -761,8 +763,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
761 763
762/** 764/**
763 * clocksource_change_rating - Change the rating of a registered clocksource 765 * clocksource_change_rating - Change the rating of a registered clocksource
764 * @cs: clocksource to be changed
765 * @rating: new rating
766 */ 766 */
767void clocksource_change_rating(struct clocksource *cs, int rating) 767void clocksource_change_rating(struct clocksource *cs, int rating)
768{ 768{
@@ -774,7 +774,6 @@ EXPORT_SYMBOL(clocksource_change_rating);
774 774
775/** 775/**
776 * clocksource_unregister - remove a registered clocksource 776 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered
778 */ 777 */
779void clocksource_unregister(struct clocksource *cs) 778void clocksource_unregister(struct clocksource *cs)
780{ 779{
@@ -790,14 +789,13 @@ EXPORT_SYMBOL(clocksource_unregister);
790/** 789/**
791 * sysfs_show_current_clocksources - sysfs interface for current clocksource 790 * sysfs_show_current_clocksources - sysfs interface for current clocksource
792 * @dev: unused 791 * @dev: unused
793 * @attr: unused
794 * @buf: char buffer to be filled with clocksource list 792 * @buf: char buffer to be filled with clocksource list
795 * 793 *
796 * Provides sysfs interface for listing current clocksource. 794 * Provides sysfs interface for listing current clocksource.
797 */ 795 */
798static ssize_t 796static ssize_t
799sysfs_show_current_clocksources(struct device *dev, 797sysfs_show_current_clocksources(struct sys_device *dev,
800 struct device_attribute *attr, char *buf) 798 struct sysdev_attribute *attr, char *buf)
801{ 799{
802 ssize_t count = 0; 800 ssize_t count = 0;
803 801
@@ -811,15 +809,14 @@ sysfs_show_current_clocksources(struct device *dev,
811/** 809/**
812 * sysfs_override_clocksource - interface for manually overriding clocksource 810 * sysfs_override_clocksource - interface for manually overriding clocksource
813 * @dev: unused 811 * @dev: unused
814 * @attr: unused
815 * @buf: name of override clocksource 812 * @buf: name of override clocksource
816 * @count: length of buffer 813 * @count: length of buffer
817 * 814 *
818 * Takes input from sysfs interface for manually overriding the default 815 * Takes input from sysfs interface for manually overriding the default
819 * clocksource selection. 816 * clocksource selection.
820 */ 817 */
821static ssize_t sysfs_override_clocksource(struct device *dev, 818static ssize_t sysfs_override_clocksource(struct sys_device *dev,
822 struct device_attribute *attr, 819 struct sysdev_attribute *attr,
823 const char *buf, size_t count) 820 const char *buf, size_t count)
824{ 821{
825 size_t ret = count; 822 size_t ret = count;
@@ -847,14 +844,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
847/** 844/**
848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 845 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
849 * @dev: unused 846 * @dev: unused
850 * @attr: unused
851 * @buf: char buffer to be filled with clocksource list 847 * @buf: char buffer to be filled with clocksource list
852 * 848 *
853 * Provides sysfs interface for listing registered clocksources 849 * Provides sysfs interface for listing registered clocksources
854 */ 850 */
855static ssize_t 851static ssize_t
856sysfs_show_available_clocksources(struct device *dev, 852sysfs_show_available_clocksources(struct sys_device *dev,
857 struct device_attribute *attr, 853 struct sysdev_attribute *attr,
858 char *buf) 854 char *buf)
859{ 855{
860 struct clocksource *src; 856 struct clocksource *src;
@@ -883,36 +879,35 @@ sysfs_show_available_clocksources(struct device *dev,
883/* 879/*
884 * Sysfs setup bits: 880 * Sysfs setup bits:
885 */ 881 */
886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 882static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 883 sysfs_override_clocksource);
888 884
889static DEVICE_ATTR(available_clocksource, 0444, 885static SYSDEV_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 886 sysfs_show_available_clocksources, NULL);
891 887
892static struct bus_type clocksource_subsys = { 888static struct sysdev_class clocksource_sysclass = {
893 .name = "clocksource", 889 .name = "clocksource",
894 .dev_name = "clocksource",
895}; 890};
896 891
897static struct device device_clocksource = { 892static struct sys_device device_clocksource = {
898 .id = 0, 893 .id = 0,
899 .bus = &clocksource_subsys, 894 .cls = &clocksource_sysclass,
900}; 895};
901 896
902static int __init init_clocksource_sysfs(void) 897static int __init init_clocksource_sysfs(void)
903{ 898{
904 int error = subsys_system_register(&clocksource_subsys, NULL); 899 int error = sysdev_class_register(&clocksource_sysclass);
905 900
906 if (!error) 901 if (!error)
907 error = device_register(&device_clocksource); 902 error = sysdev_register(&device_clocksource);
908 if (!error) 903 if (!error)
909 error = device_create_file( 904 error = sysdev_create_file(
910 &device_clocksource, 905 &device_clocksource,
911 &dev_attr_current_clocksource); 906 &attr_current_clocksource);
912 if (!error) 907 if (!error)
913 error = device_create_file( 908 error = sysdev_create_file(
914 &device_clocksource, 909 &device_clocksource,
915 &dev_attr_available_clocksource); 910 &attr_available_clocksource);
916 return error; 911 return error;
917} 912}
918 913
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 7a925ba456f..a470154e040 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) 40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61static struct clocksource clocksource_jiffies = { 61struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,8 +67,6 @@ static struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
72#if (BITS_PER_LONG < 64) 70#if (BITS_PER_LONG < 64)
73u64 get_jiffies_64(void) 71u64 get_jiffies_64(void)
74{ 72{
@@ -76,9 +74,9 @@ u64 get_jiffies_64(void)
76 u64 ret; 74 u64 ret;
77 75
78 do { 76 do {
79 seq = read_seqbegin(&jiffies_lock); 77 seq = read_seqbegin(&xtime_lock);
80 ret = jiffies_64; 78 ret = jiffies_64;
81 } while (read_seqretry(&jiffies_lock, seq)); 79 } while (read_seqretry(&xtime_lock, seq));
82 return ret; 80 return ret;
83} 81}
84EXPORT_SYMBOL(get_jiffies_64); 82EXPORT_SYMBOL(get_jiffies_64);
@@ -97,33 +95,3 @@ struct clocksource * __init __weak clocksource_default_clock(void)
97{ 95{
98 return &clocksource_jiffies; 96 return &clocksource_jiffies;
99} 97}
100
101struct clocksource refined_jiffies;
102
103int register_refined_jiffies(long cycles_per_second)
104{
105 u64 nsec_per_tick, shift_hz;
106 long cycles_per_tick;
107
108
109
110 refined_jiffies = clocksource_jiffies;
111 refined_jiffies.name = "refined-jiffies";
112 refined_jiffies.rating++;
113
114 /* Calc cycles per tick */
115 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
116 /* shift_hz stores hz<<8 for extra accuracy */
117 shift_hz = (u64)cycles_per_second << 8;
118 shift_hz += cycles_per_tick/2;
119 do_div(shift_hz, cycles_per_tick);
120 /* Calculate nsec_per_tick using shift_hz */
121 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
122 nsec_per_tick += (u32)shift_hz/2;
123 do_div(nsec_per_tick, (u32)shift_hz);
124
125 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
126
127 clocksource_register(&refined_jiffies);
128 return 0;
129}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669..f6117a4c7cb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,18 +22,17 @@
22 * NTP timekeeping variables: 22 * NTP timekeeping variables:
23 */ 23 */
24 24
25DEFINE_SPINLOCK(ntp_lock);
26
27
28/* USER_HZ period (usecs): */ 25/* USER_HZ period (usecs): */
29unsigned long tick_usec = TICK_USEC; 26unsigned long tick_usec = TICK_USEC;
30 27
31/* SHIFTED_HZ period (nsecs): */ 28/* ACTHZ period (nsecs): */
32unsigned long tick_nsec; 29unsigned long tick_nsec;
33 30
34static u64 tick_length; 31u64 tick_length;
35static u64 tick_length_base; 32static u64 tick_length_base;
36 33
34static struct hrtimer leap_timer;
35
37#define MAX_TICKADJ 500LL /* usecs */ 36#define MAX_TICKADJ 500LL /* usecs */
38#define MAX_TICKADJ_SCALED \ 37#define MAX_TICKADJ_SCALED \
39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 38 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -50,7 +49,7 @@ static u64 tick_length_base;
50static int time_state = TIME_OK; 49static int time_state = TIME_OK;
51 50
52/* clock status bits: */ 51/* clock status bits: */
53static int time_status = STA_UNSYNC; 52int time_status = STA_UNSYNC;
54 53
55/* TAI offset (secs): */ 54/* TAI offset (secs): */
56static long time_tai; 55static long time_tai;
@@ -134,7 +133,7 @@ static inline void pps_reset_freq_interval(void)
134/** 133/**
135 * pps_clear - Clears the PPS state variables 134 * pps_clear - Clears the PPS state variables
136 * 135 *
137 * Must be called while holding a write on the ntp_lock 136 * Must be called while holding a write on the xtime_lock
138 */ 137 */
139static inline void pps_clear(void) 138static inline void pps_clear(void)
140{ 139{
@@ -150,7 +149,7 @@ static inline void pps_clear(void)
150 * the last PPS signal. When it reaches 0, indicate that PPS signal is 149 * the last PPS signal. When it reaches 0, indicate that PPS signal is
151 * missing. 150 * missing.
152 * 151 *
153 * Must be called while holding a write on the ntp_lock 152 * Must be called while holding a write on the xtime_lock
154 */ 153 */
155static inline void pps_dec_valid(void) 154static inline void pps_dec_valid(void)
156{ 155{
@@ -234,17 +233,6 @@ static inline void pps_fill_timex(struct timex *txc)
234 233
235#endif /* CONFIG_NTP_PPS */ 234#endif /* CONFIG_NTP_PPS */
236 235
237
238/**
239 * ntp_synced - Returns 1 if the NTP status is not UNSYNC
240 *
241 */
242static inline int ntp_synced(void)
243{
244 return !(time_status & STA_UNSYNC);
245}
246
247
248/* 236/*
249 * NTP methods: 237 * NTP methods:
250 */ 238 */
@@ -287,7 +275,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
287 275
288 time_status |= STA_MODE; 276 time_status |= STA_MODE;
289 277
290 return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); 278 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
291} 279}
292 280
293static void ntp_update_offset(long offset) 281static void ntp_update_offset(long offset)
@@ -342,13 +330,11 @@ static void ntp_update_offset(long offset)
342 330
343/** 331/**
344 * ntp_clear - Clears the NTP state variables 332 * ntp_clear - Clears the NTP state variables
333 *
334 * Must be called while holding a write on the xtime_lock
345 */ 335 */
346void ntp_clear(void) 336void ntp_clear(void)
347{ 337{
348 unsigned long flags;
349
350 spin_lock_irqsave(&ntp_lock, flags);
351
352 time_adjust = 0; /* stop active adjtime() */ 338 time_adjust = 0; /* stop active adjtime() */
353 time_status |= STA_UNSYNC; 339 time_status |= STA_UNSYNC;
354 time_maxerror = NTP_PHASE_LIMIT; 340 time_maxerror = NTP_PHASE_LIMIT;
@@ -361,85 +347,63 @@ void ntp_clear(void)
361 347
362 /* Clear PPS state variables */ 348 /* Clear PPS state variables */
363 pps_clear(); 349 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags);
365
366}
367
368
369u64 ntp_tick_length(void)
370{
371 unsigned long flags;
372 s64 ret;
373
374 spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret;
378} 350}
379 351
380
381/* 352/*
382 * this routine handles the overflow of the microsecond field 353 * Leap second processing. If in leap-insert state at the end of the
383 * 354 * day, the system clock is set back one second; if in leap-delete
384 * The tricky bits of code to handle the accurate clock support 355 * state, the system clock is set ahead one second.
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
390 */ 356 */
391int second_overflow(unsigned long secs) 357static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
392{ 358{
393 s64 delta; 359 enum hrtimer_restart res = HRTIMER_NORESTART;
394 int leap = 0;
395 unsigned long flags;
396 360
397 spin_lock_irqsave(&ntp_lock, flags); 361 write_seqlock(&xtime_lock);
398 362
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
404 switch (time_state) { 363 switch (time_state) {
405 case TIME_OK: 364 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
410 break; 365 break;
411 case TIME_INS: 366 case TIME_INS:
412 if (!(time_status & STA_INS)) 367 timekeeping_leap_insert(-1);
413 time_state = TIME_OK; 368 time_state = TIME_OOP;
414 else if (secs % 86400 == 0) { 369 printk(KERN_NOTICE
415 leap = -1; 370 "Clock: inserting leap second 23:59:60 UTC\n");
416 time_state = TIME_OOP; 371 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
417 time_tai++; 372 res = HRTIMER_RESTART;
418 printk(KERN_NOTICE
419 "Clock: inserting leap second 23:59:60 UTC\n");
420 }
421 break; 373 break;
422 case TIME_DEL: 374 case TIME_DEL:
423 if (!(time_status & STA_DEL)) 375 timekeeping_leap_insert(1);
424 time_state = TIME_OK; 376 time_tai--;
425 else if ((secs + 1) % 86400 == 0) { 377 time_state = TIME_WAIT;
426 leap = 1; 378 printk(KERN_NOTICE
427 time_tai--; 379 "Clock: deleting leap second 23:59:59 UTC\n");
428 time_state = TIME_WAIT;
429 printk(KERN_NOTICE
430 "Clock: deleting leap second 23:59:59 UTC\n");
431 }
432 break; 380 break;
433 case TIME_OOP: 381 case TIME_OOP:
382 time_tai++;
434 time_state = TIME_WAIT; 383 time_state = TIME_WAIT;
435 break; 384 /* fall through */
436
437 case TIME_WAIT: 385 case TIME_WAIT:
438 if (!(time_status & (STA_INS | STA_DEL))) 386 if (!(time_status & (STA_INS | STA_DEL)))
439 time_state = TIME_OK; 387 time_state = TIME_OK;
440 break; 388 break;
441 } 389 }
442 390
391 write_sequnlock(&xtime_lock);
392
393 return res;
394}
395
396/*
397 * this routine handles the overflow of the microsecond field
398 *
399 * The tricky bits of code to handle the accurate clock support
400 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
401 * They were originally developed for SUN and DEC kernels.
402 * All the kudos should go to Dave for this stuff.
403 */
404void second_overflow(void)
405{
406 s64 delta;
443 407
444 /* Bump the maxerror field */ 408 /* Bump the maxerror field */
445 time_maxerror += MAXFREQ / NSEC_PER_USEC; 409 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -459,32 +423,30 @@ int second_overflow(unsigned long secs)
459 pps_dec_valid(); 423 pps_dec_valid();
460 424
461 if (!time_adjust) 425 if (!time_adjust)
462 goto out; 426 return;
463 427
464 if (time_adjust > MAX_TICKADJ) { 428 if (time_adjust > MAX_TICKADJ) {
465 time_adjust -= MAX_TICKADJ; 429 time_adjust -= MAX_TICKADJ;
466 tick_length += MAX_TICKADJ_SCALED; 430 tick_length += MAX_TICKADJ_SCALED;
467 goto out; 431 return;
468 } 432 }
469 433
470 if (time_adjust < -MAX_TICKADJ) { 434 if (time_adjust < -MAX_TICKADJ) {
471 time_adjust += MAX_TICKADJ; 435 time_adjust += MAX_TICKADJ;
472 tick_length -= MAX_TICKADJ_SCALED; 436 tick_length -= MAX_TICKADJ_SCALED;
473 goto out; 437 return;
474 } 438 }
475 439
476 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 440 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
477 << NTP_SCALE_SHIFT; 441 << NTP_SCALE_SHIFT;
478 time_adjust = 0; 442 time_adjust = 0;
479
480out:
481 spin_unlock_irqrestore(&ntp_lock, flags);
482
483 return leap;
484} 443}
485 444
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 445#ifdef CONFIG_GENERIC_CMOS_UPDATE
487 446
447/* Disable the cmos update - used by virtualization and embedded */
448int no_sync_cmos_clock __read_mostly;
449
488static void sync_cmos_clock(struct work_struct *work); 450static void sync_cmos_clock(struct work_struct *work);
489 451
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 452static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -531,13 +493,35 @@ static void sync_cmos_clock(struct work_struct *work)
531 493
532static void notify_cmos_timer(void) 494static void notify_cmos_timer(void)
533{ 495{
534 schedule_delayed_work(&sync_cmos_work, 0); 496 if (!no_sync_cmos_clock)
497 schedule_delayed_work(&sync_cmos_work, 0);
535} 498}
536 499
537#else 500#else
538static inline void notify_cmos_timer(void) { } 501static inline void notify_cmos_timer(void) { }
539#endif 502#endif
540 503
504/*
505 * Start the leap seconds timer:
506 */
507static inline void ntp_start_leap_timer(struct timespec *ts)
508{
509 long now = ts->tv_sec;
510
511 if (time_status & STA_INS) {
512 time_state = TIME_INS;
513 now += 86400 - now % 86400;
514 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
515
516 return;
517 }
518
519 if (time_status & STA_DEL) {
520 time_state = TIME_DEL;
521 now += 86400 - (now + 1) % 86400;
522 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
523 }
524}
541 525
542/* 526/*
543 * Propagate a new txc->status value into the NTP state: 527 * Propagate a new txc->status value into the NTP state:
@@ -561,10 +545,26 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
561 /* only set allowed bits */ 545 /* only set allowed bits */
562 time_status &= STA_RONLY; 546 time_status &= STA_RONLY;
563 time_status |= txc->status & ~STA_RONLY; 547 time_status |= txc->status & ~STA_RONLY;
564}
565 548
549 switch (time_state) {
550 case TIME_OK:
551 ntp_start_leap_timer(ts);
552 break;
553 case TIME_INS:
554 case TIME_DEL:
555 time_state = TIME_OK;
556 ntp_start_leap_timer(ts);
557 case TIME_WAIT:
558 if (!(time_status & (STA_INS | STA_DEL)))
559 time_state = TIME_OK;
560 break;
561 case TIME_OOP:
562 hrtimer_restart(&leap_timer);
563 break;
564 }
565}
566/* 566/*
567 * Called with ntp_lock held, so we can access and modify 567 * Called with the xtime lock held, so we can access and modify
568 * all the global NTP state: 568 * all the global NTP state:
569 */ 569 */
570static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) 570static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
@@ -643,6 +643,9 @@ int do_adjtimex(struct timex *txc)
643 (txc->tick < 900000/USER_HZ || 643 (txc->tick < 900000/USER_HZ ||
644 txc->tick > 1100000/USER_HZ)) 644 txc->tick > 1100000/USER_HZ))
645 return -EINVAL; 645 return -EINVAL;
646
647 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
648 hrtimer_cancel(&leap_timer);
646 } 649 }
647 650
648 if (txc->modes & ADJ_SETOFFSET) { 651 if (txc->modes & ADJ_SETOFFSET) {
@@ -660,7 +663,7 @@ int do_adjtimex(struct timex *txc)
660 663
661 getnstimeofday(&ts); 664 getnstimeofday(&ts);
662 665
663 spin_lock_irq(&ntp_lock); 666 write_seqlock_irq(&xtime_lock);
664 667
665 if (txc->modes & ADJ_ADJTIME) { 668 if (txc->modes & ADJ_ADJTIME) {
666 long save_adjust = time_adjust; 669 long save_adjust = time_adjust;
@@ -702,7 +705,7 @@ int do_adjtimex(struct timex *txc)
702 /* fill PPS status fields */ 705 /* fill PPS status fields */
703 pps_fill_timex(txc); 706 pps_fill_timex(txc);
704 707
705 spin_unlock_irq(&ntp_lock); 708 write_sequnlock_irq(&xtime_lock);
706 709
707 txc->time.tv_sec = ts.tv_sec; 710 txc->time.tv_sec = ts.tv_sec;
708 txc->time.tv_usec = ts.tv_nsec; 711 txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +903,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900 903
901 pts_norm = pps_normalize_ts(*phase_ts); 904 pts_norm = pps_normalize_ts(*phase_ts);
902 905
903 spin_lock_irqsave(&ntp_lock, flags); 906 write_seqlock_irqsave(&xtime_lock, flags);
904 907
905 /* clear the error bits, they will be set again if needed */ 908 /* clear the error bits, they will be set again if needed */
906 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +916,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
913 * just start the frequency interval */ 916 * just start the frequency interval */
914 if (unlikely(pps_fbase.tv_sec == 0)) { 917 if (unlikely(pps_fbase.tv_sec == 0)) {
915 pps_fbase = *raw_ts; 918 pps_fbase = *raw_ts;
916 spin_unlock_irqrestore(&ntp_lock, flags); 919 write_sequnlock_irqrestore(&xtime_lock, flags);
917 return; 920 return;
918 } 921 }
919 922
@@ -928,7 +931,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
928 time_status |= STA_PPSJITTER; 931 time_status |= STA_PPSJITTER;
929 /* restart the frequency calibration interval */ 932 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts; 933 pps_fbase = *raw_ts;
931 spin_unlock_irqrestore(&ntp_lock, flags); 934 write_sequnlock_irqrestore(&xtime_lock, flags);
932 pr_err("hardpps: PPSJITTER: bad pulse\n"); 935 pr_err("hardpps: PPSJITTER: bad pulse\n");
933 return; 936 return;
934 } 937 }
@@ -945,7 +948,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
945 948
946 hardpps_update_phase(pts_norm.nsec); 949 hardpps_update_phase(pts_norm.nsec);
947 950
948 spin_unlock_irqrestore(&ntp_lock, flags); 951 write_sequnlock_irqrestore(&xtime_lock, flags);
949} 952}
950EXPORT_SYMBOL(hardpps); 953EXPORT_SYMBOL(hardpps);
951 954
@@ -964,4 +967,6 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
964void __init ntp_init(void) 967void __init ntp_init(void)
965{ 968{
966 ntp_clear(); 969 ntp_clear();
970 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
971 leap_timer.function = ntp_leap_second;
967} 972}
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ce033c7aa2e..c340ca658f3 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,7 +18,6 @@
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/export.h>
22#include <linux/file.h> 21#include <linux/file.h>
23#include <linux/posix-clock.h> 22#include <linux/posix-clock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e..7a90d021b79 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
194 for (next = dev->next_event; ;) { 194 for (next = dev->next_event; ;) {
195 next = ktime_add(next, tick_period); 195 next = ktime_add(next, tick_period);
196 196
197 if (!clockevents_program_event(dev, next, false)) 197 if (!clockevents_program_event(dev, next, ktime_get()))
198 return; 198 return;
199 tick_do_periodic_broadcast(); 199 tick_do_periodic_broadcast();
200 } 200 }
@@ -346,8 +346,7 @@ int tick_resume_broadcast(void)
346 tick_get_broadcast_mask()); 346 tick_get_broadcast_mask());
347 break; 347 break;
348 case TICKDEV_MODE_ONESHOT: 348 case TICKDEV_MODE_ONESHOT:
349 if (!cpumask_empty(tick_get_broadcast_mask())) 349 broadcast = tick_resume_broadcast_oneshot(bc);
350 broadcast = tick_resume_broadcast_oneshot(bc);
351 break; 350 break;
352 } 351 }
353 } 352 }
@@ -374,10 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
374{ 373{
375 struct clock_event_device *bc = tick_broadcast_device.evtdev; 374 struct clock_event_device *bc = tick_broadcast_device.evtdev;
376 375
377 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 376 return tick_dev_program_event(bc, expires, force);
378 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
379
380 return clockevents_program_event(bc, expires, force);
381} 377}
382 378
383int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 379int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -535,6 +531,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
535 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
536 532
537 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
538 535
539 /* Take the do_timer update */ 536 /* Take the do_timer update */
540 tick_do_timer_cpu = cpu; 537 tick_do_timer_cpu = cpu;
@@ -552,7 +549,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
552 to_cpumask(tmpmask)); 549 to_cpumask(tmpmask));
553 550
554 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 551 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
555 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
556 tick_broadcast_init_next_event(to_cpumask(tmpmask), 552 tick_broadcast_init_next_event(to_cpumask(tmpmask),
557 tick_next_period); 553 tick_next_period);
558 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
@@ -584,7 +580,6 @@ void tick_broadcast_switch_to_oneshot(void)
584 bc = tick_broadcast_device.evtdev; 580 bc = tick_broadcast_device.evtdev;
585 if (bc) 581 if (bc)
586 tick_broadcast_setup_oneshot(bc); 582 tick_broadcast_setup_oneshot(bc);
587
588 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 583 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
589} 584}
590 585
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f..119528de823 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&jiffies_lock); 66 write_seqlock(&xtime_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&jiffies_lock); 72 write_sequnlock(&xtime_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
94 */ 94 */
95 next = ktime_add(dev->next_event, tick_period); 95 next = ktime_add(dev->next_event, tick_period);
96 for (;;) { 96 for (;;) {
97 if (!clockevents_program_event(dev, next, false)) 97 if (!clockevents_program_event(dev, next, ktime_get()))
98 return; 98 return;
99 /* 99 /*
100 * Have to be careful here. If we're in oneshot mode, 100 * Have to be careful here. If we're in oneshot mode,
@@ -130,14 +130,14 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&jiffies_lock); 133 seq = read_seqbegin(&xtime_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&jiffies_lock, seq)); 135 } while (read_seqretry(&xtime_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
139 for (;;) { 139 for (;;) {
140 if (!clockevents_program_event(dev, next, false)) 140 if (!clockevents_program_event(dev, next, ktime_get()))
141 return; 141 return;
142 next = ktime_add(next, tick_period); 142 next = ktime_add(next, tick_period);
143 } 143 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc..1009b06d6f8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,6 +26,8 @@ extern void clockevents_shutdown(struct clock_event_device *dev);
26extern void tick_setup_oneshot(struct clock_event_device *newdev, 26extern void tick_setup_oneshot(struct clock_event_device *newdev,
27 void (*handler)(struct clock_event_device *), 27 void (*handler)(struct clock_event_device *),
28 ktime_t nextevt); 28 ktime_t nextevt);
29extern int tick_dev_program_event(struct clock_event_device *dev,
30 ktime_t expires, int force);
29extern int tick_program_event(ktime_t expires, int force); 31extern int tick_program_event(ktime_t expires, int force);
30extern void tick_oneshot_notify(void); 32extern void tick_oneshot_notify(void);
31extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 33extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -141,3 +143,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 143#endif
142 144
143extern void do_timer(unsigned long ticks); 145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a3..2d04411a5f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,6 +21,74 @@
21 21
22#include "tick-internal.h" 22#include "tick-internal.h"
23 23
24/* Limit min_delta to a jiffie */
25#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
26
27static int tick_increase_min_delta(struct clock_event_device *dev)
28{
29 /* Nothing to do if we already reached the limit */
30 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
31 return -ETIME;
32
33 if (dev->min_delta_ns < 5000)
34 dev->min_delta_ns = 5000;
35 else
36 dev->min_delta_ns += dev->min_delta_ns >> 1;
37
38 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
39 dev->min_delta_ns = MIN_DELTA_LIMIT;
40
41 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
42 dev->name ? dev->name : "?",
43 (unsigned long long) dev->min_delta_ns);
44 return 0;
45}
46
47/**
48 * tick_program_event internal worker function
49 */
50int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
51 int force)
52{
53 ktime_t now = ktime_get();
54 int i;
55
56 for (i = 0;;) {
57 int ret = clockevents_program_event(dev, expires, now);
58
59 if (!ret || !force)
60 return ret;
61
62 dev->retries++;
63 /*
64 * We tried 3 times to program the device with the given
65 * min_delta_ns. If that's not working then we increase it
66 * and emit a warning.
67 */
68 if (++i > 2) {
69 /* Increase the min. delta and try again */
70 if (tick_increase_min_delta(dev)) {
71 /*
72 * Get out of the loop if min_delta_ns
73 * hit the limit already. That's
74 * better than staying here forever.
75 *
76 * We clear next_event so we have a
77 * chance that the box survives.
78 */
79 printk(KERN_WARNING
80 "CE: Reprogramming failure. Giving up\n");
81 dev->next_event.tv64 = KTIME_MAX;
82 return -ETIME;
83 }
84 i = 0;
85 }
86
87 now = ktime_get();
88 expires = ktime_add_ns(now, dev->min_delta_ns);
89 }
90}
91
24/** 92/**
25 * tick_program_event 93 * tick_program_event
26 */ 94 */
@@ -28,7 +96,7 @@ int tick_program_event(ktime_t expires, int force)
28{ 96{
29 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
30 98
31 return clockevents_program_event(dev, expires, force); 99 return tick_dev_program_event(dev, expires, force);
32} 100}
33 101
34/** 102/**
@@ -36,10 +104,11 @@ int tick_program_event(ktime_t expires, int force)
36 */ 104 */
37void tick_resume_oneshot(void) 105void tick_resume_oneshot(void)
38{ 106{
39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 107 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
108 struct clock_event_device *dev = td->evtdev;
40 109
41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 110 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
42 clockevents_program_event(dev, ktime_get(), true); 111 tick_program_event(ktime_get(), 1);
43} 112}
44 113
45/** 114/**
@@ -51,7 +120,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
51{ 120{
52 newdev->event_handler = handler; 121 newdev->event_handler = handler;
53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 122 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
54 clockevents_program_event(newdev, next_event, true); 123 tick_dev_program_event(newdev, next_event, 1);
55} 124}
56 125
57/** 126/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd..d5097c44b40 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -31,7 +31,7 @@
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 32
33/* 33/*
34 * The time, when the last jiffy update happened. Protected by jiffies_lock. 34 * The time, when the last jiffy update happened. Protected by xtime_lock.
35 */ 35 */
36static ktime_t last_jiffies_update; 36static ktime_t last_jiffies_update;
37 37
@@ -49,14 +49,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 49 ktime_t delta;
50 50
51 /* 51 /*
52 * Do a quick check without holding jiffies_lock: 52 * Do a quick check without holding xtime_lock:
53 */ 53 */
54 delta = ktime_sub(now, last_jiffies_update); 54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 55 if (delta.tv64 < tick_period.tv64)
56 return; 56 return;
57 57
 58 /* Reevaluate with jiffies_lock held */ 58 /* Reevaluate with xtime_lock held */
59 write_seqlock(&jiffies_lock); 59 write_seqlock(&xtime_lock);
60 60
61 delta = ktime_sub(now, last_jiffies_update); 61 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 62 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +79,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 81 }
82 write_sequnlock(&jiffies_lock); 82 write_sequnlock(&xtime_lock);
83} 83}
84 84
85/* 85/*
@@ -89,58 +89,15 @@ static ktime_t tick_init_jiffy_update(void)
89{ 89{
90 ktime_t period; 90 ktime_t period;
91 91
92 write_seqlock(&jiffies_lock); 92 write_seqlock(&xtime_lock);
93 /* Did we start the jiffies update yet ? */ 93 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 94 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 95 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 96 period = last_jiffies_update;
97 write_sequnlock(&jiffies_lock); 97 write_sequnlock(&xtime_lock);
98 return period; 98 return period;
99} 99}
100 100
101
102static void tick_sched_do_timer(ktime_t now)
103{
104 int cpu = smp_processor_id();
105
106#ifdef CONFIG_NO_HZ
107 /*
108 * Check if the do_timer duty was dropped. We don't care about
109 * concurrency: This happens only when the cpu in charge went
 110 * into a long sleep. If two cpus happen to assign themselves to
111 * this duty, then the jiffies update is still serialized by
112 * jiffies_lock.
113 */
114 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
115 tick_do_timer_cpu = cpu;
116#endif
117
118 /* Check, if the jiffies need an update */
119 if (tick_do_timer_cpu == cpu)
120 tick_do_update_jiffies64(now);
121}
122
123static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
124{
125#ifdef CONFIG_NO_HZ
126 /*
127 * When we are idle and the tick is stopped, we have to touch
128 * the watchdog as we might not schedule for a really long
129 * time. This happens on complete idle SMP systems while
130 * waiting on the login prompt. We also increment the "start of
131 * idle" jiffy stamp so the idle accounting adjustment we do
 132 * when we go busy again does not account too many ticks.
133 */
134 if (ts->tick_stopped) {
135 touch_softlockup_watchdog();
136 if (is_idle_task(current))
137 ts->idle_jiffies++;
138 }
139#endif
140 update_process_times(user_mode(regs));
141 profile_tick(CPU_PROFILING);
142}
143
144/* 101/*
145 * NOHZ - aka dynamic tick functionality 102 * NOHZ - aka dynamic tick functionality
146 */ 103 */
@@ -148,7 +105,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148/* 105/*
149 * NO HZ enabled ? 106 * NO HZ enabled ?
150 */ 107 */
151int tick_nohz_enabled __read_mostly = 1; 108static int tick_nohz_enabled __read_mostly = 1;
152 109
153/* 110/*
154 * Enable / Disable tickless mode 111 * Enable / Disable tickless mode
@@ -182,6 +139,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
182 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
183 unsigned long flags; 140 unsigned long flags;
184 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
185 ts->idle_waketime = now; 143 ts->idle_waketime = now;
186 144
187 local_irq_save(flags); 145 local_irq_save(flags);
@@ -201,10 +159,9 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
201 159
202 if (ts->idle_active) { 160 if (ts->idle_active) {
203 delta = ktime_sub(now, ts->idle_entrytime); 161 delta = ktime_sub(now, ts->idle_entrytime);
162 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
204 if (nr_iowait_cpu(cpu) > 0) 163 if (nr_iowait_cpu(cpu) > 0)
205 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 164 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
206 else
207 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
208 ts->idle_entrytime = now; 165 ts->idle_entrytime = now;
209 } 166 }
210 167
@@ -225,7 +182,11 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
225 182
226static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
227{ 184{
228 ktime_t now = ktime_get(); 185 ktime_t now;
186
187 now = ktime_get();
188
189 update_ts_time_stats(cpu, ts, now, NULL);
229 190
230 ts->idle_entrytime = now; 191 ts->idle_entrytime = now;
231 ts->idle_active = 1; 192 ts->idle_active = 1;
@@ -236,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
236/** 197/**
237 * get_cpu_idle_time_us - get the total idle time of a cpu 198 * get_cpu_idle_time_us - get the total idle time of a cpu
238 * @cpu: CPU number to query 199 * @cpu: CPU number to query
239 * @last_update_time: variable to store update time in. Do not update 200 * @last_update_time: variable to store update time in
240 * counters if NULL.
241 * 201 *
 242 * Return the cumulative idle time (since boot) for a given 202 * Return the cumulative idle time (since boot) for a given
243 * CPU, in microseconds. 203 * CPU, in microseconds. The idle time returned includes
204 * the iowait time (unlike what "top" and co report).
244 * 205 *
245 * This time is measured via accounting rather than sampling, 206 * This time is measured via accounting rather than sampling,
246 * and is as accurate as ktime_get() is. 207 * and is as accurate as ktime_get() is.
@@ -250,35 +211,20 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
250u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
251{ 212{
252 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
253 ktime_t now, idle;
254 214
255 if (!tick_nohz_enabled) 215 if (!tick_nohz_enabled)
256 return -1; 216 return -1;
257 217
258 now = ktime_get(); 218 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
259 if (last_update_time) {
260 update_ts_time_stats(cpu, ts, now, last_update_time);
261 idle = ts->idle_sleeptime;
262 } else {
263 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
264 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
265
266 idle = ktime_add(ts->idle_sleeptime, delta);
267 } else {
268 idle = ts->idle_sleeptime;
269 }
270 }
271
272 return ktime_to_us(idle);
273 219
220 return ktime_to_us(ts->idle_sleeptime);
274} 221}
275EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 222EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
276 223
277/** 224/*
278 * get_cpu_iowait_time_us - get the total iowait time of a cpu 225 * get_cpu_iowait_time_us - get the total iowait time of a cpu
279 * @cpu: CPU number to query 226 * @cpu: CPU number to query
280 * @last_update_time: variable to store update time in. Do not update 227 * @last_update_time: variable to store update time in
281 * counters if NULL.
282 * 228 *
 283 * Return the cumulative iowait time (since boot) for a given 229 * Return the cumulative iowait time (since boot) for a given
284 * CPU, in microseconds. 230 * CPU, in microseconds.
@@ -291,47 +237,93 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
291u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 237u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
292{ 238{
293 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 239 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
294 ktime_t now, iowait;
295 240
296 if (!tick_nohz_enabled) 241 if (!tick_nohz_enabled)
297 return -1; 242 return -1;
298 243
299 now = ktime_get(); 244 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
300 if (last_update_time) {
301 update_ts_time_stats(cpu, ts, now, last_update_time);
302 iowait = ts->iowait_sleeptime;
303 } else {
304 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
305 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
306
307 iowait = ktime_add(ts->iowait_sleeptime, delta);
308 } else {
309 iowait = ts->iowait_sleeptime;
310 }
311 }
312 245
313 return ktime_to_us(iowait); 246 return ktime_to_us(ts->iowait_sleeptime);
314} 247}
315EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 248EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
316 249
317static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 250/**
318 ktime_t now, int cpu) 251 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
252 *
253 * When the next event is more than a tick into the future, stop the idle tick
254 * Called either from the idle loop or from irq_exit() when an idle period was
255 * just interrupted by an interrupt which did not cause a reschedule.
256 */
257void tick_nohz_stop_sched_tick(int inidle)
319{ 258{
320 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 259 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
321 ktime_t last_update, expires, ret = { .tv64 = 0 }; 260 struct tick_sched *ts;
322 unsigned long rcu_delta_jiffies; 261 ktime_t last_update, expires, now;
323 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 262 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
324 u64 time_delta; 263 u64 time_delta;
264 int cpu;
265
266 local_irq_save(flags);
267
268 cpu = smp_processor_id();
269 ts = &per_cpu(tick_cpu_sched, cpu);
270
271 /*
272 * Call to tick_nohz_start_idle stops the last_update_time from being
273 * updated. Thus, it must not be called in the event we are called from
274 * irq_exit() with the prior state different than idle.
275 */
276 if (!inidle && !ts->inidle)
277 goto end;
278
279 /*
280 * Set ts->inidle unconditionally. Even if the system did not
281 * switch to NOHZ mode, the cpu frequency governors rely on the
282 * update of the idle time accounting in tick_nohz_start_idle().
283 */
284 ts->inidle = 1;
325 285
286 now = tick_nohz_start_idle(cpu, ts);
287
288 /*
289 * If this cpu is offline and it is the one which updates
290 * jiffies, then give up the assignment and let it be taken by
291 * the cpu which runs the tick timer next. If we don't drop
292 * this here the jiffies might be stale and do_timer() is never
293 * invoked.
294 */
295 if (unlikely(!cpu_online(cpu))) {
296 if (cpu == tick_do_timer_cpu)
297 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
298 }
299
300 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
301 goto end;
302
303 if (need_resched())
304 goto end;
305
306 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
307 static int ratelimit;
308
309 if (ratelimit < 10) {
310 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
311 (unsigned int) local_softirq_pending());
312 ratelimit++;
313 }
314 goto end;
315 }
316
317 ts->idle_calls++;
326 /* Read jiffies and the time when jiffies were updated last */ 318 /* Read jiffies and the time when jiffies were updated last */
327 do { 319 do {
328 seq = read_seqbegin(&jiffies_lock); 320 seq = read_seqbegin(&xtime_lock);
329 last_update = last_jiffies_update; 321 last_update = last_jiffies_update;
330 last_jiffies = jiffies; 322 last_jiffies = jiffies;
331 time_delta = timekeeping_max_deferment(); 323 time_delta = timekeeping_max_deferment();
332 } while (read_seqretry(&jiffies_lock, seq)); 324 } while (read_seqretry(&xtime_lock, seq));
333 325
334 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 326 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
335 arch_needs_cpu(cpu)) { 327 arch_needs_cpu(cpu)) {
336 next_jiffies = last_jiffies + 1; 328 next_jiffies = last_jiffies + 1;
337 delta_jiffies = 1; 329 delta_jiffies = 1;
@@ -339,10 +331,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
339 /* Get the next timer wheel timer */ 331 /* Get the next timer wheel timer */
340 next_jiffies = get_next_timer_interrupt(last_jiffies); 332 next_jiffies = get_next_timer_interrupt(last_jiffies);
341 delta_jiffies = next_jiffies - last_jiffies; 333 delta_jiffies = next_jiffies - last_jiffies;
342 if (rcu_delta_jiffies < delta_jiffies) {
343 next_jiffies = last_jiffies + rcu_delta_jiffies;
344 delta_jiffies = rcu_delta_jiffies;
345 }
346 } 334 }
347 /* 335 /*
348 * Do not stop the tick, if we are only one off 336 * Do not stop the tick, if we are only one off
@@ -401,12 +389,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
401 else 389 else
402 expires.tv64 = KTIME_MAX; 390 expires.tv64 = KTIME_MAX;
403 391
392 if (delta_jiffies > 1)
393 cpumask_set_cpu(cpu, nohz_cpu_mask);
394
404 /* Skip reprogram of event if its not changed */ 395 /* Skip reprogram of event if its not changed */
405 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
406 goto out; 397 goto out;
407 398
408 ret = expires;
409
410 /* 399 /*
411 * nohz_stop_sched_tick can be called several times before 400 * nohz_stop_sched_tick can be called several times before
412 * the nohz_restart_sched_tick is called. This happens when 401 * the nohz_restart_sched_tick is called. This happens when
@@ -415,13 +404,19 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
415 * the scheduler tick in nohz_restart_sched_tick. 404 * the scheduler tick in nohz_restart_sched_tick.
416 */ 405 */
417 if (!ts->tick_stopped) { 406 if (!ts->tick_stopped) {
418 nohz_balance_enter_idle(cpu); 407 select_nohz_load_balancer(1);
419 calc_load_enter_idle();
420 408
421 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
422 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
411 ts->idle_jiffies = last_jiffies;
412 rcu_enter_nohz();
423 } 413 }
424 414
415 ts->idle_sleeps++;
416
417 /* Mark expires */
418 ts->idle_expires = expires;
419
425 /* 420 /*
426 * If the expiration time == KTIME_MAX, then 421 * If the expiration time == KTIME_MAX, then
427 * in this case we simply stop the tick timer. 422 * in this case we simply stop the tick timer.
@@ -446,132 +441,15 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
446 * softirq. 441 * softirq.
447 */ 442 */
448 tick_do_update_jiffies64(ktime_get()); 443 tick_do_update_jiffies64(ktime_get());
444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
449 } 445 }
450 raise_softirq_irqoff(TIMER_SOFTIRQ); 446 raise_softirq_irqoff(TIMER_SOFTIRQ);
451out: 447out:
452 ts->next_jiffies = next_jiffies; 448 ts->next_jiffies = next_jiffies;
453 ts->last_jiffies = last_jiffies; 449 ts->last_jiffies = last_jiffies;
454 ts->sleep_length = ktime_sub(dev->next_event, now); 450 ts->sleep_length = ktime_sub(dev->next_event, now);
455 451end:
456 return ret; 452 local_irq_restore(flags);
457}
458
459static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
460{
461 /*
462 * If this cpu is offline and it is the one which updates
463 * jiffies, then give up the assignment and let it be taken by
464 * the cpu which runs the tick timer next. If we don't drop
465 * this here the jiffies might be stale and do_timer() is never
466 * invoked.
467 */
468 if (unlikely(!cpu_online(cpu))) {
469 if (cpu == tick_do_timer_cpu)
470 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
471 }
472
473 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
474 return false;
475
476 if (need_resched())
477 return false;
478
479 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
480 static int ratelimit;
481
482 if (ratelimit < 10 &&
483 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
484 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
485 (unsigned int) local_softirq_pending());
486 ratelimit++;
487 }
488 return false;
489 }
490
491 return true;
492}
493
494static void __tick_nohz_idle_enter(struct tick_sched *ts)
495{
496 ktime_t now, expires;
497 int cpu = smp_processor_id();
498
499 now = tick_nohz_start_idle(cpu, ts);
500
501 if (can_stop_idle_tick(cpu, ts)) {
502 int was_stopped = ts->tick_stopped;
503
504 ts->idle_calls++;
505
506 expires = tick_nohz_stop_sched_tick(ts, now, cpu);
507 if (expires.tv64 > 0LL) {
508 ts->idle_sleeps++;
509 ts->idle_expires = expires;
510 }
511
512 if (!was_stopped && ts->tick_stopped)
513 ts->idle_jiffies = ts->last_jiffies;
514 }
515}
516
517/**
518 * tick_nohz_idle_enter - stop the idle tick from the idle task
519 *
520 * When the next event is more than a tick into the future, stop the idle tick
521 * Called when we start the idle loop.
522 *
523 * The arch is responsible for calling:
524 *
525 * - rcu_idle_enter() after its last use of RCU before the CPU is put
526 * to sleep.
527 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
528 */
529void tick_nohz_idle_enter(void)
530{
531 struct tick_sched *ts;
532
533 WARN_ON_ONCE(irqs_disabled());
534
535 /*
536 * Update the idle state in the scheduler domain hierarchy
537 * when tick_nohz_stop_sched_tick() is called from the idle loop.
538 * State will be updated to busy during the first busy tick after
539 * exiting idle.
540 */
541 set_cpu_sd_state_idle();
542
543 local_irq_disable();
544
545 ts = &__get_cpu_var(tick_cpu_sched);
546 /*
547 * set ts->inidle unconditionally. even if the system did not
548 * switch to nohz mode the cpu frequency governers rely on the
549 * update of the idle time accounting in tick_nohz_start_idle().
550 */
551 ts->inidle = 1;
552 __tick_nohz_idle_enter(ts);
553
554 local_irq_enable();
555}
556
557/**
558 * tick_nohz_irq_exit - update next tick event from interrupt exit
559 *
560 * When an interrupt fires while we are idle and it doesn't cause
561 * a reschedule, it may still add, modify or delete a timer, enqueue
562 * an RCU callback, etc...
563 * So we need to re-calculate and reprogram the next tick event.
564 */
565void tick_nohz_irq_exit(void)
566{
567 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
568
569 if (!ts->inidle)
570 return;
571
572 /* Cancel the timer because the CPU has already woken up from the C-states */
573 menu_hrtimer_cancel();
574 __tick_nohz_idle_enter(ts);
575} 453}
576 454
577/** 455/**
@@ -589,7 +467,7 @@ ktime_t tick_nohz_get_sleep_length(void)
589static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) 467static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
590{ 468{
591 hrtimer_cancel(&ts->sched_timer); 469 hrtimer_cancel(&ts->sched_timer);
592 hrtimer_set_expires(&ts->sched_timer, ts->last_tick); 470 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
593 471
594 while (1) { 472 while (1) {
595 /* Forward the time to expire in the future */ 473 /* Forward the time to expire in the future */
@@ -606,33 +484,49 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
606 hrtimer_get_expires(&ts->sched_timer), 0)) 484 hrtimer_get_expires(&ts->sched_timer), 0))
607 break; 485 break;
608 } 486 }
609 /* Reread time and update jiffies */ 487 /* Update jiffies and reread time */
610 now = ktime_get();
611 tick_do_update_jiffies64(now); 488 tick_do_update_jiffies64(now);
489 now = ktime_get();
612 } 490 }
613} 491}
614 492
615static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 493/**
494 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
495 *
496 * Restart the idle tick when the CPU is woken up from idle
497 */
498void tick_nohz_restart_sched_tick(void)
616{ 499{
617 /* Update jiffies first */ 500 int cpu = smp_processor_id();
618 tick_do_update_jiffies64(now); 501 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
619 update_cpu_load_nohz(); 502#ifndef CONFIG_VIRT_CPU_ACCOUNTING
503 unsigned long ticks;
504#endif
505 ktime_t now;
620 506
621 calc_load_exit_idle(); 507 local_irq_disable();
622 touch_softlockup_watchdog(); 508 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
623 /* 509 now = ktime_get();
624 * Cancel the scheduled timer and restore the tick
625 */
626 ts->tick_stopped = 0;
627 ts->idle_exittime = now;
628 510
629 tick_nohz_restart(ts, now); 511 if (ts->idle_active)
630} 512 tick_nohz_stop_idle(cpu, now);
513
514 if (!ts->inidle || !ts->tick_stopped) {
515 ts->inidle = 0;
516 local_irq_enable();
517 return;
518 }
519
520 ts->inidle = 0;
521
522 rcu_exit_nohz();
523
524 /* Update jiffies first */
525 select_nohz_load_balancer(0);
526 tick_do_update_jiffies64(now);
527 cpumask_clear_cpu(cpu, nohz_cpu_mask);
631 528
632static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
633{
634#ifndef CONFIG_VIRT_CPU_ACCOUNTING 529#ifndef CONFIG_VIRT_CPU_ACCOUNTING
635 unsigned long ticks;
636 /* 530 /*
637 * We stopped the tick in idle. Update process times would miss the 531 * We stopped the tick in idle. Update process times would miss the
638 * time we slept as update_process_times does only a 1 tick 532 * time we slept as update_process_times does only a 1 tick
@@ -645,39 +539,15 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
645 if (ticks && ticks < LONG_MAX) 539 if (ticks && ticks < LONG_MAX)
646 account_idle_ticks(ticks); 540 account_idle_ticks(ticks);
647#endif 541#endif
648}
649 542
650/** 543 touch_softlockup_watchdog();
651 * tick_nohz_idle_exit - restart the idle tick from the idle task 544 /*
652 * 545 * Cancel the scheduled timer and restore the tick
653 * Restart the idle tick when the CPU is woken up from idle 546 */
654 * This also exit the RCU extended quiescent state. The CPU 547 ts->tick_stopped = 0;
655 * can use RCU again after this function is called. 548 ts->idle_exittime = now;
656 */
657void tick_nohz_idle_exit(void)
658{
659 int cpu = smp_processor_id();
660 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
661 ktime_t now;
662
663 local_irq_disable();
664
665 WARN_ON_ONCE(!ts->inidle);
666
667 ts->inidle = 0;
668
669 /* Cancel the timer because the CPU has already woken up from the C-states */
670 menu_hrtimer_cancel();
671 if (ts->idle_active || ts->tick_stopped)
672 now = ktime_get();
673
674 if (ts->idle_active)
675 tick_nohz_stop_idle(cpu, now);
676 549
677 if (ts->tick_stopped) { 550 tick_nohz_restart(ts, now);
678 tick_nohz_restart_sched_tick(ts, now);
679 tick_nohz_account_idle_ticks(ts);
680 }
681 551
682 local_irq_enable(); 552 local_irq_enable();
683} 553}
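On the right-hand side of these hunks, the stop/restart pair is the public interface used by the idle loop. A rough sketch of how an architecture's idle loop brackets its low-power wait with it is shown below; cpu_relax() stands in for the real low-power instruction, demo_cpu_idle is an illustrative name, and preemption handling is omitted.

#include <linux/sched.h>
#include <linux/tick.h>

/* Sketch only: simplified arch idle loop using the old NOHZ interface. */
void demo_cpu_idle(void)
{
	while (1) {
		tick_nohz_stop_sched_tick(1);	/* inidle == 1: called from the idle loop */

		while (!need_resched())
			cpu_relax();		/* placeholder for the real low-power wait */

		tick_nohz_restart_sched_tick();
		schedule();			/* real code wraps this in preempt handling */
	}
}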
@@ -695,12 +565,40 @@ static void tick_nohz_handler(struct clock_event_device *dev)
695{ 565{
696 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 566 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
697 struct pt_regs *regs = get_irq_regs(); 567 struct pt_regs *regs = get_irq_regs();
568 int cpu = smp_processor_id();
698 ktime_t now = ktime_get(); 569 ktime_t now = ktime_get();
699 570
700 dev->next_event.tv64 = KTIME_MAX; 571 dev->next_event.tv64 = KTIME_MAX;
701 572
702 tick_sched_do_timer(now); 573 /*
703 tick_sched_handle(ts, regs); 574 * Check if the do_timer duty was dropped. We don't care about
575 * concurrency: This happens only when the cpu in charge went
576 * into a long sleep. If two cpus happen to assign themselves to
577 * this duty, then the jiffies update is still serialized by
578 * xtime_lock.
579 */
580 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
581 tick_do_timer_cpu = cpu;
582
583 /* Check, if the jiffies need an update */
584 if (tick_do_timer_cpu == cpu)
585 tick_do_update_jiffies64(now);
586
587 /*
588 * When we are idle and the tick is stopped, we have to touch
589 * the watchdog as we might not schedule for a really long
590 * time. This happens on complete idle SMP systems while
591 * waiting on the login prompt. We also increment the "start
592 * of idle" jiffy stamp so the idle accounting adjustment we
593 * do when we go busy again does not account for too many ticks.
594 */
595 if (ts->tick_stopped) {
596 touch_softlockup_watchdog();
597 ts->idle_jiffies++;
598 }
599
600 update_process_times(user_mode(regs));
601 profile_tick(CPU_PROFILING);
704 602
705 while (tick_nohz_reprogram(ts, now)) { 603 while (tick_nohz_reprogram(ts, now)) {
706 now = ktime_get(); 604 now = ktime_get();
@@ -742,6 +640,8 @@ static void tick_nohz_switch_to_nohz(void)
742 next = ktime_add(next, tick_period); 640 next = ktime_add(next, tick_period);
743 } 641 }
744 local_irq_enable(); 642 local_irq_enable();
643
644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
745} 645}
746 646
747/* 647/*
@@ -813,7 +713,7 @@ void tick_check_idle(int cpu)
813#ifdef CONFIG_HIGH_RES_TIMERS 713#ifdef CONFIG_HIGH_RES_TIMERS
814/* 714/*
815 * We rearm the timer until we get disabled by the idle code. 715 * We rearm the timer until we get disabled by the idle code.
816 * Called with interrupts disabled. 716 * Called with interrupts disabled and timer->base->cpu_base->lock held.
817 */ 717 */
818static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 718static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
819{ 719{
@@ -821,31 +721,50 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
821 container_of(timer, struct tick_sched, sched_timer); 721 container_of(timer, struct tick_sched, sched_timer);
822 struct pt_regs *regs = get_irq_regs(); 722 struct pt_regs *regs = get_irq_regs();
823 ktime_t now = ktime_get(); 723 ktime_t now = ktime_get();
724 int cpu = smp_processor_id();
824 725
825 tick_sched_do_timer(now); 726#ifdef CONFIG_NO_HZ
727 /*
728 * Check if the do_timer duty was dropped. We don't care about
729 * concurrency: This happens only when the cpu in charge went
730 * into a long sleep. If two cpus happen to assign themselves to
731 * this duty, then the jiffies update is still serialized by
732 * xtime_lock.
733 */
734 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
735 tick_do_timer_cpu = cpu;
736#endif
737
738 /* Check, if the jiffies need an update */
739 if (tick_do_timer_cpu == cpu)
740 tick_do_update_jiffies64(now);
826 741
827 /* 742 /*
828 * Do not call, when we are not in irq context and have 743 * Do not call, when we are not in irq context and have
829 * no valid regs pointer 744 * no valid regs pointer
830 */ 745 */
831 if (regs) 746 if (regs) {
832 tick_sched_handle(ts, regs); 747 /*
748 * When we are idle and the tick is stopped, we have to touch
749 * the watchdog as we might not schedule for a really long
750 * time. This happens on complete idle SMP systems while
751 * waiting on the login prompt. We also increment the "start of
752 * idle" jiffy stamp so the idle accounting adjustment we do
753 * when we go busy again does not account for too many ticks.
754 */
755 if (ts->tick_stopped) {
756 touch_softlockup_watchdog();
757 ts->idle_jiffies++;
758 }
759 update_process_times(user_mode(regs));
760 profile_tick(CPU_PROFILING);
761 }
833 762
834 hrtimer_forward(timer, now, tick_period); 763 hrtimer_forward(timer, now, tick_period);
835 764
836 return HRTIMER_RESTART; 765 return HRTIMER_RESTART;
837} 766}
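The tick emulation above relies on the standard self-rearming hrtimer pattern: forward the timer by one period from its callback and return HRTIMER_RESTART. A stripped-down, hypothetical version of that pattern looks like this (demo_* names are illustrative, not part of this patch):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;
static ktime_t demo_period;

/* Re-arm ourselves one period into the future and keep running. */
static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
{
	hrtimer_forward(timer, hrtimer_cb_get_time(timer), demo_period);
	return HRTIMER_RESTART;
}

static void demo_timer_start(void)
{
	demo_period = ktime_set(0, NSEC_PER_SEC / HZ);	/* one tick */
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, demo_period, HRTIMER_MODE_REL);
}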
838 767
839static int sched_skew_tick;
840
841static int __init skew_tick(char *str)
842{
843 get_option(&str, &sched_skew_tick);
844
845 return 0;
846}
847early_param("skew_tick", skew_tick);
848
849/** 768/**
850 * tick_setup_sched_timer - setup the tick emulation timer 769 * tick_setup_sched_timer - setup the tick emulation timer
851 */ 770 */
@@ -863,14 +782,6 @@ void tick_setup_sched_timer(void)
863 /* Get the next period (per cpu) */ 782 /* Get the next period (per cpu) */
864 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 783 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
865 784
866 /* Offset the tick to avert jiffies_lock contention. */
867 if (sched_skew_tick) {
868 u64 offset = ktime_to_ns(tick_period) >> 1;
869 do_div(offset, num_possible_cpus());
870 offset *= smp_processor_id();
871 hrtimer_add_expires_ns(&ts->sched_timer, offset);
872 }
873
874 for (;;) { 785 for (;;) {
875 hrtimer_forward(&ts->sched_timer, now, tick_period); 786 hrtimer_forward(&ts->sched_timer, now, tick_period);
876 hrtimer_start_expires(&ts->sched_timer, 787 hrtimer_start_expires(&ts->sched_timer,
@@ -882,8 +793,10 @@ void tick_setup_sched_timer(void)
882 } 793 }
883 794
884#ifdef CONFIG_NO_HZ 795#ifdef CONFIG_NO_HZ
885 if (tick_nohz_enabled) 796 if (tick_nohz_enabled) {
886 ts->nohz_mode = NOHZ_MODE_HIGHRES; 797 ts->nohz_mode = NOHZ_MODE_HIGHRES;
798 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
799 }
887#endif 800#endif
888} 801}
889#endif /* HIGH_RES_TIMERS */ 802#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3..6f9798bf240 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,7 +8,6 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/percpu.h> 13#include <linux/percpu.h>
@@ -21,60 +20,37 @@
21#include <linux/time.h> 20#include <linux/time.h>
22#include <linux/tick.h> 21#include <linux/tick.h>
23#include <linux/stop_machine.h> 22#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
25 23
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* The shift value of the current clocksource. */
29 int shift;
30
31 /* Number of clock cycles in one NTP interval. */
32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval;
35 /* shifted nano seconds left over when rounding cycle_interval */
36 s64 xtime_remainder;
37 /* Raw nano seconds accumulated per NTP interval. */
38 u32 raw_interval;
39
40 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
41 u64 xtime_nsec;
42 /* Difference between accumulated time and NTP time in ntp
43 * shifted nano seconds. */
44 s64 ntp_error;
45 /* Shift conversion between clock shifted nano seconds and
46 * ntp shifted nano seconds. */
47 int ntp_error_shift;
48 /* NTP adjusted clock multiplier */
49 u32 mult;
50};
26 51
27static struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
28 53
29/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended;
31
32static inline void tk_normalize_xtime(struct timekeeper *tk)
33{
34 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
35 tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
36 tk->xtime_sec++;
37 }
38}
39
40static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
41{
42 tk->xtime_sec = ts->tv_sec;
43 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
44}
45
46static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
47{
48 tk->xtime_sec += ts->tv_sec;
49 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
50 tk_normalize_xtime(tk);
51}
52
53static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
54{
55 struct timespec tmp;
56
57 /*
58 * Verify consistency of: offset_real = -wall_to_monotonic
59 * before modifying anything
60 */
61 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
62 -tk->wall_to_monotonic.tv_nsec);
63 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
64 tk->wall_to_monotonic = wtm;
65 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
66 tk->offs_real = timespec_to_ktime(tmp);
67}
68
69static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
70{
71 /* Verify consistency before modifying */
72 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
73
74 tk->total_sleep_time = t;
75 tk->offs_boot = timespec_to_ktime(t);
76}
77
78/** 54/**
79 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
80 * 56 *
@@ -85,14 +61,12 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
85 * 61 *
86 * Unless you're the timekeeping code, you should not be using this! 62 * Unless you're the timekeeping code, you should not be using this!
87 */ 63 */
88static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) 64static void timekeeper_setup_internals(struct clocksource *clock)
89{ 65{
90 cycle_t interval; 66 cycle_t interval;
91 u64 tmp, ntpinterval; 67 u64 tmp, ntpinterval;
92 struct clocksource *old_clock;
93 68
94 old_clock = tk->clock; 69 timekeeper.clock = clock;
95 tk->clock = clock;
96 clock->cycle_last = clock->read(clock); 70 clock->cycle_last = clock->read(clock);
97 71
98 /* Do the ns -> cycle conversion first, using original mult */ 72 /* Do the ns -> cycle conversion first, using original mult */
@@ -105,133 +79,103 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
105 tmp = 1; 79 tmp = 1;
106 80
107 interval = (cycle_t) tmp; 81 interval = (cycle_t) tmp;
108 tk->cycle_interval = interval; 82 timekeeper.cycle_interval = interval;
109 83
110 /* Go back from cycles -> shifted ns */ 84 /* Go back from cycles -> shifted ns */
111 tk->xtime_interval = (u64) interval * clock->mult; 85 timekeeper.xtime_interval = (u64) interval * clock->mult;
112 tk->xtime_remainder = ntpinterval - tk->xtime_interval; 86 timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
113 tk->raw_interval = 87 timekeeper.raw_interval =
114 ((u64) interval * clock->mult) >> clock->shift; 88 ((u64) interval * clock->mult) >> clock->shift;
115 89
116 /* if changing clocks, convert xtime_nsec shift units */ 90 timekeeper.xtime_nsec = 0;
117 if (old_clock) { 91 timekeeper.shift = clock->shift;
118 int shift_change = clock->shift - old_clock->shift;
119 if (shift_change < 0)
120 tk->xtime_nsec >>= -shift_change;
121 else
122 tk->xtime_nsec <<= shift_change;
123 }
124 tk->shift = clock->shift;
125 92
126 tk->ntp_error = 0; 93 timekeeper.ntp_error = 0;
127 tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; 94 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
128 95
129 /* 96 /*
130 * The timekeeper keeps its own mult values for the currently 97 * The timekeeper keeps its own mult values for the currently
131 * active clocksource. These values will be adjusted via NTP 98 * active clocksource. These values will be adjusted via NTP
132 * to counteract clock drifting. 99 * to counteract clock drifting.
133 */ 100 */
134 tk->mult = clock->mult; 101 timekeeper.mult = clock->mult;
135} 102}
136 103
137/* Timekeeper helper functions. */ 104/* Timekeeper helper functions. */
138static inline s64 timekeeping_get_ns(struct timekeeper *tk) 105static inline s64 timekeeping_get_ns(void)
139{ 106{
140 cycle_t cycle_now, cycle_delta; 107 cycle_t cycle_now, cycle_delta;
141 struct clocksource *clock; 108 struct clocksource *clock;
142 s64 nsec;
143 109
144 /* read clocksource: */ 110 /* read clocksource: */
145 clock = tk->clock; 111 clock = timekeeper.clock;
146 cycle_now = clock->read(clock); 112 cycle_now = clock->read(clock);
147 113
148 /* calculate the delta since the last update_wall_time: */ 114 /* calculate the delta since the last update_wall_time: */
149 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 115 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
150 116
151 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 117 /* return delta converted to nanoseconds using ntp adjusted mult. */
152 nsec >>= tk->shift; 118 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
153 119 timekeeper.shift);
154 /* If arch requires, add in gettimeoffset() */
155 return nsec + arch_gettimeoffset();
156} 120}
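For reference, the conversion used throughout these helpers is the fixed-point form behind clocksource_cyc2ns(). A condensed restatement follows; the example numbers are illustrative only.

#include <linux/clocksource.h>

/* What clocksource_cyc2ns() boils down to: fixed-point multiply, shift down. */
static inline s64 demo_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
	return ((u64) cycles * mult) >> shift;
}

/*
 * Illustrative numbers: a 1 MHz clocksource (1000 ns per cycle) with
 * shift = 22 would use mult ~= 1000 << 22, so demo_cyc2ns(5, mult, 22)
 * yields 5000 ns.
 */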
157 121
158static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 122static inline s64 timekeeping_get_ns_raw(void)
159{ 123{
160 cycle_t cycle_now, cycle_delta; 124 cycle_t cycle_now, cycle_delta;
161 struct clocksource *clock; 125 struct clocksource *clock;
162 s64 nsec;
163 126
164 /* read clocksource: */ 127 /* read clocksource: */
165 clock = tk->clock; 128 clock = timekeeper.clock;
166 cycle_now = clock->read(clock); 129 cycle_now = clock->read(clock);
167 130
168 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
169 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
170 133
171 /* convert delta to nanoseconds. */ 134 /* return delta converted to nanoseconds using the raw clock mult and shift. */
172 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173
174 /* If arch requires, add in gettimeoffset() */
175 return nsec + arch_gettimeoffset();
176}
177
178static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
179
180static void update_pvclock_gtod(struct timekeeper *tk)
181{
182 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
183} 136}
184 137
185/** 138/*
186 * pvclock_gtod_register_notifier - register a pvclock timedata update listener 139 * This read-write spinlock protects us from races in SMP while
187 * 140 * playing with xtime.
188 * Must hold write on timekeeper.lock
189 */ 141 */
190int pvclock_gtod_register_notifier(struct notifier_block *nb) 142__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
191{
192 struct timekeeper *tk = &timekeeper;
193 unsigned long flags;
194 int ret;
195
196 write_seqlock_irqsave(&tk->lock, flags);
197 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
198 /* update timekeeping data */
199 update_pvclock_gtod(tk);
200 write_sequnlock_irqrestore(&tk->lock, flags);
201 143
202 return ret;
203}
204EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
205 144
206/** 145/*
207 * pvclock_gtod_unregister_notifier - unregister a pvclock 146 * The current time
208 * timedata update listener 147 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
148 * for sub-jiffy times) to get to monotonic time. Monotonic is pegged
149 * at zero at system boot time, so wall_to_monotonic will be negative,
150 * however, we will ALWAYS keep the tv_nsec part positive so we can use
151 * the usual normalization.
209 * 152 *
210 * Must hold write on timekeeper.lock 153 * wall_to_monotonic is moved after resume from suspend for the monotonic
154 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
155 * to get the real boot based time offset.
156 *
157 * - wall_to_monotonic is no longer the boot time, getboottime must be
158 * used instead.
211 */ 159 */
212int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 160static struct timespec xtime __attribute__ ((aligned (16)));
213{ 161static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
214 struct timekeeper *tk = &timekeeper; 162static struct timespec total_sleep_time;
215 unsigned long flags;
216 int ret;
217 163
218 write_seqlock_irqsave(&tk->lock, flags); 164/*
219 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
220 write_sequnlock_irqrestore(&tk->lock, flags); 166 */
167static struct timespec raw_time;
221 168
222 return ret; 169/* flag for if timekeeping is suspended */
223} 170int __read_mostly timekeeping_suspended;
224EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
225 171
226/* must hold write on timekeeper.lock */ 172/* must hold xtime_lock */
227static void timekeeping_update(struct timekeeper *tk, bool clearntp) 173void timekeeping_leap_insert(int leapsecond)
228{ 174{
229 if (clearntp) { 175 xtime.tv_sec += leapsecond;
230 tk->ntp_error = 0; 176 wall_to_monotonic.tv_sec -= leapsecond;
231 ntp_clear(); 177 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
232 } 178 timekeeper.mult);
233 update_vsyscall(tk);
234 update_pvclock_gtod(tk);
235} 179}
236 180
237/** 181/**
@@ -241,26 +185,27 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
241 * update_wall_time(). This is useful before significant clock changes, 185 * update_wall_time(). This is useful before significant clock changes,
242 * as it avoids having to deal with this time offset explicitly. 186 * as it avoids having to deal with this time offset explicitly.
243 */ 187 */
244static void timekeeping_forward_now(struct timekeeper *tk) 188static void timekeeping_forward_now(void)
245{ 189{
246 cycle_t cycle_now, cycle_delta; 190 cycle_t cycle_now, cycle_delta;
247 struct clocksource *clock; 191 struct clocksource *clock;
248 s64 nsec; 192 s64 nsec;
249 193
250 clock = tk->clock; 194 clock = timekeeper.clock;
251 cycle_now = clock->read(clock); 195 cycle_now = clock->read(clock);
252 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 196 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
253 clock->cycle_last = cycle_now; 197 clock->cycle_last = cycle_now;
254 198
255 tk->xtime_nsec += cycle_delta * tk->mult; 199 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
200 timekeeper.shift);
256 201
257 /* If arch requires, add in gettimeoffset() */ 202 /* If arch requires, add in gettimeoffset() */
258 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 203 nsec += arch_gettimeoffset();
259 204
260 tk_normalize_xtime(tk); 205 timespec_add_ns(&xtime, nsec);
261 206
262 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 207 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
263 timespec_add_ns(&tk->raw_time, nsec); 208 timespec_add_ns(&raw_time, nsec);
264} 209}
265 210
266/** 211/**
@@ -271,39 +216,43 @@ static void timekeeping_forward_now(struct timekeeper *tk)
271 */ 216 */
272void getnstimeofday(struct timespec *ts) 217void getnstimeofday(struct timespec *ts)
273{ 218{
274 struct timekeeper *tk = &timekeeper;
275 unsigned long seq; 219 unsigned long seq;
276 s64 nsecs = 0; 220 s64 nsecs;
277 221
278 WARN_ON(timekeeping_suspended); 222 WARN_ON(timekeeping_suspended);
279 223
280 do { 224 do {
281 seq = read_seqbegin(&tk->lock); 225 seq = read_seqbegin(&xtime_lock);
282 226
283 ts->tv_sec = tk->xtime_sec; 227 *ts = xtime;
284 nsecs = timekeeping_get_ns(tk); 228 nsecs = timekeeping_get_ns();
285 229
286 } while (read_seqretry(&tk->lock, seq)); 230 /* If arch requires, add in gettimeoffset() */
231 nsecs += arch_gettimeoffset();
232
233 } while (read_seqretry(&xtime_lock, seq));
287 234
288 ts->tv_nsec = 0;
289 timespec_add_ns(ts, nsecs); 235 timespec_add_ns(ts, nsecs);
290} 236}
237
291EXPORT_SYMBOL(getnstimeofday); 238EXPORT_SYMBOL(getnstimeofday);
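A minimal caller sketch for the exported accessor; demo_stamp_event is a hypothetical driver helper, not part of this patch.

#include <linux/kernel.h>
#include <linux/time.h>

/* Hypothetical caller: stamp an event with the current wall-clock time. */
static void demo_stamp_event(struct timespec *stamp)
{
	getnstimeofday(stamp);
	pr_debug("event at %ld.%09ld\n",
		 (long) stamp->tv_sec, (long) stamp->tv_nsec);
}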
292 239
293ktime_t ktime_get(void) 240ktime_t ktime_get(void)
294{ 241{
295 struct timekeeper *tk = &timekeeper;
296 unsigned int seq; 242 unsigned int seq;
297 s64 secs, nsecs; 243 s64 secs, nsecs;
298 244
299 WARN_ON(timekeeping_suspended); 245 WARN_ON(timekeeping_suspended);
300 246
301 do { 247 do {
302 seq = read_seqbegin(&tk->lock); 248 seq = read_seqbegin(&xtime_lock);
303 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
304 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
305 251 nsecs += timekeeping_get_ns();
306 } while (read_seqretry(&tk->lock, seq)); 252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
254
255 } while (read_seqretry(&xtime_lock, seq));
307 /* 256 /*
308 * Use ktime_set/ktime_add_ns to create a proper ktime on 257 * Use ktime_set/ktime_add_ns to create a proper ktime on
309 * 32-bit architectures without CONFIG_KTIME_SCALAR. 258 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -322,24 +271,24 @@ EXPORT_SYMBOL_GPL(ktime_get);
322 */ 271 */
323void ktime_get_ts(struct timespec *ts) 272void ktime_get_ts(struct timespec *ts)
324{ 273{
325 struct timekeeper *tk = &timekeeper;
326 struct timespec tomono; 274 struct timespec tomono;
327 s64 nsec;
328 unsigned int seq; 275 unsigned int seq;
276 s64 nsecs;
329 277
330 WARN_ON(timekeeping_suspended); 278 WARN_ON(timekeeping_suspended);
331 279
332 do { 280 do {
333 seq = read_seqbegin(&tk->lock); 281 seq = read_seqbegin(&xtime_lock);
334 ts->tv_sec = tk->xtime_sec; 282 *ts = xtime;
335 nsec = timekeeping_get_ns(tk); 283 tomono = wall_to_monotonic;
336 tomono = tk->wall_to_monotonic; 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
337 287
338 } while (read_seqretry(&tk->lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
339 289
340 ts->tv_sec += tomono.tv_sec; 290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
341 ts->tv_nsec = 0; 291 ts->tv_nsec + tomono.tv_nsec + nsecs);
342 timespec_add_ns(ts, nsec + tomono.tv_nsec);
343} 292}
344EXPORT_SYMBOL_GPL(ktime_get_ts); 293EXPORT_SYMBOL_GPL(ktime_get_ts);
345 294
@@ -356,23 +305,28 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
356 */ 305 */
357void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 306void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
358{ 307{
359 struct timekeeper *tk = &timekeeper;
360 unsigned long seq; 308 unsigned long seq;
361 s64 nsecs_raw, nsecs_real; 309 s64 nsecs_raw, nsecs_real;
362 310
363 WARN_ON_ONCE(timekeeping_suspended); 311 WARN_ON_ONCE(timekeeping_suspended);
364 312
365 do { 313 do {
366 seq = read_seqbegin(&tk->lock); 314 u32 arch_offset;
367 315
368 *ts_raw = tk->raw_time; 316 seq = read_seqbegin(&xtime_lock);
369 ts_real->tv_sec = tk->xtime_sec;
370 ts_real->tv_nsec = 0;
371 317
372 nsecs_raw = timekeeping_get_ns_raw(tk); 318 *ts_raw = raw_time;
373 nsecs_real = timekeeping_get_ns(tk); 319 *ts_real = xtime;
374 320
375 } while (read_seqretry(&tk->lock, seq)); 321 nsecs_raw = timekeeping_get_ns_raw();
322 nsecs_real = timekeeping_get_ns();
323
324 /* If arch requires, add in gettimeoffset() */
325 arch_offset = arch_gettimeoffset();
326 nsecs_raw += arch_offset;
327 nsecs_real += arch_offset;
328
329 } while (read_seqretry(&xtime_lock, seq));
376 330
377 timespec_add_ns(ts_raw, nsecs_raw); 331 timespec_add_ns(ts_raw, nsecs_raw);
378 timespec_add_ns(ts_real, nsecs_real); 332 timespec_add_ns(ts_real, nsecs_real);
@@ -395,8 +349,8 @@ void do_gettimeofday(struct timeval *tv)
395 tv->tv_sec = now.tv_sec; 349 tv->tv_sec = now.tv_sec;
396 tv->tv_usec = now.tv_nsec/1000; 350 tv->tv_usec = now.tv_nsec/1000;
397} 351}
398EXPORT_SYMBOL(do_gettimeofday);
399 352
353EXPORT_SYMBOL(do_gettimeofday);
400/** 354/**
401 * do_settimeofday - Sets the time of day 355 * do_settimeofday - Sets the time of day
402 * @tv: pointer to the timespec variable containing the new time 356 * @tv: pointer to the timespec variable containing the new time
@@ -405,36 +359,39 @@ EXPORT_SYMBOL(do_gettimeofday);
405 */ 359 */
406int do_settimeofday(const struct timespec *tv) 360int do_settimeofday(const struct timespec *tv)
407{ 361{
408 struct timekeeper *tk = &timekeeper; 362 struct timespec ts_delta;
409 struct timespec ts_delta, xt;
410 unsigned long flags; 363 unsigned long flags;
411 364
412 if (!timespec_valid_strict(tv)) 365 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
413 return -EINVAL; 366 return -EINVAL;
414 367
415 write_seqlock_irqsave(&tk->lock, flags); 368 write_seqlock_irqsave(&xtime_lock, flags);
416 369
417 timekeeping_forward_now(tk); 370 timekeeping_forward_now();
418 371
419 xt = tk_xtime(tk); 372 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 373 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 374 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
422 375
423 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); 376 xtime = *tv;
424 377
425 tk_set_xtime(tk, tv); 378 timekeeper.ntp_error = 0;
379 ntp_clear();
426 380
427 timekeeping_update(tk, true); 381 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
382 timekeeper.mult);
428 383
429 write_sequnlock_irqrestore(&tk->lock, flags); 384 write_sequnlock_irqrestore(&xtime_lock, flags);
430 385
431 /* signal hrtimers about time change */ 386 /* signal hrtimers about time change */
432 clock_was_set(); 387 clock_was_set();
433 388
434 return 0; 389 return 0;
435} 390}
391
436EXPORT_SYMBOL(do_settimeofday); 392EXPORT_SYMBOL(do_settimeofday);
437 393
394
438/** 395/**
439 * timekeeping_inject_offset - Adds or subtracts from the current time. 396 * timekeeping_inject_offset - Adds or subtracts from the current time.
440 * @tv: pointer to the timespec variable containing the offset 397 * @tv: pointer to the timespec variable containing the offset
@@ -443,37 +400,30 @@ EXPORT_SYMBOL(do_settimeofday);
443 */ 400 */
444int timekeeping_inject_offset(struct timespec *ts) 401int timekeeping_inject_offset(struct timespec *ts)
445{ 402{
446 struct timekeeper *tk = &timekeeper;
447 unsigned long flags; 403 unsigned long flags;
448 struct timespec tmp;
449 int ret = 0;
450 404
451 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 405 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
452 return -EINVAL; 406 return -EINVAL;
453 407
454 write_seqlock_irqsave(&tk->lock, flags); 408 write_seqlock_irqsave(&xtime_lock, flags);
455 409
456 timekeeping_forward_now(tk); 410 timekeeping_forward_now();
457 411
458 /* Make sure the proposed value is valid */ 412 xtime = timespec_add(xtime, *ts);
459 tmp = timespec_add(tk_xtime(tk), *ts); 413 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
460 if (!timespec_valid_strict(&tmp)) {
461 ret = -EINVAL;
462 goto error;
463 }
464 414
465 tk_xtime_add(tk, ts); 415 timekeeper.ntp_error = 0;
466 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 416 ntp_clear();
467 417
468error: /* even if we error out, we forwarded the time, so call update */ 418 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
469 timekeeping_update(tk, true); 419 timekeeper.mult);
470 420
471 write_sequnlock_irqrestore(&tk->lock, flags); 421 write_sequnlock_irqrestore(&xtime_lock, flags);
472 422
473 /* signal hrtimers about time change */ 423 /* signal hrtimers about time change */
474 clock_was_set(); 424 clock_was_set();
475 425
476 return ret; 426 return 0;
477} 427}
478EXPORT_SYMBOL(timekeeping_inject_offset); 428EXPORT_SYMBOL(timekeeping_inject_offset);
479 429
@@ -484,25 +434,17 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
484 */ 434 */
485static int change_clocksource(void *data) 435static int change_clocksource(void *data)
486{ 436{
487 struct timekeeper *tk = &timekeeper;
488 struct clocksource *new, *old; 437 struct clocksource *new, *old;
489 unsigned long flags;
490 438
491 new = (struct clocksource *) data; 439 new = (struct clocksource *) data;
492 440
493 write_seqlock_irqsave(&tk->lock, flags); 441 timekeeping_forward_now();
494
495 timekeeping_forward_now(tk);
496 if (!new->enable || new->enable(new) == 0) { 442 if (!new->enable || new->enable(new) == 0) {
497 old = tk->clock; 443 old = timekeeper.clock;
498 tk_setup_internals(tk, new); 444 timekeeper_setup_internals(new);
499 if (old->disable) 445 if (old->disable)
500 old->disable(old); 446 old->disable(old);
501 } 447 }
502 timekeeping_update(tk, true);
503
504 write_sequnlock_irqrestore(&tk->lock, flags);
505
506 return 0; 448 return 0;
507} 449}
508 450
@@ -515,9 +457,7 @@ static int change_clocksource(void *data)
515 */ 457 */
516void timekeeping_notify(struct clocksource *clock) 458void timekeeping_notify(struct clocksource *clock)
517{ 459{
518 struct timekeeper *tk = &timekeeper; 460 if (timekeeper.clock == clock)
519
520 if (tk->clock == clock)
521 return; 461 return;
522 stop_machine(change_clocksource, clock, NULL); 462 stop_machine(change_clocksource, clock, NULL);
523 tick_clock_notify(); 463 tick_clock_notify();
@@ -546,57 +486,48 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
546 */ 486 */
547void getrawmonotonic(struct timespec *ts) 487void getrawmonotonic(struct timespec *ts)
548{ 488{
549 struct timekeeper *tk = &timekeeper;
550 unsigned long seq; 489 unsigned long seq;
551 s64 nsecs; 490 s64 nsecs;
552 491
553 do { 492 do {
554 seq = read_seqbegin(&tk->lock); 493 seq = read_seqbegin(&xtime_lock);
555 nsecs = timekeeping_get_ns_raw(tk); 494 nsecs = timekeeping_get_ns_raw();
556 *ts = tk->raw_time; 495 *ts = raw_time;
557 496
558 } while (read_seqretry(&tk->lock, seq)); 497 } while (read_seqretry(&xtime_lock, seq));
559 498
560 timespec_add_ns(ts, nsecs); 499 timespec_add_ns(ts, nsecs);
561} 500}
562EXPORT_SYMBOL(getrawmonotonic); 501EXPORT_SYMBOL(getrawmonotonic);
563 502
503
564/** 504/**
565 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 505 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
566 */ 506 */
567int timekeeping_valid_for_hres(void) 507int timekeeping_valid_for_hres(void)
568{ 508{
569 struct timekeeper *tk = &timekeeper;
570 unsigned long seq; 509 unsigned long seq;
571 int ret; 510 int ret;
572 511
573 do { 512 do {
574 seq = read_seqbegin(&tk->lock); 513 seq = read_seqbegin(&xtime_lock);
575 514
576 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 515 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
577 516
578 } while (read_seqretry(&tk->lock, seq)); 517 } while (read_seqretry(&xtime_lock, seq));
579 518
580 return ret; 519 return ret;
581} 520}
582 521
583/** 522/**
584 * timekeeping_max_deferment - Returns max time the clocksource can be deferred 523 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
524 *
525 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
526 * ensure that the clocksource does not change!
585 */ 527 */
586u64 timekeeping_max_deferment(void) 528u64 timekeeping_max_deferment(void)
587{ 529{
588 struct timekeeper *tk = &timekeeper; 530 return timekeeper.clock->max_idle_ns;
589 unsigned long seq;
590 u64 ret;
591
592 do {
593 seq = read_seqbegin(&tk->lock);
594
595 ret = tk->clock->max_idle_ns;
596
597 } while (read_seqretry(&tk->lock, seq));
598
599 return ret;
600} 531}
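As the comment notes for the right-hand version, callers must sample this value under the xtime_lock read seqlock, which is exactly what tick_nohz_stop_sched_tick() does earlier in this patch. Condensed into a standalone sketch (demo_read_max_deferment is an illustrative name):

#include <linux/seqlock.h>
#include <linux/time.h>

/* Sketch of a reader honouring the locking comment above. */
static u64 demo_read_max_deferment(void)
{
	unsigned long seq;
	u64 max_defer_ns;

	do {
		seq = read_seqbegin(&xtime_lock);
		max_defer_ns = timekeeping_max_deferment();
	} while (read_seqretry(&xtime_lock, seq));

	return max_defer_ns;
}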
601 532
602/** 533/**
@@ -634,51 +565,35 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
634 */ 565 */
635void __init timekeeping_init(void) 566void __init timekeeping_init(void)
636{ 567{
637 struct timekeeper *tk = &timekeeper;
638 struct clocksource *clock; 568 struct clocksource *clock;
639 unsigned long flags; 569 unsigned long flags;
640 struct timespec now, boot, tmp; 570 struct timespec now, boot;
641 571
642 read_persistent_clock(&now); 572 read_persistent_clock(&now);
643 if (!timespec_valid_strict(&now)) {
644 pr_warn("WARNING: Persistent clock returned invalid value!\n"
645 " Check your CMOS/BIOS settings.\n");
646 now.tv_sec = 0;
647 now.tv_nsec = 0;
648 }
649
650 read_boot_clock(&boot); 573 read_boot_clock(&boot);
651 if (!timespec_valid_strict(&boot)) {
652 pr_warn("WARNING: Boot clock returned invalid value!\n"
653 " Check your CMOS/BIOS settings.\n");
654 boot.tv_sec = 0;
655 boot.tv_nsec = 0;
656 }
657 574
658 seqlock_init(&tk->lock); 575 write_seqlock_irqsave(&xtime_lock, flags);
659 576
660 ntp_init(); 577 ntp_init();
661 578
662 write_seqlock_irqsave(&tk->lock, flags);
663 clock = clocksource_default_clock(); 579 clock = clocksource_default_clock();
664 if (clock->enable) 580 if (clock->enable)
665 clock->enable(clock); 581 clock->enable(clock);
666 tk_setup_internals(tk, clock); 582 timekeeper_setup_internals(clock);
667 583
668 tk_set_xtime(tk, &now); 584 xtime.tv_sec = now.tv_sec;
669 tk->raw_time.tv_sec = 0; 585 xtime.tv_nsec = now.tv_nsec;
670 tk->raw_time.tv_nsec = 0; 586 raw_time.tv_sec = 0;
671 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 587 raw_time.tv_nsec = 0;
672 boot = tk_xtime(tk); 588 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
673 589 boot.tv_sec = xtime.tv_sec;
674 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); 590 boot.tv_nsec = xtime.tv_nsec;
675 tk_set_wall_to_mono(tk, tmp); 591 }
676 592 set_normalized_timespec(&wall_to_monotonic,
677 tmp.tv_sec = 0; 593 -boot.tv_sec, -boot.tv_nsec);
678 tmp.tv_nsec = 0; 594 total_sleep_time.tv_sec = 0;
679 tk_set_sleep_time(tk, tmp); 595 total_sleep_time.tv_nsec = 0;
680 596 write_sequnlock_irqrestore(&xtime_lock, flags);
681 write_sequnlock_irqrestore(&tk->lock, flags);
682} 597}
683 598
684/* time in seconds when suspend began */ 599/* time in seconds when suspend began */
@@ -691,19 +606,20 @@ static struct timespec timekeeping_suspend_time;
691 * Takes a timespec offset measuring a suspend interval and properly 606 * Takes a timespec offset measuring a suspend interval and properly
692 * adds the sleep offset to the timekeeping variables. 607 * adds the sleep offset to the timekeeping variables.
693 */ 608 */
694static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 609static void __timekeeping_inject_sleeptime(struct timespec *delta)
695 struct timespec *delta)
696{ 610{
697 if (!timespec_valid_strict(delta)) { 611 if (!timespec_valid(delta)) {
698 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 612 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
699 "sleep delta value!\n"); 613 "sleep delta value!\n");
700 return; 614 return;
701 } 615 }
702 tk_xtime_add(tk, delta); 616
703 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 617 xtime = timespec_add(xtime, *delta);
704 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
619 total_sleep_time = timespec_add(total_sleep_time, *delta);
705} 620}
706 621
622
707/** 623/**
708 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values 624 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
709 * @delta: pointer to a timespec delta value 625 * @delta: pointer to a timespec delta value
@@ -716,7 +632,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
716 */ 632 */
717void timekeeping_inject_sleeptime(struct timespec *delta) 633void timekeeping_inject_sleeptime(struct timespec *delta)
718{ 634{
719 struct timekeeper *tk = &timekeeper;
720 unsigned long flags; 635 unsigned long flags;
721 struct timespec ts; 636 struct timespec ts;
722 637
@@ -725,20 +640,23 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
725 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 640 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
726 return; 641 return;
727 642
728 write_seqlock_irqsave(&tk->lock, flags); 643 write_seqlock_irqsave(&xtime_lock, flags);
729 644 timekeeping_forward_now();
730 timekeeping_forward_now(tk);
731 645
732 __timekeeping_inject_sleeptime(tk, delta); 646 __timekeeping_inject_sleeptime(delta);
733 647
734 timekeeping_update(tk, true); 648 timekeeper.ntp_error = 0;
649 ntp_clear();
650 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
651 timekeeper.mult);
735 652
736 write_sequnlock_irqrestore(&tk->lock, flags); 653 write_sequnlock_irqrestore(&xtime_lock, flags);
737 654
738 /* signal hrtimers about time change */ 655 /* signal hrtimers about time change */
739 clock_was_set(); 656 clock_was_set();
740} 657}
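Roughly what the usual caller, the RTC resume path, does with this function: compute how long the system was suspended and credit it to timekeeping. The sketch below is a simplification; demo_rtc_resume and the variable names are illustrative.

#include <linux/time.h>

static void demo_rtc_resume(struct timespec old_rtc, struct timespec new_rtc)
{
	struct timespec sleep_time = timespec_sub(new_rtc, old_rtc);

	/* Only credit a positive, sane sleep interval. */
	if (sleep_time.tv_sec > 0 ||
	    (sleep_time.tv_sec == 0 && sleep_time.tv_nsec > 0))
		timekeeping_inject_sleeptime(&sleep_time);
}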
741 658
659
742/** 660/**
743 * timekeeping_resume - Resumes the generic timekeeping subsystem. 661 * timekeeping_resume - Resumes the generic timekeeping subsystem.
744 * 662 *
@@ -748,27 +666,24 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
748 */ 666 */
749static void timekeeping_resume(void) 667static void timekeeping_resume(void)
750{ 668{
751 struct timekeeper *tk = &timekeeper;
752 unsigned long flags; 669 unsigned long flags;
753 struct timespec ts; 670 struct timespec ts;
754 671
755 read_persistent_clock(&ts); 672 read_persistent_clock(&ts);
756 673
757 clockevents_resume();
758 clocksource_resume(); 674 clocksource_resume();
759 675
760 write_seqlock_irqsave(&tk->lock, flags); 676 write_seqlock_irqsave(&xtime_lock, flags);
761 677
762 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 678 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
763 ts = timespec_sub(ts, timekeeping_suspend_time); 679 ts = timespec_sub(ts, timekeeping_suspend_time);
764 __timekeeping_inject_sleeptime(tk, &ts); 680 __timekeeping_inject_sleeptime(&ts);
765 } 681 }
766 /* re-base the last cycle value */ 682 /* re-base the last cycle value */
767 tk->clock->cycle_last = tk->clock->read(tk->clock); 683 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
768 tk->ntp_error = 0; 684 timekeeper.ntp_error = 0;
769 timekeeping_suspended = 0; 685 timekeeping_suspended = 0;
770 timekeeping_update(tk, false); 686 write_sequnlock_irqrestore(&xtime_lock, flags);
771 write_sequnlock_irqrestore(&tk->lock, flags);
772 687
773 touch_softlockup_watchdog(); 688 touch_softlockup_watchdog();
774 689
@@ -780,15 +695,14 @@ static void timekeeping_resume(void)
780 695
781static int timekeeping_suspend(void) 696static int timekeeping_suspend(void)
782{ 697{
783 struct timekeeper *tk = &timekeeper;
784 unsigned long flags; 698 unsigned long flags;
785 struct timespec delta, delta_delta; 699 struct timespec delta, delta_delta;
786 static struct timespec old_delta; 700 static struct timespec old_delta;
787 701
788 read_persistent_clock(&timekeeping_suspend_time); 702 read_persistent_clock(&timekeeping_suspend_time);
789 703
790 write_seqlock_irqsave(&tk->lock, flags); 704 write_seqlock_irqsave(&xtime_lock, flags);
791 timekeeping_forward_now(tk); 705 timekeeping_forward_now();
792 timekeeping_suspended = 1; 706 timekeeping_suspended = 1;
793 707
794 /* 708 /*
@@ -797,7 +711,7 @@ static int timekeeping_suspend(void)
797 * try to compensate so the difference in system time 711 * try to compensate so the difference in system time
798 * and persistent_clock time stays close to constant. 712 * and persistent_clock time stays close to constant.
799 */ 713 */
800 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); 714 delta = timespec_sub(xtime, timekeeping_suspend_time);
801 delta_delta = timespec_sub(delta, old_delta); 715 delta_delta = timespec_sub(delta, old_delta);
802 if (abs(delta_delta.tv_sec) >= 2) { 716 if (abs(delta_delta.tv_sec) >= 2) {
803 /* 717 /*
@@ -810,11 +724,10 @@ static int timekeeping_suspend(void)
810 timekeeping_suspend_time = 724 timekeeping_suspend_time =
811 timespec_add(timekeeping_suspend_time, delta_delta); 725 timespec_add(timekeeping_suspend_time, delta_delta);
812 } 726 }
813 write_sequnlock_irqrestore(&tk->lock, flags); 727 write_sequnlock_irqrestore(&xtime_lock, flags);
814 728
815 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
816 clocksource_suspend(); 730 clocksource_suspend();
817 clockevents_suspend();
818 731
819 return 0; 732 return 0;
820} 733}
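These two handlers are wired up through the syscore mechanism, as the next hunk header (device_initcall(timekeeping_init_ops)) indicates. A sketch of that registration pattern; the demo_ prefix is only to avoid clashing with the real definitions.

#include <linux/init.h>
#include <linux/syscore_ops.h>

static struct syscore_ops demo_timekeeping_syscore_ops = {
	.resume		= timekeeping_resume,
	.suspend	= timekeeping_suspend,
};

static int __init demo_timekeeping_init_ops(void)
{
	register_syscore_ops(&demo_timekeeping_syscore_ops);
	return 0;
}
device_initcall(demo_timekeeping_init_ops);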
@@ -837,8 +750,7 @@ device_initcall(timekeeping_init_ops);
837 * If the error is already larger, we look ahead even further 750 * If the error is already larger, we look ahead even further
838 * to compensate for late or lost adjustments. 751 * to compensate for late or lost adjustments.
839 */ 752 */
840static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, 753static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
841 s64 error, s64 *interval,
842 s64 *offset) 754 s64 *offset)
843{ 755{
844 s64 tick_error, i; 756 s64 tick_error, i;
@@ -854,7 +766,7 @@ static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
854 * here. This is tuned so that an error of about 1 msec is adjusted 766 * here. This is tuned so that an error of about 1 msec is adjusted
855 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 767 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
856 */ 768 */
857 error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 769 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
858 error2 = abs(error2); 770 error2 = abs(error2);
859 for (look_ahead = 0; error2 > 0; look_ahead++) 771 for (look_ahead = 0; error2 > 0; look_ahead++)
860 error2 >>= 2; 772 error2 >>= 2;
@@ -863,8 +775,8 @@ static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
863 * Now calculate the error in (1 << look_ahead) ticks, but first 775 * Now calculate the error in (1 << look_ahead) ticks, but first
864 * remove the single look ahead already included in the error. 776 * remove the single look ahead already included in the error.
865 */ 777 */
866 tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); 778 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
867 tick_error -= tk->xtime_interval >> 1; 779 tick_error -= timekeeper.xtime_interval >> 1;
868 error = ((error - tick_error) >> look_ahead) + tick_error; 780 error = ((error - tick_error) >> look_ahead) + tick_error;
869 781
870 /* Finally calculate the adjustment shift value. */ 782 /* Finally calculate the adjustment shift value. */
@@ -889,181 +801,43 @@ static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
889 * this is optimized for the most common adjustments of -1,0,1, 801 * this is optimized for the most common adjustments of -1,0,1,
890 * for other values we can do a bit more work. 802 * for other values we can do a bit more work.
891 */ 803 */
892static void timekeeping_adjust(struct timekeeper *tk, s64 offset) 804static void timekeeping_adjust(s64 offset)
893{ 805{
894 s64 error, interval = tk->cycle_interval; 806 s64 error, interval = timekeeper.cycle_interval;
895 int adj; 807 int adj;
896 808
897 /* 809 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
898 * The point of this is to check if the error is greater than half
899 * an interval.
900 *
901 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
902 *
903 * Note we subtract one in the shift, so that error is really error*2.
904 * This "saves" dividing(shifting) interval twice, but keeps the
905 * (error > interval) comparison as still measuring if error is
906 * larger than half an interval.
907 *
908 * Note: It does not "save" on aggravation when reading the code.
909 */
910 error = tk->ntp_error >> (tk->ntp_error_shift - 1);
911 if (error > interval) { 810 if (error > interval) {
912 /*
913 * We now divide error by 4(via shift), which checks if
914 * the error is greater than twice the interval.
915 * If it is greater, we need a bigadjust, if its smaller,
916 * we can adjust by 1.
917 */
918 error >>= 2; 811 error >>= 2;
919 /*
920 * XXX - In update_wall_time, we round up to the next
921 * nanosecond, and store the amount rounded up into
922 * the error. This causes the likely below to be unlikely.
923 *
924 * The proper fix is to avoid rounding up by using
925 * the high precision tk->xtime_nsec instead of
926 * xtime.tv_nsec everywhere. Fixing this will take some
927 * time.
928 */
929 if (likely(error <= interval)) 812 if (likely(error <= interval))
930 adj = 1; 813 adj = 1;
931 else 814 else
932 adj = timekeeping_bigadjust(tk, error, &interval, &offset); 815 adj = timekeeping_bigadjust(error, &interval, &offset);
933 } else { 816 } else if (error < -interval) {
934 if (error < -interval) { 817 error >>= 2;
935 /* See comment above, this is just switched for the negative */ 818 if (likely(error >= -interval)) {
936 error >>= 2; 819 adj = -1;
937 if (likely(error >= -interval)) { 820 interval = -interval;
938 adj = -1; 821 offset = -offset;
939 interval = -interval; 822 } else
940 offset = -offset; 823 adj = timekeeping_bigadjust(error, &interval, &offset);
941 } else { 824 } else
942 adj = timekeeping_bigadjust(tk, error, &interval, &offset); 825 return;
943 }
944 } else {
945 goto out_adjust;
946 }
947 }
948
949 if (unlikely(tk->clock->maxadj &&
950 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
951 printk_once(KERN_WARNING
952 "Adjusting %s more than 11%% (%ld vs %ld)\n",
953 tk->clock->name, (long)tk->mult + adj,
954 (long)tk->clock->mult + tk->clock->maxadj);
955 }
956 /*
957 * So the following can be confusing.
958 *
959 * To keep things simple, lets assume adj == 1 for now.
960 *
961 * When adj != 1, remember that the interval and offset values
962 * have been appropriately scaled so the math is the same.
963 *
964 * The basic idea here is that we're increasing the multiplier
965 * by one, this causes the xtime_interval to be incremented by
966 * one cycle_interval. This is because:
967 * xtime_interval = cycle_interval * mult
968 * So if mult is being incremented by one:
969 * xtime_interval = cycle_interval * (mult + 1)
 970 * It's the same as:
971 * xtime_interval = (cycle_interval * mult) + cycle_interval
972 * Which can be shortened to:
973 * xtime_interval += cycle_interval
974 *
975 * So offset stores the non-accumulated cycles. Thus the current
976 * time (in shifted nanoseconds) is:
977 * now = (offset * adj) + xtime_nsec
978 * Now, even though we're adjusting the clock frequency, we have
979 * to keep time consistent. In other words, we can't jump back
980 * in time, and we also want to avoid jumping forward in time.
981 *
982 * So given the same offset value, we need the time to be the same
983 * both before and after the freq adjustment.
984 * now = (offset * adj_1) + xtime_nsec_1
985 * now = (offset * adj_2) + xtime_nsec_2
986 * So:
987 * (offset * adj_1) + xtime_nsec_1 =
988 * (offset * adj_2) + xtime_nsec_2
989 * And we know:
990 * adj_2 = adj_1 + 1
991 * So:
992 * (offset * adj_1) + xtime_nsec_1 =
993 * (offset * (adj_1+1)) + xtime_nsec_2
994 * (offset * adj_1) + xtime_nsec_1 =
995 * (offset * adj_1) + offset + xtime_nsec_2
996 * Canceling the sides:
997 * xtime_nsec_1 = offset + xtime_nsec_2
998 * Which gives us:
999 * xtime_nsec_2 = xtime_nsec_1 - offset
1000 * Which simplifies to:
1001 * xtime_nsec -= offset
1002 *
1003 * XXX - TODO: Doc ntp_error calculation.
1004 */
1005 tk->mult += adj;
1006 tk->xtime_interval += interval;
1007 tk->xtime_nsec -= offset;
1008 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
1009
1010out_adjust:
1011 /*
1012 * It may be possible that when we entered this function, xtime_nsec
1013 * was very small. Further, if we're slightly speeding the clocksource
1014 * in the code above, its possible the required corrective factor to
1015 * xtime_nsec could cause it to underflow.
1016 *
1017 * Now, since we already accumulated the second, we cannot simply roll
1018 * the accumulated second back, since the NTP subsystem has been
1019 * notified via second_overflow. So instead we push xtime_nsec forward
1020 * by the amount we underflowed, and add that amount into the error.
1021 *
1022 * We'll correct this error next time through this function, when
1023 * xtime_nsec is not as small.
1024 */
1025 if (unlikely((s64)tk->xtime_nsec < 0)) {
1026 s64 neg = -(s64)tk->xtime_nsec;
1027 tk->xtime_nsec = 0;
1028 tk->ntp_error += neg << tk->ntp_error_shift;
1029 }
1030 826
827 WARN_ONCE(timekeeper.clock->maxadj &&
828 (timekeeper.mult + adj > timekeeper.clock->mult +
829 timekeeper.clock->maxadj),
 830 "Adjusting %s more than 11%% (%ld vs %ld)\n",
831 timekeeper.clock->name, (long)timekeeper.mult + adj,
832 (long)timekeeper.clock->mult +
833 timekeeper.clock->maxadj);
834 timekeeper.mult += adj;
835 timekeeper.xtime_interval += interval;
836 timekeeper.xtime_nsec -= offset;
837 timekeeper.ntp_error -= (interval - offset) <<
838 timekeeper.ntp_error_shift;
1031} 839}
1032 840
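The long comment removed from timekeeping_adjust() above derives why a one-step change of mult has to be paired with xtime_nsec -= offset so that "now = offset * mult + xtime_nsec" does not jump. A minimal standalone C sketch of that invariant (made-up numbers, not kernel code; the names only mirror the timekeeper fields for readability):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t offset = 1000;         /* unaccumulated cycles (invented)        */
    uint64_t mult = 5;              /* clocksource multiplier (invented)      */
    uint64_t xtime_nsec = 123456;   /* shifted nanoseconds accumulated so far */

    uint64_t now_before = offset * mult + xtime_nsec;

    /* Speed the clock up by one step: mult += 1 adds one extra "offset"
     * worth of shifted nanoseconds, so xtime_nsec must hand it back. */
    mult += 1;
    xtime_nsec -= offset;

    uint64_t now_after = offset * mult + xtime_nsec;

    printf("now before=%llu after=%llu\n",
           (unsigned long long)now_before, (unsigned long long)now_after);
    return 0;
}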
1033/**
1034 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1035 *
1036 * Helper function that accumulates the nsecs greater than a second
1037 * from the xtime_nsec field to the xtime_secs field.
1038 * It also calls into the NTP code to handle leapsecond processing.
1039 *
1040 */
1041static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1042{
1043 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1044
1045 while (tk->xtime_nsec >= nsecps) {
1046 int leap;
1047
1048 tk->xtime_nsec -= nsecps;
1049 tk->xtime_sec++;
1050
1051 /* Figure out if its a leap sec and apply if needed */
1052 leap = second_overflow(tk->xtime_sec);
1053 if (unlikely(leap)) {
1054 struct timespec ts;
1055
1056 tk->xtime_sec += leap;
1057
1058 ts.tv_sec = leap;
1059 ts.tv_nsec = 0;
1060 tk_set_wall_to_mono(tk,
1061 timespec_sub(tk->wall_to_monotonic, ts));
1062
1063 clock_was_set_delayed();
1064 }
1065 }
1066}
1067 841
1068/** 842/**
1069 * logarithmic_accumulation - shifted accumulation of cycles 843 * logarithmic_accumulation - shifted accumulation of cycles
@@ -1074,136 +848,137 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1074 * 848 *
1075 * Returns the unconsumed cycles. 849 * Returns the unconsumed cycles.
1076 */ 850 */
1077static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 851static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
1078 u32 shift)
1079{ 852{
853 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
1080 u64 raw_nsecs; 854 u64 raw_nsecs;
1081 855
1082 /* If the offset is smaller than a shifted interval, do nothing */ 856 /* If the offset is smaller than a shifted interval, do nothing */
1083 if (offset < tk->cycle_interval<<shift) 857 if (offset < timekeeper.cycle_interval<<shift)
1084 return offset; 858 return offset;
1085 859
1086 /* Accumulate one shifted interval */ 860 /* Accumulate one shifted interval */
1087 offset -= tk->cycle_interval << shift; 861 offset -= timekeeper.cycle_interval << shift;
1088 tk->clock->cycle_last += tk->cycle_interval << shift; 862 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
1089 863
1090 tk->xtime_nsec += tk->xtime_interval << shift; 864 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
1091 accumulate_nsecs_to_secs(tk); 865 while (timekeeper.xtime_nsec >= nsecps) {
866 timekeeper.xtime_nsec -= nsecps;
867 xtime.tv_sec++;
868 second_overflow();
869 }
1092 870
1093 /* Accumulate raw time */ 871 /* Accumulate raw time */
1094 raw_nsecs = (u64)tk->raw_interval << shift; 872 raw_nsecs = timekeeper.raw_interval << shift;
1095 raw_nsecs += tk->raw_time.tv_nsec; 873 raw_nsecs += raw_time.tv_nsec;
1096 if (raw_nsecs >= NSEC_PER_SEC) { 874 if (raw_nsecs >= NSEC_PER_SEC) {
1097 u64 raw_secs = raw_nsecs; 875 u64 raw_secs = raw_nsecs;
1098 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 876 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
1099 tk->raw_time.tv_sec += raw_secs; 877 raw_time.tv_sec += raw_secs;
1100 } 878 }
1101 tk->raw_time.tv_nsec = raw_nsecs; 879 raw_time.tv_nsec = raw_nsecs;
1102 880
1103 /* Accumulate error between NTP and clock interval */ 881 /* Accumulate error between NTP and clock interval */
1104 tk->ntp_error += ntp_tick_length() << shift; 882 timekeeper.ntp_error += tick_length << shift;
1105 tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << 883 timekeeper.ntp_error -=
1106 (tk->ntp_error_shift + shift); 884 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
885 (timekeeper.ntp_error_shift + shift);
1107 886
1108 return offset; 887 return offset;
1109} 888}
1110 889
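logarithmic_accumulation() and its caller consume a large backlog of cycles in power-of-two multiples of cycle_interval, halving the chunk size as the backlog shrinks. A compressed userspace sketch of that control flow, with invented numbers (not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t cycle_interval = 100;
    uint64_t offset = 13 * cycle_interval + 7;   /* pending cycles           */
    int shift = 3;                               /* start with 2^3 intervals */

    while (offset >= cycle_interval) {
        if (offset >= (cycle_interval << shift)) {
            offset -= cycle_interval << shift;   /* accumulate one big chunk */
            printf("accumulated %llu cycles, %llu left\n",
                   (unsigned long long)(cycle_interval << shift),
                   (unsigned long long)offset);
        } else {
            shift--;                             /* chunk too big, halve it  */
        }
    }
    printf("leftover (less than one interval): %llu\n",
           (unsigned long long)offset);
    return 0;
}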
1111#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1112static inline void old_vsyscall_fixup(struct timekeeper *tk)
1113{
1114 s64 remainder;
1115
1116 /*
1117 * Store only full nanoseconds into xtime_nsec after rounding
1118 * it up and add the remainder to the error difference.
1119 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1120 * by truncating the remainder in vsyscalls. However, it causes
1121 * additional work to be done in timekeeping_adjust(). Once
1122 * the vsyscall implementations are converted to use xtime_nsec
1123 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1124 * users are removed, this can be killed.
1125 */
1126 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1127 tk->xtime_nsec -= remainder;
1128 tk->xtime_nsec += 1ULL << tk->shift;
1129 tk->ntp_error += remainder << tk->ntp_error_shift;
1130
1131}
1132#else
1133#define old_vsyscall_fixup(tk)
1134#endif
1135
1136
1137 890
1138/** 891/**
1139 * update_wall_time - Uses the current clocksource to increment the wall time 892 * update_wall_time - Uses the current clocksource to increment the wall time
1140 * 893 *
894 * Called from the timer interrupt, must hold a write on xtime_lock.
1141 */ 895 */
1142static void update_wall_time(void) 896static void update_wall_time(void)
1143{ 897{
1144 struct clocksource *clock; 898 struct clocksource *clock;
1145 struct timekeeper *tk = &timekeeper;
1146 cycle_t offset; 899 cycle_t offset;
1147 int shift = 0, maxshift; 900 int shift = 0, maxshift;
1148 unsigned long flags;
1149
1150 write_seqlock_irqsave(&tk->lock, flags);
1151 901
1152 /* Make sure we're fully resumed: */ 902 /* Make sure we're fully resumed: */
1153 if (unlikely(timekeeping_suspended)) 903 if (unlikely(timekeeping_suspended))
1154 goto out; 904 return;
1155 905
1156 clock = tk->clock; 906 clock = timekeeper.clock;
1157 907
1158#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 908#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1159 offset = tk->cycle_interval; 909 offset = timekeeper.cycle_interval;
1160#else 910#else
1161 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 911 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1162#endif 912#endif
1163 913 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
1164 /* Check if there's really nothing to do */
1165 if (offset < tk->cycle_interval)
1166 goto out;
1167 914
1168 /* 915 /*
1169 * With NO_HZ we may have to accumulate many cycle_intervals 916 * With NO_HZ we may have to accumulate many cycle_intervals
1170 * (think "ticks") worth of time at once. To do this efficiently, 917 * (think "ticks") worth of time at once. To do this efficiently,
1171 * we calculate the largest doubling multiple of cycle_intervals 918 * we calculate the largest doubling multiple of cycle_intervals
1172 * that is smaller than the offset. We then accumulate that 919 * that is smaller than the offset. We then accumulate that
1173 * chunk in one go, and then try to consume the next smaller 920 * chunk in one go, and then try to consume the next smaller
1174 * doubled multiple. 921 * doubled multiple.
1175 */ 922 */
1176 shift = ilog2(offset) - ilog2(tk->cycle_interval); 923 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1177 shift = max(0, shift); 924 shift = max(0, shift);
1178 /* Bound shift to one less than what overflows tick_length */ 925 /* Bound shift to one less than what overflows tick_length */
1179 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 926 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
1180 shift = min(shift, maxshift); 927 shift = min(shift, maxshift);
1181 while (offset >= tk->cycle_interval) { 928 while (offset >= timekeeper.cycle_interval) {
1182 offset = logarithmic_accumulation(tk, offset, shift); 929 offset = logarithmic_accumulation(offset, shift);
1183 if (offset < tk->cycle_interval<<shift) 930 if (offset < timekeeper.cycle_interval<<shift)
1184 shift--; 931 shift--;
1185 } 932 }
1186 933
1187 /* correct the clock when NTP error is too big */ 934 /* correct the clock when NTP error is too big */
1188 timekeeping_adjust(tk, offset); 935 timekeeping_adjust(offset);
1189 936
1190 /* 937 /*
1191 * XXX This can be killed once everyone converts 938 * Since in the loop above, we accumulate any amount of time
1192 * to the new update_vsyscall. 939 * in xtime_nsec over a second into xtime.tv_sec, its possible for
940 * xtime_nsec to be fairly small after the loop. Further, if we're
941 * slightly speeding the clocksource up in timekeeping_adjust(),
942 * its possible the required corrective factor to xtime_nsec could
943 * cause it to underflow.
944 *
945 * Now, we cannot simply roll the accumulated second back, since
946 * the NTP subsystem has been notified via second_overflow. So
947 * instead we push xtime_nsec forward by the amount we underflowed,
948 * and add that amount into the error.
949 *
950 * We'll correct this error next time through this function, when
951 * xtime_nsec is not as small.
1193 */ 952 */
1194 old_vsyscall_fixup(tk); 953 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
954 s64 neg = -(s64)timekeeper.xtime_nsec;
955 timekeeper.xtime_nsec = 0;
956 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
957 }
958
1195 959
1196 /* 960 /*
1197 * Finally, make sure that after the rounding 961 * Store full nanoseconds into xtime after rounding it up and
1198 * xtime_nsec isn't larger than NSEC_PER_SEC 962 * add the remainder to the error difference.
1199 */ 963 */
1200 accumulate_nsecs_to_secs(tk); 964 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
965 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
966 timekeeper.ntp_error += timekeeper.xtime_nsec <<
967 timekeeper.ntp_error_shift;
1201 968
1202 timekeeping_update(tk, false); 969 /*
1203 970 * Finally, make sure that after the rounding
1204out: 971 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1205 write_sequnlock_irqrestore(&tk->lock, flags); 972 */
973 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
974 xtime.tv_nsec -= NSEC_PER_SEC;
975 xtime.tv_sec++;
976 second_overflow();
977 }
1206 978
979 /* check to see if there is a new clocksource to use */
980 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
981 timekeeper.mult);
1207} 982}
1208 983
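The maxshift bound in update_wall_time() above caps the accumulation shift so that tick_length << shift cannot overflow. A hedged standalone sketch of the same arithmetic, assuming a 64-bit tick_length and an invented value:

#include <stdio.h>
#include <stdint.h>

static int ilog2_u64(uint64_t v)        /* index of the highest set bit */
{
    int r = -1;
    while (v) {
        v >>= 1;
        r++;
    }
    return r;
}

int main(void)
{
    uint64_t tick_length = 10000000ULL << 32;   /* NTP-scaled tick, made up */
    int maxshift = (64 - (ilog2_u64(tick_length) + 1)) - 1;

    printf("ilog2(tick_length) = %d, maxshift = %d\n",
           ilog2_u64(tick_length), maxshift);
    /* Shifting by maxshift still leaves headroom below bit 63 ... */
    printf("shifted = %llu\n", (unsigned long long)(tick_length << maxshift));
    /* ... while one more shift would already reach bit 63, the sign bit
     * of the signed 64-bit values the error accounting works with. */
    return 0;
}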
1209/** 984/**
@@ -1219,18 +994,16 @@ out:
1219 */ 994 */
1220void getboottime(struct timespec *ts) 995void getboottime(struct timespec *ts)
1221{ 996{
1222 struct timekeeper *tk = &timekeeper;
1223 struct timespec boottime = { 997 struct timespec boottime = {
1224 .tv_sec = tk->wall_to_monotonic.tv_sec + 998 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
1225 tk->total_sleep_time.tv_sec, 999 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
1226 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1227 tk->total_sleep_time.tv_nsec
1228 }; 1000 };
1229 1001
1230 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1002 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1231} 1003}
1232EXPORT_SYMBOL_GPL(getboottime); 1004EXPORT_SYMBOL_GPL(getboottime);
1233 1005
1006
1234/** 1007/**
1235 * get_monotonic_boottime - Returns monotonic time since boot 1008 * get_monotonic_boottime - Returns monotonic time since boot
1236 * @ts: pointer to the timespec to be set 1009 * @ts: pointer to the timespec to be set
@@ -1242,25 +1015,23 @@ EXPORT_SYMBOL_GPL(getboottime);
1242 */ 1015 */
1243void get_monotonic_boottime(struct timespec *ts) 1016void get_monotonic_boottime(struct timespec *ts)
1244{ 1017{
1245 struct timekeeper *tk = &timekeeper;
1246 struct timespec tomono, sleep; 1018 struct timespec tomono, sleep;
1247 s64 nsec;
1248 unsigned int seq; 1019 unsigned int seq;
1020 s64 nsecs;
1249 1021
1250 WARN_ON(timekeeping_suspended); 1022 WARN_ON(timekeeping_suspended);
1251 1023
1252 do { 1024 do {
1253 seq = read_seqbegin(&tk->lock); 1025 seq = read_seqbegin(&xtime_lock);
1254 ts->tv_sec = tk->xtime_sec; 1026 *ts = xtime;
1255 nsec = timekeeping_get_ns(tk); 1027 tomono = wall_to_monotonic;
1256 tomono = tk->wall_to_monotonic; 1028 sleep = total_sleep_time;
1257 sleep = tk->total_sleep_time; 1029 nsecs = timekeeping_get_ns();
1258 1030
1259 } while (read_seqretry(&tk->lock, seq)); 1031 } while (read_seqretry(&xtime_lock, seq));
1260 1032
1261 ts->tv_sec += tomono.tv_sec + sleep.tv_sec; 1033 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1262 ts->tv_nsec = 0; 1034 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
1263 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1264} 1035}
1265EXPORT_SYMBOL_GPL(get_monotonic_boottime); 1036EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1266 1037
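The readers above all follow the same read_seqbegin()/read_seqretry() pattern: sample the sequence count, copy the protected values, and retry if a writer intervened. A single-threaded userspace mock of that control flow (not the kernel seqlock implementation; the _mock suffix marks everything here as illustrative):

#include <stdio.h>

static unsigned seq;                 /* even = no writer active  */
static long xtime_sec_mock = 100;    /* protected data (made up) */

static unsigned read_seqbegin_mock(void) { return seq & ~1u; }
static int read_seqretry_mock(unsigned start) { return seq != start; }

static void writer_mock(void)        /* what a concurrent writer would do */
{
    seq++;                           /* odd: update in progress */
    xtime_sec_mock++;
    seq++;                           /* even again: update done */
}

int main(void)
{
    long snapshot;
    unsigned start;
    int tries = 0;

    do {
        start = read_seqbegin_mock();
        snapshot = xtime_sec_mock;
        if (tries++ == 0)
            writer_mock();           /* simulate a race on the first pass */
    } while (read_seqretry_mock(start));

    printf("consistent snapshot: %ld after %d tries\n", snapshot, tries);
    return 0;
}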
@@ -1287,38 +1058,31 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1287 */ 1058 */
1288void monotonic_to_bootbased(struct timespec *ts) 1059void monotonic_to_bootbased(struct timespec *ts)
1289{ 1060{
1290 struct timekeeper *tk = &timekeeper; 1061 *ts = timespec_add(*ts, total_sleep_time);
1291
1292 *ts = timespec_add(*ts, tk->total_sleep_time);
1293} 1062}
1294EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1063EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1295 1064
1296unsigned long get_seconds(void) 1065unsigned long get_seconds(void)
1297{ 1066{
1298 struct timekeeper *tk = &timekeeper; 1067 return xtime.tv_sec;
1299
1300 return tk->xtime_sec;
1301} 1068}
1302EXPORT_SYMBOL(get_seconds); 1069EXPORT_SYMBOL(get_seconds);
1303 1070
1304struct timespec __current_kernel_time(void) 1071struct timespec __current_kernel_time(void)
1305{ 1072{
1306 struct timekeeper *tk = &timekeeper; 1073 return xtime;
1307
1308 return tk_xtime(tk);
1309} 1074}
1310 1075
1311struct timespec current_kernel_time(void) 1076struct timespec current_kernel_time(void)
1312{ 1077{
1313 struct timekeeper *tk = &timekeeper;
1314 struct timespec now; 1078 struct timespec now;
1315 unsigned long seq; 1079 unsigned long seq;
1316 1080
1317 do { 1081 do {
1318 seq = read_seqbegin(&tk->lock); 1082 seq = read_seqbegin(&xtime_lock);
1319 1083
1320 now = tk_xtime(tk); 1084 now = xtime;
1321 } while (read_seqretry(&tk->lock, seq)); 1085 } while (read_seqretry(&xtime_lock, seq));
1322 1086
1323 return now; 1087 return now;
1324} 1088}
@@ -1326,16 +1090,15 @@ EXPORT_SYMBOL(current_kernel_time);
1326 1090
1327struct timespec get_monotonic_coarse(void) 1091struct timespec get_monotonic_coarse(void)
1328{ 1092{
1329 struct timekeeper *tk = &timekeeper;
1330 struct timespec now, mono; 1093 struct timespec now, mono;
1331 unsigned long seq; 1094 unsigned long seq;
1332 1095
1333 do { 1096 do {
1334 seq = read_seqbegin(&tk->lock); 1097 seq = read_seqbegin(&xtime_lock);
1335 1098
1336 now = tk_xtime(tk); 1099 now = xtime;
1337 mono = tk->wall_to_monotonic; 1100 mono = wall_to_monotonic;
1338 } while (read_seqretry(&tk->lock, seq)); 1101 } while (read_seqretry(&xtime_lock, seq));
1339 1102
1340 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1103 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1341 now.tv_nsec + mono.tv_nsec); 1104 now.tv_nsec + mono.tv_nsec);
@@ -1343,7 +1106,9 @@ struct timespec get_monotonic_coarse(void)
1343} 1106}
1344 1107
1345/* 1108/*
1346 * Must hold jiffies_lock 1109 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1110 * without sampling the sequence number in xtime_lock.
1111 * jiffies is defined in the linker script...
1347 */ 1112 */
1348void do_timer(unsigned long ticks) 1113void do_timer(unsigned long ticks)
1349{ 1114{
@@ -1362,66 +1127,30 @@ void do_timer(unsigned long ticks)
1362void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1127void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1363 struct timespec *wtom, struct timespec *sleep) 1128 struct timespec *wtom, struct timespec *sleep)
1364{ 1129{
1365 struct timekeeper *tk = &timekeeper;
1366 unsigned long seq; 1130 unsigned long seq;
1367 1131
1368 do { 1132 do {
1369 seq = read_seqbegin(&tk->lock); 1133 seq = read_seqbegin(&xtime_lock);
1370 *xtim = tk_xtime(tk); 1134 *xtim = xtime;
1371 *wtom = tk->wall_to_monotonic; 1135 *wtom = wall_to_monotonic;
1372 *sleep = tk->total_sleep_time; 1136 *sleep = total_sleep_time;
1373 } while (read_seqretry(&tk->lock, seq)); 1137 } while (read_seqretry(&xtime_lock, seq));
1374}
1375
1376#ifdef CONFIG_HIGH_RES_TIMERS
1377/**
1378 * ktime_get_update_offsets - hrtimer helper
1379 * @offs_real: pointer to storage for monotonic -> realtime offset
1380 * @offs_boot: pointer to storage for monotonic -> boottime offset
1381 *
1382 * Returns current monotonic time and updates the offsets
1383 * Called from hrtimer_interrupt() or retrigger_next_event()
1384 */
1385ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1386{
1387 struct timekeeper *tk = &timekeeper;
1388 ktime_t now;
1389 unsigned int seq;
1390 u64 secs, nsecs;
1391
1392 do {
1393 seq = read_seqbegin(&tk->lock);
1394
1395 secs = tk->xtime_sec;
1396 nsecs = timekeeping_get_ns(tk);
1397
1398 *offs_real = tk->offs_real;
1399 *offs_boot = tk->offs_boot;
1400 } while (read_seqretry(&tk->lock, seq));
1401
1402 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1403 now = ktime_sub(now, *offs_real);
1404 return now;
1405} 1138}
1406#endif
1407 1139
1408/** 1140/**
1409 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1141 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1410 */ 1142 */
1411ktime_t ktime_get_monotonic_offset(void) 1143ktime_t ktime_get_monotonic_offset(void)
1412{ 1144{
1413 struct timekeeper *tk = &timekeeper;
1414 unsigned long seq; 1145 unsigned long seq;
1415 struct timespec wtom; 1146 struct timespec wtom;
1416 1147
1417 do { 1148 do {
1418 seq = read_seqbegin(&tk->lock); 1149 seq = read_seqbegin(&xtime_lock);
1419 wtom = tk->wall_to_monotonic; 1150 wtom = wall_to_monotonic;
1420 } while (read_seqretry(&tk->lock, seq)); 1151 } while (read_seqretry(&xtime_lock, seq));
1421
1422 return timespec_to_ktime(wtom); 1152 return timespec_to_ktime(wtom);
1423} 1153}
1424EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1425 1154
1426/** 1155/**
1427 * xtime_update() - advances the timekeeping infrastructure 1156 * xtime_update() - advances the timekeeping infrastructure
@@ -1431,7 +1160,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1431 */ 1160 */
1432void xtime_update(unsigned long ticks) 1161void xtime_update(unsigned long ticks)
1433{ 1162{
1434 write_seqlock(&jiffies_lock); 1163 write_seqlock(&xtime_lock);
1435 do_timer(ticks); 1164 do_timer(ticks);
1436 write_sequnlock(&jiffies_lock); 1165 write_sequnlock(&xtime_lock);
1437} 1166}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164..3258455549f 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 { 167 {
168 struct tick_sched *ts = tick_get_tick_sched(cpu); 168 struct tick_sched *ts = tick_get_tick_sched(cpu);
169 P(nohz_mode); 169 P(nohz_mode);
170 P_ns(last_tick); 170 P_ns(idle_tick);
171 P(tick_stopped); 171 P(tick_stopped);
172 P(idle_jiffies); 172 P(idle_jiffies);
173 P(idle_calls); 173 P(idle_calls);
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
259 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 260 int cpu;
261 261
262 SEQ_printf(m, "Timer List Version: v0.7\n"); 262 SEQ_printf(m, "Timer List Version: v0.6\n");
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 265
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b55..a5d0a3a85dd 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_RAW_SPINLOCK(table_lock); 84static DEFINE_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 raw_spin_lock(&table_lock); 191 spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 raw_spin_unlock(&table_lock); 218 spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d0085848..8cff36119e4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/export.h> 23#include <linux/module.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/init.h> 26#include <linux/init.h>
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(jiffies_64);
63#define TVR_SIZE (1 << TVR_BITS) 63#define TVR_SIZE (1 << TVR_BITS)
64#define TVN_MASK (TVN_SIZE - 1) 64#define TVN_MASK (TVN_SIZE - 1)
65#define TVR_MASK (TVR_SIZE - 1) 65#define TVR_MASK (TVR_SIZE - 1)
66#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
67 66
68struct tvec { 67struct tvec {
69 struct list_head vec[TVN_SIZE]; 68 struct list_head vec[TVN_SIZE];
@@ -78,7 +77,6 @@ struct tvec_base {
78 struct timer_list *running_timer; 77 struct timer_list *running_timer;
79 unsigned long timer_jiffies; 78 unsigned long timer_jiffies;
80 unsigned long next_timer; 79 unsigned long next_timer;
81 unsigned long active_timers;
82 struct tvec_root tv1; 80 struct tvec_root tv1;
83 struct tvec tv2; 81 struct tvec tv2;
84 struct tvec tv3; 82 struct tvec tv3;
@@ -93,25 +91,24 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
93/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
94static inline unsigned int tbase_get_deferrable(struct tvec_base *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
95{ 93{
96 return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); 94 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
97} 95}
98 96
99static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) 97static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
100{ 98{
101 return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); 99 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
102} 100}
103 101
104static inline struct tvec_base *tbase_get_base(struct tvec_base *base) 102static inline void timer_set_deferrable(struct timer_list *timer)
105{ 103{
106 return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); 104 timer->base = TBASE_MAKE_DEFERRED(timer->base);
107} 105}
108 106
109static inline void 107static inline void
110timer_set_base(struct timer_list *timer, struct tvec_base *new_base) 108timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
111{ 109{
112 unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; 110 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
113 111 tbase_get_deferrable(timer->base));
114 timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
115} 112}
116 113
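tbase_get_deferrable(), tbase_get_base() and timer_set_base() above rely on the low bit of an aligned tvec_base pointer being free, so the deferrable flag can ride along in the pointer itself. A userspace sketch of that tagging trick (illustrative struct and flag value, not the kernel definitions):

#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL       /* plays the role of TBASE_DEFERRABLE_FLAG */

struct base { int dummy; } __attribute__((aligned(4)));

int main(void)
{
    static struct base real_base;

    /* Tag the pointer with the flag ... */
    struct base *tagged =
        (struct base *)((uintptr_t)&real_base | DEFERRABLE_FLAG);

    /* ... and recover both pieces later. */
    unsigned long flag = (unsigned long)((uintptr_t)tagged & DEFERRABLE_FLAG);
    struct base *clean = (struct base *)((uintptr_t)tagged & ~DEFERRABLE_FLAG);

    printf("deferrable=%lu, base recovered: %s\n",
           flag, clean == &real_base ? "yes" : "no");
    return 0;
}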
117static unsigned long round_jiffies_common(unsigned long j, int cpu, 114static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -333,8 +330,7 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
333} 330}
334EXPORT_SYMBOL_GPL(set_timer_slack); 331EXPORT_SYMBOL_GPL(set_timer_slack);
335 332
336static void 333static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
337__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
338{ 334{
339 unsigned long expires = timer->expires; 335 unsigned long expires = timer->expires;
340 unsigned long idx = expires - base->timer_jiffies; 336 unsigned long idx = expires - base->timer_jiffies;
@@ -360,12 +356,11 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
360 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); 356 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
361 } else { 357 } else {
362 int i; 358 int i;
363 /* If the timeout is larger than MAX_TVAL (on 64-bit 359 /* If the timeout is larger than 0xffffffff on 64-bit
364 * architectures or with CONFIG_BASE_SMALL=1) then we 360 * architectures then we use the maximum timeout:
365 * use the maximum timeout.
366 */ 361 */
367 if (idx > MAX_TVAL) { 362 if (idx > 0xffffffffUL) {
368 idx = MAX_TVAL; 363 idx = 0xffffffffUL;
369 expires = idx + base->timer_jiffies; 364 expires = idx + base->timer_jiffies;
370 } 365 }
371 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; 366 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
@@ -377,19 +372,6 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
377 list_add_tail(&timer->entry, vec); 372 list_add_tail(&timer->entry, vec);
378} 373}
379 374
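internal_add_timer() above buckets a timeout by how far in the future it expires: the low TVR_BITS of the delta select a tv1 slot, and each further TVN_BITS selects a slot in the next cascade level. A standalone sketch with the usual non-CONFIG_BASE_SMALL constants hard-coded and an invented expiry:

#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

int main(void)
{
    unsigned long timer_jiffies = 1000000;         /* wheel's current time */
    unsigned long expires = timer_jiffies + 70000; /* made-up expiry       */
    unsigned long idx = expires - timer_jiffies;

    if (idx < TVR_SIZE)
        printf("tv1 slot %lu\n", expires & TVR_MASK);
    else if (idx < 1UL << (TVR_BITS + TVN_BITS))
        printf("tv2 slot %lu\n", (expires >> TVR_BITS) & TVN_MASK);
    else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS))
        printf("tv3 slot %lu\n", (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK);
    else
        printf("tv4 or beyond\n");
    return 0;
}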
380static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
381{
382 __internal_add_timer(base, timer);
383 /*
384 * Update base->active_timers and base->next_timer
385 */
386 if (!tbase_get_deferrable(timer->base)) {
387 if (time_before(timer->expires, base->next_timer))
388 base->next_timer = timer->expires;
389 base->active_timers++;
390 }
391}
392
393#ifdef CONFIG_TIMER_STATS 375#ifdef CONFIG_TIMER_STATS
394void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) 376void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
395{ 377{
@@ -445,12 +427,6 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
445 } 427 }
446} 428}
447 429
448/* Stub timer callback for improperly used timers. */
449static void stub_timer(unsigned long data)
450{
451 WARN_ON(1);
452}
453
454/* 430/*
455 * fixup_activate is called when: 431 * fixup_activate is called when:
456 * - an active object is activated 432 * - an active object is activated
@@ -474,8 +450,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
474 debug_object_activate(timer, &timer_debug_descr); 450 debug_object_activate(timer, &timer_debug_descr);
475 return 0; 451 return 0;
476 } else { 452 } else {
477 setup_timer(timer, stub_timer, 0); 453 WARN_ON_ONCE(1);
478 return 1;
479 } 454 }
480 return 0; 455 return 0;
481 456
@@ -505,40 +480,12 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
505 } 480 }
506} 481}
507 482
508/*
509 * fixup_assert_init is called when:
510 * - an untracked/uninit-ed object is found
511 */
512static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
513{
514 struct timer_list *timer = addr;
515
516 switch (state) {
517 case ODEBUG_STATE_NOTAVAILABLE:
518 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
519 /*
520 * This is not really a fixup. The timer was
521 * statically initialized. We just make sure that it
522 * is tracked in the object tracker.
523 */
524 debug_object_init(timer, &timer_debug_descr);
525 return 0;
526 } else {
527 setup_timer(timer, stub_timer, 0);
528 return 1;
529 }
530 default:
531 return 0;
532 }
533}
534
535static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
536 .name = "timer_list", 484 .name = "timer_list",
537 .debug_hint = timer_debug_hint, 485 .debug_hint = timer_debug_hint,
538 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
539 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
540 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
541 .fixup_assert_init = timer_fixup_assert_init,
542}; 489};
543 490
544static inline void debug_timer_init(struct timer_list *timer) 491static inline void debug_timer_init(struct timer_list *timer)
@@ -561,19 +508,16 @@ static inline void debug_timer_free(struct timer_list *timer)
561 debug_object_free(timer, &timer_debug_descr); 508 debug_object_free(timer, &timer_debug_descr);
562} 509}
563 510
564static inline void debug_timer_assert_init(struct timer_list *timer) 511static void __init_timer(struct timer_list *timer,
565{ 512 const char *name,
566 debug_object_assert_init(timer, &timer_debug_descr); 513 struct lock_class_key *key);
567}
568
569static void do_init_timer(struct timer_list *timer, unsigned int flags,
570 const char *name, struct lock_class_key *key);
571 514
572void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, 515void init_timer_on_stack_key(struct timer_list *timer,
573 const char *name, struct lock_class_key *key) 516 const char *name,
517 struct lock_class_key *key)
574{ 518{
575 debug_object_init_on_stack(timer, &timer_debug_descr); 519 debug_object_init_on_stack(timer, &timer_debug_descr);
576 do_init_timer(timer, flags, name, key); 520 __init_timer(timer, name, key);
577} 521}
578EXPORT_SYMBOL_GPL(init_timer_on_stack_key); 522EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
579 523
@@ -587,7 +531,6 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
587static inline void debug_timer_init(struct timer_list *timer) { } 531static inline void debug_timer_init(struct timer_list *timer) { }
588static inline void debug_timer_activate(struct timer_list *timer) { } 532static inline void debug_timer_activate(struct timer_list *timer) { }
589static inline void debug_timer_deactivate(struct timer_list *timer) { } 533static inline void debug_timer_deactivate(struct timer_list *timer) { }
590static inline void debug_timer_assert_init(struct timer_list *timer) { }
591#endif 534#endif
592 535
593static inline void debug_init(struct timer_list *timer) 536static inline void debug_init(struct timer_list *timer)
@@ -609,18 +552,12 @@ static inline void debug_deactivate(struct timer_list *timer)
609 trace_timer_cancel(timer); 552 trace_timer_cancel(timer);
610} 553}
611 554
612static inline void debug_assert_init(struct timer_list *timer) 555static void __init_timer(struct timer_list *timer,
613{ 556 const char *name,
614 debug_timer_assert_init(timer); 557 struct lock_class_key *key)
615}
616
617static void do_init_timer(struct timer_list *timer, unsigned int flags,
618 const char *name, struct lock_class_key *key)
619{ 558{
620 struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
621
622 timer->entry.next = NULL; 559 timer->entry.next = NULL;
623 timer->base = (void *)((unsigned long)base | flags); 560 timer->base = __raw_get_cpu_var(tvec_bases);
624 timer->slack = -1; 561 timer->slack = -1;
625#ifdef CONFIG_TIMER_STATS 562#ifdef CONFIG_TIMER_STATS
626 timer->start_site = NULL; 563 timer->start_site = NULL;
@@ -630,10 +567,22 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
630 lockdep_init_map(&timer->lockdep_map, name, key, 0); 567 lockdep_init_map(&timer->lockdep_map, name, key, 0);
631} 568}
632 569
570void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
571 const char *name,
572 struct lock_class_key *key,
573 void (*function)(unsigned long),
574 unsigned long data)
575{
576 timer->function = function;
577 timer->data = data;
578 init_timer_on_stack_key(timer, name, key);
579 timer_set_deferrable(timer);
580}
581EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
582
633/** 583/**
634 * init_timer_key - initialize a timer 584 * init_timer_key - initialize a timer
635 * @timer: the timer to be initialized 585 * @timer: the timer to be initialized
636 * @flags: timer flags
637 * @name: name of the timer 586 * @name: name of the timer
638 * @key: lockdep class key of the fake lock used for tracking timer 587 * @key: lockdep class key of the fake lock used for tracking timer
639 * sync lock dependencies 588 * sync lock dependencies
@@ -641,15 +590,26 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
641 * init_timer_key() must be done to a timer prior to calling *any* of the 590 * init_timer_key() must be done to a timer prior to calling *any* of the
642 * other timer functions. 591 * other timer functions.
643 */ 592 */
644void init_timer_key(struct timer_list *timer, unsigned int flags, 593void init_timer_key(struct timer_list *timer,
645 const char *name, struct lock_class_key *key) 594 const char *name,
595 struct lock_class_key *key)
646{ 596{
647 debug_init(timer); 597 debug_init(timer);
648 do_init_timer(timer, flags, name, key); 598 __init_timer(timer, name, key);
649} 599}
650EXPORT_SYMBOL(init_timer_key); 600EXPORT_SYMBOL(init_timer_key);
651 601
652static inline void detach_timer(struct timer_list *timer, bool clear_pending) 602void init_timer_deferrable_key(struct timer_list *timer,
603 const char *name,
604 struct lock_class_key *key)
605{
606 init_timer_key(timer, name, key);
607 timer_set_deferrable(timer);
608}
609EXPORT_SYMBOL(init_timer_deferrable_key);
610
611static inline void detach_timer(struct timer_list *timer,
612 int clear_pending)
653{ 613{
654 struct list_head *entry = &timer->entry; 614 struct list_head *entry = &timer->entry;
655 615
@@ -661,29 +621,6 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)
661 entry->prev = LIST_POISON2; 621 entry->prev = LIST_POISON2;
662} 622}
663 623
664static inline void
665detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
666{
667 detach_timer(timer, true);
668 if (!tbase_get_deferrable(timer->base))
669 base->active_timers--;
670}
671
672static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
673 bool clear_pending)
674{
675 if (!timer_pending(timer))
676 return 0;
677
678 detach_timer(timer, clear_pending);
679 if (!tbase_get_deferrable(timer->base)) {
680 base->active_timers--;
681 if (timer->expires == base->next_timer)
682 base->next_timer = base->timer_jiffies;
683 }
684 return 1;
685}
686
687/* 624/*
688 * We are using hashed locking: holding per_cpu(tvec_bases).lock 625 * We are using hashed locking: holding per_cpu(tvec_bases).lock
689 * means that all timers which are tied to this base via timer->base are 626 * means that all timers which are tied to this base via timer->base are
@@ -729,9 +666,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
729 666
730 base = lock_timer_base(timer, &flags); 667 base = lock_timer_base(timer, &flags);
731 668
732 ret = detach_if_pending(timer, base, false); 669 if (timer_pending(timer)) {
733 if (!ret && pending_only) 670 detach_timer(timer, 0);
734 goto out_unlock; 671 if (timer->expires == base->next_timer &&
672 !tbase_get_deferrable(timer->base))
673 base->next_timer = base->timer_jiffies;
674 ret = 1;
675 } else {
676 if (pending_only)
677 goto out_unlock;
678 }
735 679
736 debug_activate(timer, expires); 680 debug_activate(timer, expires);
737 681
@@ -762,6 +706,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
762 } 706 }
763 707
764 timer->expires = expires; 708 timer->expires = expires;
709 if (time_before(timer->expires, base->next_timer) &&
710 !tbase_get_deferrable(timer->base))
711 base->next_timer = timer->expires;
765 internal_add_timer(base, timer); 712 internal_add_timer(base, timer);
766 713
767out_unlock: 714out_unlock:
@@ -868,13 +815,7 @@ EXPORT_SYMBOL(mod_timer);
868 * 815 *
869 * mod_timer_pinned() is a way to update the expire field of an 816 * mod_timer_pinned() is a way to update the expire field of an
870 * active timer (if the timer is inactive it will be activated) 817 * active timer (if the timer is inactive it will be activated)
871 * and to ensure that the timer is scheduled on the current CPU. 818 * and not allow the timer to be migrated to a different CPU.
872 *
873 * Note that this does not prevent the timer from being migrated
874 * when the current CPU goes offline. If this is a problem for
875 * you, use CPU-hotplug notifiers to handle it correctly, for
876 * example, cancelling the timer when the corresponding CPU goes
877 * offline.
878 * 819 *
879 * mod_timer_pinned(timer, expires) is equivalent to: 820 * mod_timer_pinned(timer, expires) is equivalent to:
880 * 821 *
@@ -927,6 +868,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
927 spin_lock_irqsave(&base->lock, flags); 868 spin_lock_irqsave(&base->lock, flags);
928 timer_set_base(timer, base); 869 timer_set_base(timer, base);
929 debug_activate(timer, timer->expires); 870 debug_activate(timer, timer->expires);
871 if (time_before(timer->expires, base->next_timer) &&
872 !tbase_get_deferrable(timer->base))
873 base->next_timer = timer->expires;
930 internal_add_timer(base, timer); 874 internal_add_timer(base, timer);
931 /* 875 /*
932 * Check whether the other CPU is idle and needs to be 876 * Check whether the other CPU is idle and needs to be
@@ -958,12 +902,16 @@ int del_timer(struct timer_list *timer)
958 unsigned long flags; 902 unsigned long flags;
959 int ret = 0; 903 int ret = 0;
960 904
961 debug_assert_init(timer);
962
963 timer_stats_timer_clear_start_info(timer); 905 timer_stats_timer_clear_start_info(timer);
964 if (timer_pending(timer)) { 906 if (timer_pending(timer)) {
965 base = lock_timer_base(timer, &flags); 907 base = lock_timer_base(timer, &flags);
966 ret = detach_if_pending(timer, base, true); 908 if (timer_pending(timer)) {
909 detach_timer(timer, 1);
910 if (timer->expires == base->next_timer &&
911 !tbase_get_deferrable(timer->base))
912 base->next_timer = base->timer_jiffies;
913 ret = 1;
914 }
967 spin_unlock_irqrestore(&base->lock, flags); 915 spin_unlock_irqrestore(&base->lock, flags);
968 } 916 }
969 917
@@ -984,14 +932,21 @@ int try_to_del_timer_sync(struct timer_list *timer)
984 unsigned long flags; 932 unsigned long flags;
985 int ret = -1; 933 int ret = -1;
986 934
987 debug_assert_init(timer);
988
989 base = lock_timer_base(timer, &flags); 935 base = lock_timer_base(timer, &flags);
990 936
991 if (base->running_timer != timer) { 937 if (base->running_timer == timer)
992 timer_stats_timer_clear_start_info(timer); 938 goto out;
993 ret = detach_if_pending(timer, base, true); 939
940 timer_stats_timer_clear_start_info(timer);
941 ret = 0;
942 if (timer_pending(timer)) {
943 detach_timer(timer, 1);
944 if (timer->expires == base->next_timer &&
945 !tbase_get_deferrable(timer->base))
946 base->next_timer = base->timer_jiffies;
947 ret = 1;
994 } 948 }
949out:
995 spin_unlock_irqrestore(&base->lock, flags); 950 spin_unlock_irqrestore(&base->lock, flags);
996 951
997 return ret; 952 return ret;
@@ -1009,14 +964,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
1009 * 964 *
1010 * Synchronization rules: Callers must prevent restarting of the timer, 965 * Synchronization rules: Callers must prevent restarting of the timer,
1011 * otherwise this function is meaningless. It must not be called from 966 * otherwise this function is meaningless. It must not be called from
1012 * interrupt contexts unless the timer is an irqsafe one. The caller must 967 * interrupt contexts. The caller must not hold locks which would prevent
1013 * not hold locks which would prevent completion of the timer's 968 * completion of the timer's handler. The timer's handler must not call
1014 * handler. The timer's handler must not call add_timer_on(). Upon exit the 969 * add_timer_on(). Upon exit the timer is not queued and the handler is
1015 * timer is not queued and the handler is not running on any CPU. 970 * not running on any CPU.
1016 * 971 *
1017 * Note: For !irqsafe timers, you must not hold locks that are held in 972 * Note: You must not hold locks that are held in interrupt context
1018 * interrupt context while calling this function. Even if the lock has 973 * while calling this function. Even if the lock has nothing to do
1019 * nothing to do with the timer in question. Here's why: 974 * with the timer in question. Here's why:
1020 * 975 *
1021 * CPU0 CPU1 976 * CPU0 CPU1
1022 * ---- ---- 977 * ---- ----
@@ -1053,7 +1008,7 @@ int del_timer_sync(struct timer_list *timer)
1053 * don't use it in hardirq context, because it 1008 * don't use it in hardirq context, because it
1054 * could lead to deadlock. 1009 * could lead to deadlock.
1055 */ 1010 */
1056 WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); 1011 WARN_ON(in_irq());
1057 for (;;) { 1012 for (;;) {
1058 int ret = try_to_del_timer_sync(timer); 1013 int ret = try_to_del_timer_sync(timer);
1059 if (ret >= 0) 1014 if (ret >= 0)
@@ -1078,8 +1033,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1078 */ 1033 */
1079 list_for_each_entry_safe(timer, tmp, &tv_list, entry) { 1034 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1080 BUG_ON(tbase_get_base(timer->base) != base); 1035 BUG_ON(tbase_get_base(timer->base) != base);
1081 /* No accounting, while moving them */ 1036 internal_add_timer(base, timer);
1082 __internal_add_timer(base, timer);
1083 } 1037 }
1084 1038
1085 return index; 1039 return index;
@@ -1098,9 +1052,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1098 * warnings as well as problems when looking into 1052 * warnings as well as problems when looking into
1099 * timer->lockdep_map, make a copy and use that here. 1053 * timer->lockdep_map, make a copy and use that here.
1100 */ 1054 */
1101 struct lockdep_map lockdep_map; 1055 struct lockdep_map lockdep_map = timer->lockdep_map;
1102
1103 lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
1104#endif 1056#endif
1105 /* 1057 /*
1106 * Couple the lock chain with the lock chain at 1058 * Couple the lock chain with the lock chain at
@@ -1160,27 +1112,19 @@ static inline void __run_timers(struct tvec_base *base)
1160 while (!list_empty(head)) { 1112 while (!list_empty(head)) {
1161 void (*fn)(unsigned long); 1113 void (*fn)(unsigned long);
1162 unsigned long data; 1114 unsigned long data;
1163 bool irqsafe;
1164 1115
1165 timer = list_first_entry(head, struct timer_list,entry); 1116 timer = list_first_entry(head, struct timer_list,entry);
1166 fn = timer->function; 1117 fn = timer->function;
1167 data = timer->data; 1118 data = timer->data;
1168 irqsafe = tbase_get_irqsafe(timer->base);
1169 1119
1170 timer_stats_account_timer(timer); 1120 timer_stats_account_timer(timer);
1171 1121
1172 base->running_timer = timer; 1122 base->running_timer = timer;
1173 detach_expired_timer(timer, base); 1123 detach_timer(timer, 1);
1174 1124
1175 if (irqsafe) { 1125 spin_unlock_irq(&base->lock);
1176 spin_unlock(&base->lock); 1126 call_timer_fn(timer, fn, data);
1177 call_timer_fn(timer, fn, data); 1127 spin_lock_irq(&base->lock);
1178 spin_lock(&base->lock);
1179 } else {
1180 spin_unlock_irq(&base->lock);
1181 call_timer_fn(timer, fn, data);
1182 spin_lock_irq(&base->lock);
1183 }
1184 } 1128 }
1185 } 1129 }
1186 base->running_timer = NULL; 1130 base->running_timer = NULL;
@@ -1314,21 +1258,18 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
1314unsigned long get_next_timer_interrupt(unsigned long now) 1258unsigned long get_next_timer_interrupt(unsigned long now)
1315{ 1259{
1316 struct tvec_base *base = __this_cpu_read(tvec_bases); 1260 struct tvec_base *base = __this_cpu_read(tvec_bases);
1317 unsigned long expires = now + NEXT_TIMER_MAX_DELTA; 1261 unsigned long expires;
1318 1262
1319 /* 1263 /*
1320 * Pretend that there is no timer pending if the cpu is offline. 1264 * Pretend that there is no timer pending if the cpu is offline.
1321 * Possible pending timers will be migrated later to an active cpu. 1265 * Possible pending timers will be migrated later to an active cpu.
1322 */ 1266 */
1323 if (cpu_is_offline(smp_processor_id())) 1267 if (cpu_is_offline(smp_processor_id()))
1324 return expires; 1268 return now + NEXT_TIMER_MAX_DELTA;
1325
1326 spin_lock(&base->lock); 1269 spin_lock(&base->lock);
1327 if (base->active_timers) { 1270 if (time_before_eq(base->next_timer, base->timer_jiffies))
1328 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1271 base->next_timer = __next_timer_interrupt(base);
1329 base->next_timer = __next_timer_interrupt(base); 1272 expires = base->next_timer;
1330 expires = base->next_timer;
1331 }
1332 spin_unlock(&base->lock); 1273 spin_unlock(&base->lock);
1333 1274
1334 if (time_before_eq(expires, now)) 1275 if (time_before_eq(expires, now))
@@ -1395,6 +1336,13 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1395 1336
1396#endif 1337#endif
1397 1338
1339#ifndef __alpha__
1340
1341/*
1342 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
1343 * should be moved into arch/i386 instead?
1344 */
1345
1398/** 1346/**
1399 * sys_getpid - return the thread group id of the current process 1347 * sys_getpid - return the thread group id of the current process
1400 * 1348 *
@@ -1420,7 +1368,7 @@ SYSCALL_DEFINE0(getppid)
1420 int pid; 1368 int pid;
1421 1369
1422 rcu_read_lock(); 1370 rcu_read_lock();
1423 pid = task_tgid_vnr(rcu_dereference(current->real_parent)); 1371 pid = task_tgid_vnr(current->real_parent);
1424 rcu_read_unlock(); 1372 rcu_read_unlock();
1425 1373
1426 return pid; 1374 return pid;
@@ -1429,27 +1377,29 @@ SYSCALL_DEFINE0(getppid)
1429SYSCALL_DEFINE0(getuid) 1377SYSCALL_DEFINE0(getuid)
1430{ 1378{
1431 /* Only we change this so SMP safe */ 1379 /* Only we change this so SMP safe */
1432 return from_kuid_munged(current_user_ns(), current_uid()); 1380 return current_uid();
1433} 1381}
1434 1382
1435SYSCALL_DEFINE0(geteuid) 1383SYSCALL_DEFINE0(geteuid)
1436{ 1384{
1437 /* Only we change this so SMP safe */ 1385 /* Only we change this so SMP safe */
1438 return from_kuid_munged(current_user_ns(), current_euid()); 1386 return current_euid();
1439} 1387}
1440 1388
1441SYSCALL_DEFINE0(getgid) 1389SYSCALL_DEFINE0(getgid)
1442{ 1390{
1443 /* Only we change this so SMP safe */ 1391 /* Only we change this so SMP safe */
1444 return from_kgid_munged(current_user_ns(), current_gid()); 1392 return current_gid();
1445} 1393}
1446 1394
1447SYSCALL_DEFINE0(getegid) 1395SYSCALL_DEFINE0(getegid)
1448{ 1396{
1449 /* Only we change this so SMP safe */ 1397 /* Only we change this so SMP safe */
1450 return from_kgid_munged(current_user_ns(), current_egid()); 1398 return current_egid();
1451} 1399}
1452 1400
1401#endif
1402
1453static void process_timeout(unsigned long __data) 1403static void process_timeout(unsigned long __data)
1454{ 1404{
1455 wake_up_process((struct task_struct *)__data); 1405 wake_up_process((struct task_struct *)__data);
@@ -1696,7 +1646,6 @@ static int __cpuinit init_timers_cpu(int cpu)
1696 1646
1697 base->timer_jiffies = jiffies; 1647 base->timer_jiffies = jiffies;
1698 base->next_timer = base->timer_jiffies; 1648 base->next_timer = base->timer_jiffies;
1699 base->active_timers = 0;
1700 return 0; 1649 return 0;
1701} 1650}
1702 1651
@@ -1707,9 +1656,11 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1707 1656
1708 while (!list_empty(head)) { 1657 while (!list_empty(head)) {
1709 timer = list_first_entry(head, struct timer_list, entry); 1658 timer = list_first_entry(head, struct timer_list, entry);
1710 /* We ignore the accounting on the dying cpu */ 1659 detach_timer(timer, 0);
1711 detach_timer(timer, false);
1712 timer_set_base(timer, new_base); 1660 timer_set_base(timer, new_base);
1661 if (time_before(timer->expires, new_base->next_timer) &&
1662 !tbase_get_deferrable(timer->base))
1663 new_base->next_timer = timer->expires;
1713 internal_add_timer(new_base, timer); 1664 internal_add_timer(new_base, timer);
1714 } 1665 }
1715} 1666}
@@ -1779,13 +1730,9 @@ static struct notifier_block __cpuinitdata timers_nb = {
1779 1730
1780void __init init_timers(void) 1731void __init init_timers(void)
1781{ 1732{
1782 int err; 1733 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1783 1734 (void *)(long)smp_processor_id());
1784 /* ensure there are enough low bits for flags in timer->base pointer */
1785 BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
1786 1735
1787 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1788 (void *)(long)smp_processor_id());
1789 init_timer_stats(); 1736 init_timer_stats();
1790 1737
1791 BUG_ON(err != NOTIFY_OK); 1738 BUG_ON(err != NOTIFY_OK);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485..93168c0f991 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,11 +49,6 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_FENTRY
53 bool
54 help
55 Arch supports the gcc options -pg with -mfentry
56
57config HAVE_C_RECORDMCOUNT 52config HAVE_C_RECORDMCOUNT
58 bool 53 bool
59 help 54 help
@@ -62,12 +57,8 @@ config HAVE_C_RECORDMCOUNT
62config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
63 bool 58 bool
64 59
65config TRACE_CLOCK
66 bool
67
68config RING_BUFFER 60config RING_BUFFER
69 bool 61 bool
70 select TRACE_CLOCK
71 62
72config FTRACE_NMI_ENTER 63config FTRACE_NMI_ENTER
73 bool 64 bool
@@ -118,8 +109,6 @@ config TRACING
118 select NOP_TRACER 109 select NOP_TRACER
119 select BINARY_PRINTF 110 select BINARY_PRINTF
120 select EVENT_TRACING 111 select EVENT_TRACING
121 select TRACE_CLOCK
122 select IRQ_WORK
123 112
124config GENERIC_TRACER 113config GENERIC_TRACER
125 bool 114 bool
@@ -152,6 +141,7 @@ if FTRACE
152config FUNCTION_TRACER 141config FUNCTION_TRACER
153 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
154 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
155 select KALLSYMS 145 select KALLSYMS
156 select GENERIC_TRACER 146 select GENERIC_TRACER
157 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
@@ -282,7 +272,7 @@ config PROFILE_ANNOTATED_BRANCHES
282 bool "Trace likely/unlikely profiler" 272 bool "Trace likely/unlikely profiler"
283 select TRACE_BRANCH_PROFILING 273 select TRACE_BRANCH_PROFILING
284 help 274 help
285 This tracer profiles all likely and unlikely macros 275 This tracer profiles all the likely and unlikely macros
286 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
287 277
288 /sys/kernel/debug/tracing/trace_stat/branch_annotated 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
@@ -383,7 +373,6 @@ config KPROBE_EVENT
383 depends on HAVE_REGS_AND_STACK_ACCESS_API 373 depends on HAVE_REGS_AND_STACK_ACCESS_API
384 bool "Enable kprobes-based dynamic events" 374 bool "Enable kprobes-based dynamic events"
385 select TRACING 375 select TRACING
386 select PROBE_EVENTS
387 default y 376 default y
388 help 377 help
389 This allows the user to add tracing events (similar to tracepoints) 378 This allows the user to add tracing events (similar to tracepoints)
@@ -396,25 +385,6 @@ config KPROBE_EVENT
396 This option is also required by perf-probe subcommand of perf tools. 385 This option is also required by perf-probe subcommand of perf tools.
397 If you want to use perf tools, this option is strongly recommended. 386 If you want to use perf tools, this option is strongly recommended.
398 387
399config UPROBE_EVENT
400 bool "Enable uprobes-based dynamic events"
401 depends on ARCH_SUPPORTS_UPROBES
402 depends on MMU
403 select UPROBES
404 select PROBE_EVENTS
405 select TRACING
406 default n
407 help
408 This allows the user to add tracing events on top of userspace
409 dynamic events (similar to tracepoints) on the fly via the trace
410 events interface. Those events can be inserted wherever uprobes
411 can probe, and record various registers.
412 This option is required if you plan to use perf-probe subcommand
413 of perf tools on user space applications.
414
415config PROBE_EVENTS
416 def_bool n
417
418config DYNAMIC_FTRACE 388config DYNAMIC_FTRACE
419 bool "enable/disable ftrace tracepoints dynamically" 389 bool "enable/disable ftrace tracepoints dynamically"
420 depends on FUNCTION_TRACER 390 depends on FUNCTION_TRACER
@@ -517,6 +487,39 @@ config RING_BUFFER_BENCHMARK
517 487
518 If unsure, say N. 488 If unsure, say N.
519 489
490config TRACELEVEL
491 bool "Add capability to prioritize traces"
492 depends on EVENT_TRACING
493 help
494 This option allows subsystem programmers to add priorities to trace
495	  events by calling tracelevel_register. Traces of high priority
496	  will automatically be enabled on kernel boot, and users can change
497	  the trace level via a kernel parameter.
498
499config TRACEDUMP
500 bool "Dumping functionality for ftrace"
501 depends on FUNCTION_TRACER
502 help
503	  This option adds functionality to dump tracing data in several forms.
504	  Data can be dumped in ASCII form or as raw pages from the tracing
505	  ring buffers, along with the saved cmdlines; the format is selected
506	  by the module parameter tracedump_ascii. Data will be compressed
507	  using zlib.
508
509config TRACEDUMP_PANIC
510 bool "Tracedump to console on panic"
511 depends on TRACEDUMP
512 help
513 With this option, tracedump will automatically dump to the console
514 on a kernel panic.
515
516config TRACEDUMP_PROCFS
517 bool "Tracedump via proc file"
518 depends on TRACEDUMP
519 help
520 With this option, tracedump can be dumped from user space by reading
521 from /proc/tracedump.
522
520endif # FTRACE 523endif # FTRACE
521 524
522endif # TRACING_SUPPORT 525endif # TRACING_SUPPORT
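
The TRACEDUMP_PROCFS help text above says the dump is exposed through /proc/tracedump and compressed with zlib. Assuming an image built with these options and that exact interface, a user-space sketch that simply copies the compressed stream to a file for later inflation with any zlib-capable tool could look like:

#include <stdio.h>
#include <stdlib.h>

/*
 * Copy the (zlib-compressed) dump exposed by CONFIG_TRACEDUMP_PROCFS to a
 * local file. Assumes the /proc/tracedump node described above exists.
 */
int main(void)
{
	FILE *in = fopen("/proc/tracedump", "rb");
	FILE *out = fopen("tracedump.z", "wb");
	char buf[4096];
	size_t n;

	if (!in || !out) {
		perror("open");
		return EXIT_FAILURE;
	}

	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);

	fclose(in);
	fclose(out);
	return 0;
}
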
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d7e2068e4b7..1360a1a90d5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -5,21 +5,21 @@ ifdef CONFIG_FUNCTION_TRACER
5ORIG_CFLAGS := $(KBUILD_CFLAGS) 5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) 6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7 7
8ifdef CONFIG_FTRACE_SELFTEST
9# selftest needs instrumentation 8# selftest needs instrumentation
10CFLAGS_trace_selftest_dynamic.o = -pg 9CFLAGS_trace_selftest_dynamic.o = -pg
11obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
12endif 11endif
13endif
14 12
15# If unlikely tracing is enabled, do not trace these files 13# If unlikely tracing is enabled, do not trace these files
16ifdef CONFIG_TRACING_BRANCHES 14ifdef CONFIG_TRACING_BRANCHES
17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
18endif 16endif
19 17
20CFLAGS_trace_events_filter.o := -I$(src) 18#
21 19# Make the trace clocks available generally: it's infrastructure
22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o 20# relied on by ptrace for example:
21#
22obj-y += trace_clock.o
23 23
24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
@@ -39,6 +39,7 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 39obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 40obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 41obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
42obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 43obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43ifeq ($(CONFIG_BLOCK),y) 44ifeq ($(CONFIG_BLOCK),y)
44obj-$(CONFIG_EVENT_TRACING) += blktrace.o 45obj-$(CONFIG_EVENT_TRACING) += blktrace.o
@@ -52,13 +53,10 @@ endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
53obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
54obj-$(CONFIG_TRACEPOINTS) += power-traces.o 55obj-$(CONFIG_TRACEPOINTS) += power-traces.o
55ifeq ($(CONFIG_PM_RUNTIME),y)
56obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
57endif
58ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
59obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
60endif 58endif
61obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o 59obj-$(CONFIG_TRACELEVEL) += tracelevel.o
62obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o 60obj-$(CONFIG_TRACEDUMP) += tracedump.o
63 61
64libftrace-y := ftrace.o 62libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741..7c910a5593a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/export.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/uaccess.h> 27#include <linux/uaccess.h>
29 28
@@ -311,6 +310,13 @@ int blk_trace_remove(struct request_queue *q)
311} 310}
312EXPORT_SYMBOL_GPL(blk_trace_remove); 311EXPORT_SYMBOL_GPL(blk_trace_remove);
313 312
313static int blk_dropped_open(struct inode *inode, struct file *filp)
314{
315 filp->private_data = inode->i_private;
316
317 return 0;
318}
319
314static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, 320static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
315 size_t count, loff_t *ppos) 321 size_t count, loff_t *ppos)
316{ 322{
@@ -324,11 +330,18 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
324 330
325static const struct file_operations blk_dropped_fops = { 331static const struct file_operations blk_dropped_fops = {
326 .owner = THIS_MODULE, 332 .owner = THIS_MODULE,
327 .open = simple_open, 333 .open = blk_dropped_open,
328 .read = blk_dropped_read, 334 .read = blk_dropped_read,
329 .llseek = default_llseek, 335 .llseek = default_llseek,
330}; 336};
331 337
338static int blk_msg_open(struct inode *inode, struct file *filp)
339{
340 filp->private_data = inode->i_private;
341
342 return 0;
343}
344
332static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, 345static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
333 size_t count, loff_t *ppos) 346 size_t count, loff_t *ppos)
334{ 347{
@@ -357,7 +370,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
357 370
358static const struct file_operations blk_msg_fops = { 371static const struct file_operations blk_msg_fops = {
359 .owner = THIS_MODULE, 372 .owner = THIS_MODULE,
360 .open = simple_open, 373 .open = blk_msg_open,
361 .write = blk_msg_write, 374 .write = blk_msg_write,
362 .llseek = noop_llseek, 375 .llseek = noop_llseek,
363}; 376};
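
Both blk_dropped_open() and blk_msg_open() reinstate the boilerplate that newer kernels fold into simple_open(): stash the inode's i_private cookie in file->private_data at open time so the read/write handlers can find their per-queue state. A generic kernel-style sketch of that pattern (illustrative names, not the blktrace code):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/uaccess.h>

struct demo_state {
	int dropped;
};

/* Open: remember which object this debugfs/procfs node belongs to. */
static int demo_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->i_private;	/* set at file creation time */
	return 0;
}

/* Read: recover the object and report from it. */
static ssize_t demo_read(struct file *filp, char __user *buf,
			 size_t count, loff_t *ppos)
{
	struct demo_state *st = filp->private_data;
	char tmp[16];
	int len = scnprintf(tmp, sizeof(tmp), "%d\n", st->dropped);

	return simple_read_from_buffer(buf, count, ppos, tmp, len);
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.open	= demo_open,
	.read	= demo_read,
	.llseek	= default_llseek,
};
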
@@ -388,7 +401,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
388 401
389static struct dentry *blk_create_buf_file_callback(const char *filename, 402static struct dentry *blk_create_buf_file_callback(const char *filename,
390 struct dentry *parent, 403 struct dentry *parent,
391 umode_t mode, 404 int mode,
392 struct rchan_buf *buf, 405 struct rchan_buf *buf,
393 int *is_global) 406 int *is_global)
394{ 407{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3ffe4c5ad3f..798b16cd40f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 Nadia Yvette Chambers 13 * Copyright (C) 2004 William Lee Irwin III
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -22,13 +22,10 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/bsearch.h>
26#include <linux/module.h>
27#include <linux/ftrace.h> 25#include <linux/ftrace.h>
28#include <linux/sysctl.h> 26#include <linux/sysctl.h>
29#include <linux/slab.h> 27#include <linux/slab.h>
30#include <linux/ctype.h> 28#include <linux/ctype.h>
31#include <linux/sort.h>
32#include <linux/list.h> 29#include <linux/list.h>
33#include <linux/hash.h> 30#include <linux/hash.h>
34#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
@@ -62,22 +59,12 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 59#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 60#define FTRACE_HASH_MAX_BITS 12
64 61
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66
67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
70};
71
72/* ftrace_enabled is a method to turn ftrace on or off */ 62/* ftrace_enabled is a method to turn ftrace on or off */
73int ftrace_enabled __read_mostly; 63int ftrace_enabled __read_mostly;
74static int last_ftrace_enabled; 64static int last_ftrace_enabled;
75 65
76/* Quick disabling of function tracer. */ 66/* Quick disabling of function tracer. */
77int function_trace_stop __read_mostly; 67int function_trace_stop;
78
79/* Current function tracing op */
80struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
81 68
82/* List for set_ftrace_pid's pids. */ 69/* List for set_ftrace_pid's pids. */
83LIST_HEAD(ftrace_pids); 70LIST_HEAD(ftrace_pids);
@@ -94,43 +81,20 @@ static int ftrace_disabled __read_mostly;
94 81
95static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
96 83
84static struct ftrace_ops ftrace_list_end __read_mostly = {
85 .func = ftrace_stub,
86};
87
97static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
98static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
99static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
100ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
101ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
102static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
103static struct ftrace_ops control_ops;
104
105#if ARCH_SUPPORTS_FTRACE_OPS
106static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
107 struct ftrace_ops *op, struct pt_regs *regs);
108#else
109/* See comment below, where ftrace_ops_list_func is defined */
110static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif
113
114/**
115 * ftrace_nr_registered_ops - return number of ops registered
116 *
117 * Returns the number of ftrace_ops registered and tracing functions
118 */
119int ftrace_nr_registered_ops(void)
120{
121 struct ftrace_ops *ops;
122 int cnt = 0;
123
124 mutex_lock(&ftrace_lock);
125 95
126 for (ops = ftrace_ops_list; 96static void
127 ops != &ftrace_list_end; ops = ops->next) 97ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
128 cnt++;
129
130 mutex_unlock(&ftrace_lock);
131
132 return cnt;
133}
134 98
135/* 99/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we 100 * Traverse the ftrace_global_list, invoking all entries. The reason that we
@@ -141,29 +105,29 @@ int ftrace_nr_registered_ops(void)
141 * 105 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations! 106 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */ 107 */
144static void 108static void ftrace_global_list_func(unsigned long ip,
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 109 unsigned long parent_ip)
146 struct ftrace_ops *op, struct pt_regs *regs)
147{ 110{
111 struct ftrace_ops *op;
112
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 113 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
149 return; 114 return;
150 115
151 trace_recursion_set(TRACE_GLOBAL_BIT); 116 trace_recursion_set(TRACE_GLOBAL_BIT);
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 117 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) { 118 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 119 op->func(ip, parent_ip);
155 op = rcu_dereference_raw(op->next); /*see above*/ 120 op = rcu_dereference_raw(op->next); /*see above*/
156 }; 121 };
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 122 trace_recursion_clear(TRACE_GLOBAL_BIT);
158} 123}
159 124
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 125static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
161 struct ftrace_ops *op, struct pt_regs *regs)
162{ 126{
163 if (!test_tsk_trace_trace(current)) 127 if (!test_tsk_trace_trace(current))
164 return; 128 return;
165 129
166 ftrace_pid_function(ip, parent_ip, op, regs); 130 ftrace_pid_function(ip, parent_ip);
167} 131}
168 132
169static void set_ftrace_pid_function(ftrace_func_t func) 133static void set_ftrace_pid_function(ftrace_func_t func)
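
The hunk above drops the extra struct ftrace_ops * and struct pt_regs * arguments, so every callback in this tree uses the two-argument ftrace_func_t. A minimal module sketch registering such a callback, assuming a kernel of this vintage (register_ftrace_function() is the public entry point; the callback body is illustrative):

#include <linux/ftrace.h>
#include <linux/module.h>

/* Callback with the two-argument signature used by this tree. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* Keep it cheap: this runs on every traced function entry. */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_func,
};

static int __init demo_init(void)
{
	return register_ftrace_function(&my_ops);
}

static void __exit demo_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
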
@@ -182,34 +146,24 @@ static void set_ftrace_pid_function(ftrace_func_t func)
182void clear_ftrace_function(void) 146void clear_ftrace_function(void)
183{ 147{
184 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
185 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
186} 152}
187 153
188static void control_ops_disable_all(struct ftrace_ops *ops) 154#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
189{ 155/*
190 int cpu; 156 * For those archs that do not test ftrace_trace_stop in their
191 157 * mcount call site, we need to do it from C.
192 for_each_possible_cpu(cpu) 158 */
193 *per_cpu_ptr(ops->disabled, cpu) = 1; 159static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
194}
195
196static int control_ops_alloc(struct ftrace_ops *ops)
197{ 160{
198 int __percpu *disabled; 161 if (function_trace_stop)
199 162 return;
200 disabled = alloc_percpu(int);
201 if (!disabled)
202 return -ENOMEM;
203 163
204 ops->disabled = disabled; 164 __ftrace_trace_function(ip, parent_ip);
205 control_ops_disable_all(ops);
206 return 0;
207}
208
209static void control_ops_free(struct ftrace_ops *ops)
210{
211 free_percpu(ops->disabled);
212} 165}
166#endif
213 167
214static void update_global_ops(void) 168static void update_global_ops(void)
215{ 169{
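
The removed control_ops_alloc()/control_ops_free() pair managed a per-CPU "disabled" flag with the percpu allocator; control ops do not exist in this tree, so what follows is only a generic sketch of that per-CPU pattern for reference (names are illustrative):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>

struct demo_ops {
	int __percpu *disabled;
};

static int demo_ops_alloc(struct demo_ops *ops)
{
	int cpu;

	ops->disabled = alloc_percpu(int);
	if (!ops->disabled)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(ops->disabled, cpu) = 1;	/* start disabled */

	return 0;
}

static void demo_ops_free(struct demo_ops *ops)
{
	free_percpu(ops->disabled);
}
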
@@ -243,27 +197,27 @@ static void update_ftrace_function(void)
243 197
244 /* 198 /*
245 * If we are at the end of the list and this ops is 199 * If we are at the end of the list and this ops is
246 * recursion safe and not dynamic and the arch supports passing ops, 200 * not dynamic, then have the mcount trampoline call
247 * then have the mcount trampoline call the function directly. 201 * the function directly
248 */ 202 */
249 if (ftrace_ops_list == &ftrace_list_end || 203 if (ftrace_ops_list == &ftrace_list_end ||
250 (ftrace_ops_list->next == &ftrace_list_end && 204 (ftrace_ops_list->next == &ftrace_list_end &&
251 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 205 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
252 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
253 !FTRACE_FORCE_LIST_FUNC)) {
254 /* Set the ftrace_ops that the arch callback uses */
255 if (ftrace_ops_list == &global_ops)
256 function_trace_op = ftrace_global_list;
257 else
258 function_trace_op = ftrace_ops_list;
259 func = ftrace_ops_list->func; 206 func = ftrace_ops_list->func;
260 } else { 207 else
261 /* Just use the default ftrace_ops */
262 function_trace_op = &ftrace_list_end;
263 func = ftrace_ops_list_func; 208 func = ftrace_ops_list_func;
264 }
265 209
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
266 ftrace_trace_function = func; 211 ftrace_trace_function = func;
212#else
213#ifdef CONFIG_DYNAMIC_FTRACE
214 /* do not update till all functions have been modified */
215 __ftrace_trace_function_delay = func;
216#else
217 __ftrace_trace_function = func;
218#endif
219 ftrace_trace_function = ftrace_test_stop_func;
220#endif
267} 221}
268 222
269static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 223static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
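
On architectures without CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST, the restored path points ftrace_trace_function at a C wrapper that tests function_trace_stop before chaining to the real callback (which is swapped in later, via __ftrace_trace_function_delay, only once all call sites are patched). A small user-space model of the guarded indirect call:

#include <stdio.h>

static int function_trace_stop;

static void real_callback(unsigned long ip, unsigned long parent_ip)
{
	printf("traced ip=%#lx from %#lx\n", ip, parent_ip);
}

static void (*__trace_function)(unsigned long, unsigned long) = real_callback;

static void trace_test_stop_func(unsigned long ip, unsigned long parent_ip)
{
	if (function_trace_stop)	/* code is being patched: stay out */
		return;
	__trace_function(ip, parent_ip);
}

int main(void)
{
	trace_test_stop_func(0x1000, 0x2000);	/* delivered */
	function_trace_stop = 1;
	trace_test_stop_func(0x1000, 0x2000);	/* suppressed */
	return 0;
}
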
@@ -302,29 +256,9 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
302 return 0; 256 return 0;
303} 257}
304 258
305static void add_ftrace_list_ops(struct ftrace_ops **list,
306 struct ftrace_ops *main_ops,
307 struct ftrace_ops *ops)
308{
309 int first = *list == &ftrace_list_end;
310 add_ftrace_ops(list, ops);
311 if (first)
312 add_ftrace_ops(&ftrace_ops_list, main_ops);
313}
314
315static int remove_ftrace_list_ops(struct ftrace_ops **list,
316 struct ftrace_ops *main_ops,
317 struct ftrace_ops *ops)
318{
319 int ret = remove_ftrace_ops(list, ops);
320 if (!ret && *list == &ftrace_list_end)
321 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
322 return ret;
323}
324
325static int __register_ftrace_function(struct ftrace_ops *ops) 259static int __register_ftrace_function(struct ftrace_ops *ops)
326{ 260{
327 if (unlikely(ftrace_disabled)) 261 if (ftrace_disabled)
328 return -ENODEV; 262 return -ENODEV;
329 263
330 if (FTRACE_WARN_ON(ops == &global_ops)) 264 if (FTRACE_WARN_ON(ops == &global_ops))
@@ -333,34 +267,15 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
333 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 267 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
334 return -EBUSY; 268 return -EBUSY;
335 269
336 /* We don't support both control and global flags set. */
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL;
339
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
341 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
344 * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
345 */
346 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
347 !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
348 return -EINVAL;
349
350 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
351 ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
352#endif
353
354 if (!core_kernel_data((unsigned long)ops)) 270 if (!core_kernel_data((unsigned long)ops))
355 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 271 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
356 272
357 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 273 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
358 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); 274 int first = ftrace_global_list == &ftrace_list_end;
275 add_ftrace_ops(&ftrace_global_list, ops);
359 ops->flags |= FTRACE_OPS_FL_ENABLED; 276 ops->flags |= FTRACE_OPS_FL_ENABLED;
360 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { 277 if (first)
361 if (control_ops_alloc(ops)) 278 add_ftrace_ops(&ftrace_ops_list, &global_ops);
362 return -ENOMEM;
363 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
364 } else 279 } else
365 add_ftrace_ops(&ftrace_ops_list, ops); 280 add_ftrace_ops(&ftrace_ops_list, ops);
366 281
@@ -384,23 +299,11 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
384 return -EINVAL; 299 return -EINVAL;
385 300
386 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 301 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
387 ret = remove_ftrace_list_ops(&ftrace_global_list, 302 ret = remove_ftrace_ops(&ftrace_global_list, ops);
388 &global_ops, ops); 303 if (!ret && ftrace_global_list == &ftrace_list_end)
304 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
389 if (!ret) 305 if (!ret)
390 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 306 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
391 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
392 ret = remove_ftrace_list_ops(&ftrace_control_list,
393 &control_ops, ops);
394 if (!ret) {
395 /*
396 * The ftrace_ops is now removed from the list,
397 * so there'll be no new users. We must ensure
398 * all current users are done before we free
399 * the control data.
400 */
401 synchronize_sched();
402 control_ops_free(ops);
403 }
404 } else 307 } else
405 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 308 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
406 309
@@ -799,8 +702,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
799} 702}
800 703
801static void 704static void
802function_profile_call(unsigned long ip, unsigned long parent_ip, 705function_profile_call(unsigned long ip, unsigned long parent_ip)
803 struct ftrace_ops *ops, struct pt_regs *regs)
804{ 706{
805 struct ftrace_profile_stat *stat; 707 struct ftrace_profile_stat *stat;
806 struct ftrace_profile *rec; 708 struct ftrace_profile *rec;
@@ -830,7 +732,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
830#ifdef CONFIG_FUNCTION_GRAPH_TRACER 732#ifdef CONFIG_FUNCTION_GRAPH_TRACER
831static int profile_graph_entry(struct ftrace_graph_ent *trace) 733static int profile_graph_entry(struct ftrace_graph_ent *trace)
832{ 734{
833 function_profile_call(trace->func, 0, NULL, NULL); 735 function_profile_call(trace->func, 0);
834 return 1; 736 return 1;
835} 737}
836 738
@@ -890,7 +792,6 @@ static void unregister_ftrace_profiler(void)
890#else 792#else
891static struct ftrace_ops ftrace_profile_ops __read_mostly = { 793static struct ftrace_ops ftrace_profile_ops __read_mostly = {
892 .func = function_profile_call, 794 .func = function_profile_call,
893 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
894}; 795};
895 796
896static int register_ftrace_profiler(void) 797static int register_ftrace_profiler(void)
@@ -1045,6 +946,13 @@ struct ftrace_func_probe {
1045 struct rcu_head rcu; 946 struct rcu_head rcu;
1046}; 947};
1047 948
949enum {
950 FTRACE_UPDATE_CALLS = (1 << 0),
951 FTRACE_DISABLE_CALLS = (1 << 1),
952 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
953 FTRACE_START_FUNC_RET = (1 << 3),
954 FTRACE_STOP_FUNC_RET = (1 << 4),
955};
1048struct ftrace_func_entry { 956struct ftrace_func_entry {
1049 struct hlist_node hlist; 957 struct hlist_node hlist;
1050 unsigned long ip; 958 unsigned long ip;
@@ -1073,22 +981,20 @@ static struct ftrace_ops global_ops = {
1073 .func = ftrace_stub, 981 .func = ftrace_stub,
1074 .notrace_hash = EMPTY_HASH, 982 .notrace_hash = EMPTY_HASH,
1075 .filter_hash = EMPTY_HASH, 983 .filter_hash = EMPTY_HASH,
1076 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1077}; 984};
1078 985
986static struct dyn_ftrace *ftrace_new_addrs;
987
1079static DEFINE_MUTEX(ftrace_regex_lock); 988static DEFINE_MUTEX(ftrace_regex_lock);
1080 989
1081struct ftrace_page { 990struct ftrace_page {
1082 struct ftrace_page *next; 991 struct ftrace_page *next;
1083 struct dyn_ftrace *records;
1084 int index; 992 int index;
1085 int size; 993 struct dyn_ftrace records[];
1086}; 994};
1087 995
1088static struct ftrace_page *ftrace_new_pgs; 996#define ENTRIES_PER_PAGE \
1089 997 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
1090#define ENTRY_SIZE sizeof(struct dyn_ftrace)
1091#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1092 998
1093/* estimate from running different kernels */ 999/* estimate from running different kernels */
1094#define NR_TO_INIT 10000 1000#define NR_TO_INIT 10000
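
The restored ftrace_page keeps its records as a flexible array member and derives ENTRIES_PER_PAGE from the space left in one page after the header, rather than pointing at a separately allocated block. A user-space model of that layout (4096 stands in for PAGE_SIZE; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct record {
	unsigned long ip;
	unsigned long flags;
};

struct page_of_records {
	struct page_of_records *next;
	int index;			/* how many records are in use */
	struct record records[];	/* flexible array fills the rest */
};

#define ENTRIES_PER_PAGE \
	((PAGE_SIZE - sizeof(struct page_of_records)) / sizeof(struct record))

int main(void)
{
	struct page_of_records *pg = calloc(1, PAGE_SIZE);

	if (!pg)
		return 1;

	printf("%zu records fit in one page\n", (size_t)ENTRIES_PER_PAGE);
	pg->records[pg->index++].ip = 0x1000;	/* append a record */
	free(pg);
	return 0;
}
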
@@ -1096,10 +1002,7 @@ static struct ftrace_page *ftrace_new_pgs;
1096static struct ftrace_page *ftrace_pages_start; 1002static struct ftrace_page *ftrace_pages_start;
1097static struct ftrace_page *ftrace_pages; 1003static struct ftrace_page *ftrace_pages;
1098 1004
1099static bool ftrace_hash_empty(struct ftrace_hash *hash) 1005static struct dyn_ftrace *ftrace_free_records;
1100{
1101 return !hash || !hash->count;
1102}
1103 1006
1104static struct ftrace_func_entry * 1007static struct ftrace_func_entry *
1105ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1008ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1109,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1109 struct hlist_head *hhd; 1012 struct hlist_head *hhd;
1110 struct hlist_node *n; 1013 struct hlist_node *n;
1111 1014
1112 if (ftrace_hash_empty(hash)) 1015 if (!hash->count)
1113 return NULL; 1016 return NULL;
1114 1017
1115 if (hash->size_bits > 0) 1018 if (hash->size_bits > 0)
@@ -1216,12 +1119,6 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1216 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1119 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1217} 1120}
1218 1121
1219void ftrace_free_filter(struct ftrace_ops *ops)
1220{
1221 free_ftrace_hash(ops->filter_hash);
1222 free_ftrace_hash(ops->notrace_hash);
1223}
1224
1225static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1122static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1226{ 1123{
1227 struct ftrace_hash *hash; 1124 struct ftrace_hash *hash;
@@ -1232,7 +1129,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1232 return NULL; 1129 return NULL;
1233 1130
1234 size = 1 << size_bits; 1131 size = 1 << size_bits;
1235 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); 1132 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1236 1133
1237 if (!hash->buckets) { 1134 if (!hash->buckets) {
1238 kfree(hash); 1135 kfree(hash);
@@ -1259,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1259 return NULL; 1156 return NULL;
1260 1157
1261 /* Empty hash? */ 1158 /* Empty hash? */
1262 if (ftrace_hash_empty(hash)) 1159 if (!hash || !hash->count)
1263 return new_hash; 1160 return new_hash;
1264 1161
1265 size = 1 << hash->size_bits; 1162 size = 1 << hash->size_bits;
@@ -1313,9 +1210,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1313 if (!src->count) { 1210 if (!src->count) {
1314 free_ftrace_hash_rcu(*dst); 1211 free_ftrace_hash_rcu(*dst);
1315 rcu_assign_pointer(*dst, EMPTY_HASH); 1212 rcu_assign_pointer(*dst, EMPTY_HASH);
1316 /* still need to update the function records */ 1213 return 0;
1317 ret = 0;
1318 goto out;
1319 } 1214 }
1320 1215
1321 /* 1216 /*
@@ -1384,9 +1279,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1384 filter_hash = rcu_dereference_raw(ops->filter_hash); 1279 filter_hash = rcu_dereference_raw(ops->filter_hash);
1385 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1280 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1386 1281
1387 if ((ftrace_hash_empty(filter_hash) || 1282 if ((!filter_hash || !filter_hash->count ||
1388 ftrace_lookup_ip(filter_hash, ip)) && 1283 ftrace_lookup_ip(filter_hash, ip)) &&
1389 (ftrace_hash_empty(notrace_hash) || 1284 (!notrace_hash || !notrace_hash->count ||
1390 !ftrace_lookup_ip(notrace_hash, ip))) 1285 !ftrace_lookup_ip(notrace_hash, ip)))
1391 ret = 1; 1286 ret = 1;
1392 else 1287 else
@@ -1409,76 +1304,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1409 } \ 1304 } \
1410 } 1305 }
1411 1306
1412
1413static int ftrace_cmp_recs(const void *a, const void *b)
1414{
1415 const struct dyn_ftrace *key = a;
1416 const struct dyn_ftrace *rec = b;
1417
1418 if (key->flags < rec->ip)
1419 return -1;
1420 if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
1421 return 1;
1422 return 0;
1423}
1424
1425static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
1426{
1427 struct ftrace_page *pg;
1428 struct dyn_ftrace *rec;
1429 struct dyn_ftrace key;
1430
1431 key.ip = start;
1432 key.flags = end; /* overload flags, as it is unsigned long */
1433
1434 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1435 if (end < pg->records[0].ip ||
1436 start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
1437 continue;
1438 rec = bsearch(&key, pg->records, pg->index,
1439 sizeof(struct dyn_ftrace),
1440 ftrace_cmp_recs);
1441 if (rec)
1442 return rec->ip;
1443 }
1444
1445 return 0;
1446}
1447
1448/**
1449 * ftrace_location - return true if the ip giving is a traced location
1450 * @ip: the instruction pointer to check
1451 *
1452 * Returns rec->ip if @ip given is a pointer to a ftrace location.
1453 * That is, the instruction that is either a NOP or call to
1454 * the function tracer. It checks the ftrace internal tables to
1455 * determine if the address belongs or not.
1456 */
1457unsigned long ftrace_location(unsigned long ip)
1458{
1459 return ftrace_location_range(ip, ip);
1460}
1461
1462/**
1463 * ftrace_text_reserved - return true if range contains an ftrace location
1464 * @start: start of range to search
1465 * @end: end of range to search (inclusive). @end points to the last byte to check.
1466 *
1467 * Returns 1 if @start and @end contains a ftrace location.
1468 * That is, the instruction that is either a NOP or call to
1469 * the function tracer. It checks the ftrace internal tables to
1470 * determine if the address belongs or not.
1471 */
1472int ftrace_text_reserved(void *start, void *end)
1473{
1474 unsigned long ret;
1475
1476 ret = ftrace_location_range((unsigned long)start,
1477 (unsigned long)end);
1478
1479 return (int)!!ret;
1480}
1481
1482static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1307static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1483 int filter_hash, 1308 int filter_hash,
1484 bool inc) 1309 bool inc)
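
The deleted ftrace_location_range() relied on each page's records being sorted by address, so a bsearch() with a range-aware comparator could answer "is this ip a patched call site?" in O(log n). A user-space model of that lookup (sizes and addresses are made up):

#include <stdio.h>
#include <stdlib.h>

#define INSN_SIZE 5UL	/* stand-in for MCOUNT_INSN_SIZE */

struct rec {
	unsigned long ip;
	unsigned long flags;	/* overloaded as "end" in the search key */
};

static int cmp_recs(const void *a, const void *b)
{
	const struct rec *key = a;
	const struct rec *rec = b;

	if (key->flags < rec->ip)		/* key range ends before rec */
		return -1;
	if (key->ip >= rec->ip + INSN_SIZE)	/* key range starts after rec */
		return 1;
	return 0;
}

static unsigned long location_range(const struct rec *recs, size_t n,
				    unsigned long start, unsigned long end)
{
	struct rec key = { .ip = start, .flags = end };
	const struct rec *found;

	found = bsearch(&key, recs, n, sizeof(*recs), cmp_recs);
	return found ? found->ip : 0;
}

int main(void)
{
	struct rec recs[] = { { 0x1000 }, { 0x1020 }, { 0x1040 } };

	printf("%#lx\n", location_range(recs, 3, 0x1022, 0x1022)); /* 0x1020 */
	printf("%#lx\n", location_range(recs, 3, 0x1100, 0x1100)); /* 0 */
	return 0;
}
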
@@ -1508,7 +1333,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1508 if (filter_hash) { 1333 if (filter_hash) {
1509 hash = ops->filter_hash; 1334 hash = ops->filter_hash;
1510 other_hash = ops->notrace_hash; 1335 other_hash = ops->notrace_hash;
1511 if (ftrace_hash_empty(hash)) 1336 if (!hash || !hash->count)
1512 all = 1; 1337 all = 1;
1513 } else { 1338 } else {
1514 inc = !inc; 1339 inc = !inc;
@@ -1518,7 +1343,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1518 * If the notrace hash has no items, 1343 * If the notrace hash has no items,
1519 * then there's nothing to do. 1344 * then there's nothing to do.
1520 */ 1345 */
1521 if (ftrace_hash_empty(hash)) 1346 if (hash && !hash->count)
1522 return; 1347 return;
1523 } 1348 }
1524 1349
@@ -1535,8 +1360,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1535 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1360 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1536 match = 1; 1361 match = 1;
1537 } else { 1362 } else {
1538 in_hash = !!ftrace_lookup_ip(hash, rec->ip); 1363 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1539 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1364 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1540 1365
1541 /* 1366 /*
1542 * 1367 *
@@ -1544,7 +1369,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1544 if (filter_hash && in_hash && !in_other_hash) 1369 if (filter_hash && in_hash && !in_other_hash)
1545 match = 1; 1370 match = 1;
1546 else if (!filter_hash && in_hash && 1371 else if (!filter_hash && in_hash &&
1547 (in_other_hash || ftrace_hash_empty(other_hash))) 1372 (in_other_hash || !other_hash->count))
1548 match = 1; 1373 match = 1;
1549 } 1374 }
1550 if (!match) 1375 if (!match)
@@ -1554,12 +1379,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1554 rec->flags++; 1379 rec->flags++;
1555 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1380 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1556 return; 1381 return;
1557 /*
1558 * If any ops wants regs saved for this function
1559 * then all ops will get saved regs.
1560 */
1561 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1562 rec->flags |= FTRACE_FL_REGS;
1563 } else { 1382 } else {
1564 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1383 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1565 return; 1384 return;
@@ -1584,6 +1403,65 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1584 __ftrace_hash_rec_update(ops, filter_hash, 1); 1403 __ftrace_hash_rec_update(ops, filter_hash, 1);
1585} 1404}
1586 1405
1406static void ftrace_free_rec(struct dyn_ftrace *rec)
1407{
1408 rec->freelist = ftrace_free_records;
1409 ftrace_free_records = rec;
1410 rec->flags |= FTRACE_FL_FREE;
1411}
1412
1413static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1414{
1415 struct dyn_ftrace *rec;
1416
1417 /* First check for freed records */
1418 if (ftrace_free_records) {
1419 rec = ftrace_free_records;
1420
1421 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1422 FTRACE_WARN_ON_ONCE(1);
1423 ftrace_free_records = NULL;
1424 return NULL;
1425 }
1426
1427 ftrace_free_records = rec->freelist;
1428 memset(rec, 0, sizeof(*rec));
1429 return rec;
1430 }
1431
1432 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1433 if (!ftrace_pages->next) {
1434 /* allocate another page */
1435 ftrace_pages->next =
1436 (void *)get_zeroed_page(GFP_KERNEL);
1437 if (!ftrace_pages->next)
1438 return NULL;
1439 }
1440 ftrace_pages = ftrace_pages->next;
1441 }
1442
1443 return &ftrace_pages->records[ftrace_pages->index++];
1444}
1445
1446static struct dyn_ftrace *
1447ftrace_record_ip(unsigned long ip)
1448{
1449 struct dyn_ftrace *rec;
1450
1451 if (ftrace_disabled)
1452 return NULL;
1453
1454 rec = ftrace_alloc_dyn_node(ip);
1455 if (!rec)
1456 return NULL;
1457
1458 rec->ip = ip;
1459 rec->newlist = ftrace_new_addrs;
1460 ftrace_new_addrs = rec;
1461
1462 return rec;
1463}
1464
1587static void print_ip_ins(const char *fmt, unsigned char *p) 1465static void print_ip_ins(const char *fmt, unsigned char *p)
1588{ 1466{
1589 int i; 1467 int i;
@@ -1594,19 +1472,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1594 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1472 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1595} 1473}
1596 1474
1597/** 1475static void ftrace_bug(int failed, unsigned long ip)
1598 * ftrace_bug - report and shutdown function tracer
1599 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1600 * @ip: The address that failed
1601 *
1602 * The arch code that enables or disables the function tracing
1603 * can call ftrace_bug() when it has detected a problem in
1604 * modifying the code. @failed should be one of either:
1605 * EFAULT - if the problem happens on reading the @ip address
1606 * EINVAL - if what is read at @ip is not what was expected
1607 * EPERM - if the problem happens on writting to the @ip address
1608 */
1609void ftrace_bug(int failed, unsigned long ip)
1610{ 1476{
1611 switch (failed) { 1477 switch (failed) {
1612 case -EFAULT: 1478 case -EFAULT:
@@ -1633,10 +1499,30 @@ void ftrace_bug(int failed, unsigned long ip)
1633 } 1499 }
1634} 1500}
1635 1501
1636static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) 1502
1503/* Return 1 if the address range is reserved for ftrace */
1504int ftrace_text_reserved(void *start, void *end)
1505{
1506 struct dyn_ftrace *rec;
1507 struct ftrace_page *pg;
1508
1509 do_for_each_ftrace_rec(pg, rec) {
1510 if (rec->ip <= (unsigned long)end &&
1511 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1512 return 1;
1513 } while_for_each_ftrace_rec();
1514 return 0;
1515}
1516
1517
1518static int
1519__ftrace_replace_code(struct dyn_ftrace *rec, int update)
1637{ 1520{
1521 unsigned long ftrace_addr;
1638 unsigned long flag = 0UL; 1522 unsigned long flag = 0UL;
1639 1523
1524 ftrace_addr = (unsigned long)FTRACE_ADDR;
1525
1640 /* 1526 /*
1641 * If we are updating calls: 1527 * If we are updating calls:
1642 * 1528 *
@@ -1648,131 +1534,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1648 * If we are disabling calls, then disable all records that 1534 * If we are disabling calls, then disable all records that
1649 * are enabled. 1535 * are enabled.
1650 */ 1536 */
1651 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1537 if (update && (rec->flags & ~FTRACE_FL_MASK))
1652 flag = FTRACE_FL_ENABLED; 1538 flag = FTRACE_FL_ENABLED;
1653 1539
1654 /*
1655 * If enabling and the REGS flag does not match the REGS_EN, then
1656 * do not ignore this record. Set flags to fail the compare against
1657 * ENABLED.
1658 */
1659 if (flag &&
1660 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
1661 flag |= FTRACE_FL_REGS;
1662
1663 /* If the state of this record hasn't changed, then do nothing */ 1540 /* If the state of this record hasn't changed, then do nothing */
1664 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1541 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1665 return FTRACE_UPDATE_IGNORE;
1666
1667 if (flag) {
1668 /* Save off if rec is being enabled (for return value) */
1669 flag ^= rec->flags & FTRACE_FL_ENABLED;
1670
1671 if (update) {
1672 rec->flags |= FTRACE_FL_ENABLED;
1673 if (flag & FTRACE_FL_REGS) {
1674 if (rec->flags & FTRACE_FL_REGS)
1675 rec->flags |= FTRACE_FL_REGS_EN;
1676 else
1677 rec->flags &= ~FTRACE_FL_REGS_EN;
1678 }
1679 }
1680
1681 /*
1682 * If this record is being updated from a nop, then
1683 * return UPDATE_MAKE_CALL.
1684 * Otherwise, if the EN flag is set, then return
1685 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1686 * from the non-save regs, to a save regs function.
1687 * Otherwise,
1688 * return UPDATE_MODIFY_CALL to tell the caller to convert
1689 * from the save regs, to a non-save regs function.
1690 */
1691 if (flag & FTRACE_FL_ENABLED)
1692 return FTRACE_UPDATE_MAKE_CALL;
1693 else if (rec->flags & FTRACE_FL_REGS_EN)
1694 return FTRACE_UPDATE_MODIFY_CALL_REGS;
1695 else
1696 return FTRACE_UPDATE_MODIFY_CALL;
1697 }
1698
1699 if (update) {
1700 /* If there's no more users, clear all flags */
1701 if (!(rec->flags & ~FTRACE_FL_MASK))
1702 rec->flags = 0;
1703 else
1704 /* Just disable the record (keep REGS state) */
1705 rec->flags &= ~FTRACE_FL_ENABLED;
1706 }
1707
1708 return FTRACE_UPDATE_MAKE_NOP;
1709}
1710
1711/**
1712 * ftrace_update_record, set a record that now is tracing or not
1713 * @rec: the record to update
1714 * @enable: set to 1 if the record is tracing, zero to force disable
1715 *
1716 * The records that represent all functions that can be traced need
1717 * to be updated when tracing has been enabled.
1718 */
1719int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1720{
1721 return ftrace_check_record(rec, enable, 1);
1722}
1723
1724/**
1725 * ftrace_test_record, check if the record has been enabled or not
1726 * @rec: the record to test
1727 * @enable: set to 1 to check if enabled, 0 if it is disabled
1728 *
1729 * The arch code may need to test if a record is already set to
1730 * tracing to determine how to modify the function code that it
1731 * represents.
1732 */
1733int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1734{
1735 return ftrace_check_record(rec, enable, 0);
1736}
1737
1738static int
1739__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1740{
1741 unsigned long ftrace_old_addr;
1742 unsigned long ftrace_addr;
1743 int ret;
1744
1745 ret = ftrace_update_record(rec, enable);
1746
1747 if (rec->flags & FTRACE_FL_REGS)
1748 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
1749 else
1750 ftrace_addr = (unsigned long)FTRACE_ADDR;
1751
1752 switch (ret) {
1753 case FTRACE_UPDATE_IGNORE:
1754 return 0; 1542 return 0;
1755 1543
1756 case FTRACE_UPDATE_MAKE_CALL: 1544 if (flag) {
1545 rec->flags |= FTRACE_FL_ENABLED;
1757 return ftrace_make_call(rec, ftrace_addr); 1546 return ftrace_make_call(rec, ftrace_addr);
1758
1759 case FTRACE_UPDATE_MAKE_NOP:
1760 return ftrace_make_nop(NULL, rec, ftrace_addr);
1761
1762 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1763 case FTRACE_UPDATE_MODIFY_CALL:
1764 if (rec->flags & FTRACE_FL_REGS)
1765 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1766 else
1767 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1768
1769 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1770 } 1547 }
1771 1548
1772 return -1; /* unknow ftrace bug */ 1549 rec->flags &= ~FTRACE_FL_ENABLED;
1550 return ftrace_make_nop(NULL, rec, ftrace_addr);
1773} 1551}
1774 1552
1775void __weak ftrace_replace_code(int enable) 1553static void ftrace_replace_code(int update)
1776{ 1554{
1777 struct dyn_ftrace *rec; 1555 struct dyn_ftrace *rec;
1778 struct ftrace_page *pg; 1556 struct ftrace_page *pg;
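
The replacement logic above collapses the newer ftrace_check_record() state machine (IGNORE / MAKE_CALL / MAKE_NOP plus the MODIFY_CALL and REGS variants) back into a single enable/disable decision keyed on the record's reference count. A user-space model of the simplified decision that remains:

#include <stdio.h>

#define FL_ENABLED	(1UL << 31)
#define FL_MASK		(FL_ENABLED)	/* flag bits; the rest is a refcount */

enum action { MAKE_CALL, MAKE_NOP, IGNORE };

static enum action replace_action(unsigned long *flags, int update)
{
	unsigned long want = 0;

	if (update && (*flags & ~FL_MASK))	/* record has registered users */
		want = FL_ENABLED;

	if ((*flags & FL_ENABLED) == want)	/* nothing to change */
		return IGNORE;

	if (want) {
		*flags |= FL_ENABLED;
		return MAKE_CALL;		/* patch NOP into a call */
	}
	*flags &= ~FL_ENABLED;
	return MAKE_NOP;			/* patch call back into a NOP */
}

int main(void)
{
	unsigned long flags = 1;	/* one user, not yet enabled */

	printf("%d\n", replace_action(&flags, 1));	/* MAKE_CALL (0) */
	printf("%d\n", replace_action(&flags, 1));	/* IGNORE (2) */
	printf("%d\n", replace_action(&flags, 0));	/* MAKE_NOP (1) */
	return 0;
}
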
@@ -1782,7 +1560,11 @@ void __weak ftrace_replace_code(int enable)
1782 return; 1560 return;
1783 1561
1784 do_for_each_ftrace_rec(pg, rec) { 1562 do_for_each_ftrace_rec(pg, rec) {
1785 failed = __ftrace_replace_code(rec, enable); 1563 /* Skip over free records */
1564 if (rec->flags & FTRACE_FL_FREE)
1565 continue;
1566
1567 failed = __ftrace_replace_code(rec, update);
1786 if (failed) { 1568 if (failed) {
1787 ftrace_bug(failed, rec->ip); 1569 ftrace_bug(failed, rec->ip);
1788 /* Stop processing */ 1570 /* Stop processing */
@@ -1791,78 +1573,6 @@ void __weak ftrace_replace_code(int enable)
1791 } while_for_each_ftrace_rec(); 1573 } while_for_each_ftrace_rec();
1792} 1574}
1793 1575
1794struct ftrace_rec_iter {
1795 struct ftrace_page *pg;
1796 int index;
1797};
1798
1799/**
1800 * ftrace_rec_iter_start, start up iterating over traced functions
1801 *
1802 * Returns an iterator handle that is used to iterate over all
1803 * the records that represent address locations where functions
1804 * are traced.
1805 *
1806 * May return NULL if no records are available.
1807 */
1808struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1809{
1810 /*
1811 * We only use a single iterator.
1812 * Protected by the ftrace_lock mutex.
1813 */
1814 static struct ftrace_rec_iter ftrace_rec_iter;
1815 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1816
1817 iter->pg = ftrace_pages_start;
1818 iter->index = 0;
1819
1820 /* Could have empty pages */
1821 while (iter->pg && !iter->pg->index)
1822 iter->pg = iter->pg->next;
1823
1824 if (!iter->pg)
1825 return NULL;
1826
1827 return iter;
1828}
1829
1830/**
1831 * ftrace_rec_iter_next, get the next record to process.
1832 * @iter: The handle to the iterator.
1833 *
1834 * Returns the next iterator after the given iterator @iter.
1835 */
1836struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1837{
1838 iter->index++;
1839
1840 if (iter->index >= iter->pg->index) {
1841 iter->pg = iter->pg->next;
1842 iter->index = 0;
1843
1844 /* Could have empty pages */
1845 while (iter->pg && !iter->pg->index)
1846 iter->pg = iter->pg->next;
1847 }
1848
1849 if (!iter->pg)
1850 return NULL;
1851
1852 return iter;
1853}
1854
1855/**
1856 * ftrace_rec_iter_record, get the record at the iterator location
1857 * @iter: The current iterator location
1858 *
1859 * Returns the record that the current @iter is at.
1860 */
1861struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1862{
1863 return &iter->pg->records[iter->index];
1864}
1865
1866static int 1576static int
1867ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1577ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1868{ 1578{
@@ -1900,55 +1610,44 @@ int __weak ftrace_arch_code_modify_post_process(void)
1900 return 0; 1610 return 0;
1901} 1611}
1902 1612
1903void ftrace_modify_all_code(int command) 1613static int __ftrace_modify_code(void *data)
1904{ 1614{
1905 if (command & FTRACE_UPDATE_CALLS) 1615 int *command = data;
1616
1617 /*
1618 * Do not call function tracer while we update the code.
1619 * We are in stop machine, no worrying about races.
1620 */
1621 function_trace_stop++;
1622
1623 if (*command & FTRACE_UPDATE_CALLS)
1906 ftrace_replace_code(1); 1624 ftrace_replace_code(1);
1907 else if (command & FTRACE_DISABLE_CALLS) 1625 else if (*command & FTRACE_DISABLE_CALLS)
1908 ftrace_replace_code(0); 1626 ftrace_replace_code(0);
1909 1627
1910 if (command & FTRACE_UPDATE_TRACE_FUNC) 1628 if (*command & FTRACE_UPDATE_TRACE_FUNC)
1911 ftrace_update_ftrace_func(ftrace_trace_function); 1629 ftrace_update_ftrace_func(ftrace_trace_function);
1912 1630
1913 if (command & FTRACE_START_FUNC_RET) 1631 if (*command & FTRACE_START_FUNC_RET)
1914 ftrace_enable_ftrace_graph_caller(); 1632 ftrace_enable_ftrace_graph_caller();
1915 else if (command & FTRACE_STOP_FUNC_RET) 1633 else if (*command & FTRACE_STOP_FUNC_RET)
1916 ftrace_disable_ftrace_graph_caller(); 1634 ftrace_disable_ftrace_graph_caller();
1917}
1918
1919static int __ftrace_modify_code(void *data)
1920{
1921 int *command = data;
1922 1635
1923 ftrace_modify_all_code(*command); 1636#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1637 /*
1638 * For archs that call ftrace_test_stop_func(), we must
1639 * wait till after we update all the function callers
1640 * before we update the callback. This keeps different
1641 * ops that record different functions from corrupting
1642 * each other.
1643 */
1644 __ftrace_trace_function = __ftrace_trace_function_delay;
1645#endif
1646 function_trace_stop--;
1924 1647
1925 return 0; 1648 return 0;
1926} 1649}
1927 1650
1928/**
1929 * ftrace_run_stop_machine, go back to the stop machine method
1930 * @command: The command to tell ftrace what to do
1931 *
1932 * If an arch needs to fall back to the stop machine method, the
1933 * it can call this function.
1934 */
1935void ftrace_run_stop_machine(int command)
1936{
1937 stop_machine(__ftrace_modify_code, &command, NULL);
1938}
1939
1940/**
1941 * arch_ftrace_update_code, modify the code to trace or not trace
1942 * @command: The command that needs to be done
1943 *
1944 * Archs can override this function if it does not need to
1945 * run stop_machine() to modify code.
1946 */
1947void __weak arch_ftrace_update_code(int command)
1948{
1949 ftrace_run_stop_machine(command);
1950}
1951
1952static void ftrace_run_update_code(int command) 1651static void ftrace_run_update_code(int command)
1953{ 1652{
1954 int ret; 1653 int ret;
@@ -1957,21 +1656,8 @@ static void ftrace_run_update_code(int command)
1957 FTRACE_WARN_ON(ret); 1656 FTRACE_WARN_ON(ret);
1958 if (ret) 1657 if (ret)
1959 return; 1658 return;
1960 /*
1961 * Do not call function tracer while we update the code.
1962 * We are in stop machine.
1963 */
1964 function_trace_stop++;
1965 1659
1966 /* 1660 stop_machine(__ftrace_modify_code, &command, NULL);
1967 * By default we use stop_machine() to modify the code.
1968 * But archs can do what ever they want as long as it
1969 * is safe. The stop_machine() is the safest, but also
1970 * produces the most overhead.
1971 */
1972 arch_ftrace_update_code(command);
1973
1974 function_trace_stop--;
1975 1661
1976 ret = ftrace_arch_code_modify_post_process(); 1662 ret = ftrace_arch_code_modify_post_process();
1977 FTRACE_WARN_ON(ret); 1663 FTRACE_WARN_ON(ret);
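
With arch_ftrace_update_code() gone, this tree always patches call sites under stop_machine(): the callback runs on one CPU while the others are parked with interrupts off, so no CPU can execute an instruction that is being rewritten. A kernel-style sketch of that pattern (illustrative, not the ftrace code itself):

#include <linux/stop_machine.h>

static int __patch_text(void *data)
{
	int *command = data;

	/* All other CPUs are held by stop_machine; safe to modify code here. */
	(void)*command;
	return 0;
}

static void run_patch(int command)
{
	/* NULL cpumask: the callback runs on one CPU, the rest are parked. */
	stop_machine(__patch_text, &command, NULL);
}
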
@@ -2098,16 +1784,14 @@ static int ops_traces_mod(struct ftrace_ops *ops)
2098 struct ftrace_hash *hash; 1784 struct ftrace_hash *hash;
2099 1785
2100 hash = ops->filter_hash; 1786 hash = ops->filter_hash;
2101 return ftrace_hash_empty(hash); 1787 return !!(!hash || !hash->count);
2102} 1788}
2103 1789
2104static int ftrace_update_code(struct module *mod) 1790static int ftrace_update_code(struct module *mod)
2105{ 1791{
2106 struct ftrace_page *pg;
2107 struct dyn_ftrace *p; 1792 struct dyn_ftrace *p;
2108 cycle_t start, stop; 1793 cycle_t start, stop;
2109 unsigned long ref = 0; 1794 unsigned long ref = 0;
2110 int i;
2111 1795
2112 /* 1796 /*
2113 * When adding a module, we need to check if tracers are 1797 * When adding a module, we need to check if tracers are
@@ -2129,44 +1813,46 @@ static int ftrace_update_code(struct module *mod)
2129 start = ftrace_now(raw_smp_processor_id()); 1813 start = ftrace_now(raw_smp_processor_id());
2130 ftrace_update_cnt = 0; 1814 ftrace_update_cnt = 0;
2131 1815
2132 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 1816 while (ftrace_new_addrs) {
2133 1817
2134 for (i = 0; i < pg->index; i++) { 1818 /* If something went wrong, bail without enabling anything */
2135 /* If something went wrong, bail without enabling anything */ 1819 if (unlikely(ftrace_disabled))
2136 if (unlikely(ftrace_disabled)) 1820 return -1;
2137 return -1;
2138 1821
2139 p = &pg->records[i]; 1822 p = ftrace_new_addrs;
2140 p->flags = ref; 1823 ftrace_new_addrs = p->newlist;
1824 p->flags = ref;
2141 1825
2142 /* 1826 /*
2143 * Do the initial record conversion from mcount jump 1827 * Do the initial record conversion from mcount jump
2144 * to the NOP instructions. 1828 * to the NOP instructions.
2145 */ 1829 */
2146 if (!ftrace_code_disable(mod, p)) 1830 if (!ftrace_code_disable(mod, p)) {
2147 break; 1831 ftrace_free_rec(p);
1832 /* Game over */
1833 break;
1834 }
2148 1835
2149 ftrace_update_cnt++; 1836 ftrace_update_cnt++;
2150 1837
2151 /* 1838 /*
2152 * If the tracing is enabled, go ahead and enable the record. 1839 * If the tracing is enabled, go ahead and enable the record.
2153 * 1840 *
2154 * The reason not to enable the record immediatelly is the 1841 * The reason not to enable the record immediatelly is the
2155 * inherent check of ftrace_make_nop/ftrace_make_call for 1842 * inherent check of ftrace_make_nop/ftrace_make_call for
2156 * correct previous instructions. Making first the NOP 1843 * correct previous instructions. Making first the NOP
2157 * conversion puts the module to the correct state, thus 1844 * conversion puts the module to the correct state, thus
2158 * passing the ftrace_make_call check. 1845 * passing the ftrace_make_call check.
2159 */ 1846 */
2160 if (ftrace_start_up && ref) { 1847 if (ftrace_start_up && ref) {
2161 int failed = __ftrace_replace_code(p, 1); 1848 int failed = __ftrace_replace_code(p, 1);
2162 if (failed) 1849 if (failed) {
2163 ftrace_bug(failed, p->ip); 1850 ftrace_bug(failed, p->ip);
1851 ftrace_free_rec(p);
2164 } 1852 }
2165 } 1853 }
2166 } 1854 }
2167 1855
2168 ftrace_new_pgs = NULL;
2169
2170 stop = ftrace_now(raw_smp_processor_id()); 1856 stop = ftrace_now(raw_smp_processor_id());
2171 ftrace_update_time = stop - start; 1857 ftrace_update_time = stop - start;
2172 ftrace_update_tot_cnt += ftrace_update_cnt; 1858 ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -2174,109 +1860,58 @@ static int ftrace_update_code(struct module *mod)
2174 return 0; 1860 return 0;
2175} 1861}
2176 1862
2177static int ftrace_allocate_records(struct ftrace_page *pg, int count) 1863static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2178{ 1864{
2179 int order; 1865 struct ftrace_page *pg;
2180 int cnt; 1866 int cnt;
1867 int i;
2181 1868
2182 if (WARN_ON(!count)) 1869 /* allocate a few pages */
2183 return -EINVAL; 1870 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
2184 1871 if (!ftrace_pages_start)
2185 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); 1872 return -1;
2186 1873
2187 /* 1874 /*
2188 * We want to fill as much as possible. No more than a page 1875 * Allocate a few more pages.
2189 * may be empty. 1876 *
1877 * TODO: have some parser search vmlinux before
1878 * final linking to find all calls to ftrace.
1879 * Then we can:
1880 * a) know how many pages to allocate.
1881 * and/or
1882 * b) set up the table then.
1883 *
1884 * The dynamic code is still necessary for
1885 * modules.
2190 */ 1886 */
2191 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2192 order--;
2193
2194 again:
2195 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
2196 1887
2197 if (!pg->records) { 1888 pg = ftrace_pages = ftrace_pages_start;
2198 /* if we can't allocate this size, try something smaller */
2199 if (!order)
2200 return -ENOMEM;
2201 order >>= 1;
2202 goto again;
2203 }
2204
2205 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
2206 pg->size = cnt;
2207 1889
2208 if (cnt > count) 1890 cnt = num_to_init / ENTRIES_PER_PAGE;
2209 cnt = count; 1891 pr_info("ftrace: allocating %ld entries in %d pages\n",
2210 1892 num_to_init, cnt + 1);
2211 return cnt;
2212}
2213
2214static struct ftrace_page *
2215ftrace_allocate_pages(unsigned long num_to_init)
2216{
2217 struct ftrace_page *start_pg;
2218 struct ftrace_page *pg;
2219 int order;
2220 int cnt;
2221
2222 if (!num_to_init)
2223 return 0;
2224
2225 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2226 if (!pg)
2227 return NULL;
2228
2229 /*
2230 * Try to allocate as much as possible in one continues
2231 * location that fills in all of the space. We want to
2232 * waste as little space as possible.
2233 */
2234 for (;;) {
2235 cnt = ftrace_allocate_records(pg, num_to_init);
2236 if (cnt < 0)
2237 goto free_pages;
2238 1893
2239 num_to_init -= cnt; 1894 for (i = 0; i < cnt; i++) {
2240 if (!num_to_init) 1895 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
2241 break;
2242 1896
2243 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); 1897 /* If we fail, we'll try later anyway */
2244 if (!pg->next) 1898 if (!pg->next)
2245 goto free_pages; 1899 break;
2246 1900
2247 pg = pg->next; 1901 pg = pg->next;
2248 } 1902 }
2249 1903
2250 return start_pg;
2251
2252 free_pages:
2253 while (start_pg) {
2254 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2255 free_pages((unsigned long)pg->records, order);
2256 start_pg = pg->next;
2257 kfree(pg);
2258 pg = start_pg;
2259 }
2260 pr_info("ftrace: FAILED to allocate memory for functions\n");
2261 return NULL;
2262}
2263
2264static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2265{
2266 int cnt;
2267
2268 if (!num_to_init) {
2269 pr_info("ftrace: No functions to be traced?\n");
2270 return -1;
2271 }
2272
2273 cnt = num_to_init / ENTRIES_PER_PAGE;
2274 pr_info("ftrace: allocating %ld entries in %d pages\n",
2275 num_to_init, cnt + 1);
2276
2277 return 0; 1904 return 0;
2278} 1905}
2279 1906
1907enum {
1908 FTRACE_ITER_FILTER = (1 << 0),
1909 FTRACE_ITER_NOTRACE = (1 << 1),
1910 FTRACE_ITER_PRINTALL = (1 << 2),
1911 FTRACE_ITER_HASH = (1 << 3),
1912 FTRACE_ITER_ENABLED = (1 << 4),
1913};
1914
2280#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1915#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2281 1916
2282struct ftrace_iterator { 1917struct ftrace_iterator {
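
ftrace_allocate_records(), deleted above, asked for one physically contiguous block sized by get_count_order() and retried with a smaller order on failure instead of giving up, while the older scheme kept here simply chains single zeroed pages. A sketch of the removed allocation strategy (the entry type and helper names are illustrative):

#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

struct entry {
	unsigned long ip;
	unsigned long flags;
};

#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct entry))

static struct entry *alloc_entries(int count, int *got)
{
	int order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
	struct entry *records;

	for (;;) {
		records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
						   order);
		if (records)
			break;
		if (!order)
			return NULL;	/* even a single page failed */
		order >>= 1;		/* retry with a smaller block */
	}

	*got = (PAGE_SIZE << order) / sizeof(struct entry);
	if (*got > count)
		*got = count;
	return records;
}
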
@@ -2341,9 +1976,6 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
2341 void *p = NULL; 1976 void *p = NULL;
2342 loff_t l; 1977 loff_t l;
2343 1978
2344 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2345 return NULL;
2346
2347 if (iter->func_pos > *pos) 1979 if (iter->func_pos > *pos)
2348 return NULL; 1980 return NULL;
2349 1981
@@ -2387,7 +2019,7 @@ static void *
2387t_next(struct seq_file *m, void *v, loff_t *pos) 2019t_next(struct seq_file *m, void *v, loff_t *pos)
2388{ 2020{
2389 struct ftrace_iterator *iter = m->private; 2021 struct ftrace_iterator *iter = m->private;
2390 struct ftrace_ops *ops = iter->ops; 2022 struct ftrace_ops *ops = &global_ops;
2391 struct dyn_ftrace *rec = NULL; 2023 struct dyn_ftrace *rec = NULL;
2392 2024
2393 if (unlikely(ftrace_disabled)) 2025 if (unlikely(ftrace_disabled))
@@ -2411,7 +2043,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2411 } 2043 }
2412 } else { 2044 } else {
2413 rec = &iter->pg->records[iter->idx++]; 2045 rec = &iter->pg->records[iter->idx++];
2414 if (((iter->flags & FTRACE_ITER_FILTER) && 2046 if ((rec->flags & FTRACE_FL_FREE) ||
2047
2048 ((iter->flags & FTRACE_ITER_FILTER) &&
2415 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2049 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2416 2050
2417 ((iter->flags & FTRACE_ITER_NOTRACE) && 2051 ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2437,13 +2071,13 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2437{ 2071{
2438 iter->pos = 0; 2072 iter->pos = 0;
2439 iter->func_pos = 0; 2073 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); 2074 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
2441} 2075}
2442 2076
2443static void *t_start(struct seq_file *m, loff_t *pos) 2077static void *t_start(struct seq_file *m, loff_t *pos)
2444{ 2078{
2445 struct ftrace_iterator *iter = m->private; 2079 struct ftrace_iterator *iter = m->private;
2446 struct ftrace_ops *ops = iter->ops; 2080 struct ftrace_ops *ops = &global_ops;
2447 void *p = NULL; 2081 void *p = NULL;
2448 loff_t l; 2082 loff_t l;
2449 2083
@@ -2463,8 +2097,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2463 * off, we can short cut and just print out that all 2097 * off, we can short cut and just print out that all
2464 * functions are enabled. 2098 * functions are enabled.
2465 */ 2099 */
2466 if (iter->flags & FTRACE_ITER_FILTER && 2100 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
2467 ftrace_hash_empty(ops->filter_hash)) {
2468 if (*pos > 0) 2101 if (*pos > 0)
2469 return t_hash_start(m, pos); 2102 return t_hash_start(m, pos);
2470 iter->flags |= FTRACE_ITER_PRINTALL; 2103 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2489,8 +2122,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2489 break; 2122 break;
2490 } 2123 }
2491 2124
2492 if (!p) 2125 if (!p) {
2493 return t_hash_start(m, pos); 2126 if (iter->flags & FTRACE_ITER_FILTER)
2127 return t_hash_start(m, pos);
2128
2129 return NULL;
2130 }
2494 2131
2495 return iter; 2132 return iter;
2496} 2133}
@@ -2520,9 +2157,8 @@ static int t_show(struct seq_file *m, void *v)
2520 2157
2521 seq_printf(m, "%ps", (void *)rec->ip); 2158 seq_printf(m, "%ps", (void *)rec->ip);
2522 if (iter->flags & FTRACE_ITER_ENABLED) 2159 if (iter->flags & FTRACE_ITER_ENABLED)
2523 seq_printf(m, " (%ld)%s", 2160 seq_printf(m, " (%ld)",
2524 rec->flags & ~FTRACE_FL_MASK, 2161 rec->flags & ~FTRACE_FL_MASK);
2525 rec->flags & FTRACE_FL_REGS ? " R" : "");
2526 seq_printf(m, "\n"); 2162 seq_printf(m, "\n");
2527 2163
2528 return 0; 2164 return 0;
@@ -2539,35 +2175,55 @@ static int
2539ftrace_avail_open(struct inode *inode, struct file *file) 2175ftrace_avail_open(struct inode *inode, struct file *file)
2540{ 2176{
2541 struct ftrace_iterator *iter; 2177 struct ftrace_iterator *iter;
2178 int ret;
2542 2179
2543 if (unlikely(ftrace_disabled)) 2180 if (unlikely(ftrace_disabled))
2544 return -ENODEV; 2181 return -ENODEV;
2545 2182
2546 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2183 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2547 if (iter) { 2184 if (!iter)
2548 iter->pg = ftrace_pages_start; 2185 return -ENOMEM;
2549 iter->ops = &global_ops; 2186
2187 iter->pg = ftrace_pages_start;
2188
2189 ret = seq_open(file, &show_ftrace_seq_ops);
2190 if (!ret) {
2191 struct seq_file *m = file->private_data;
2192
2193 m->private = iter;
2194 } else {
2195 kfree(iter);
2550 } 2196 }
2551 2197
2552 return iter ? 0 : -ENOMEM; 2198 return ret;
2553} 2199}
2554 2200
2555static int 2201static int
2556ftrace_enabled_open(struct inode *inode, struct file *file) 2202ftrace_enabled_open(struct inode *inode, struct file *file)
2557{ 2203{
2558 struct ftrace_iterator *iter; 2204 struct ftrace_iterator *iter;
2205 int ret;
2559 2206
2560 if (unlikely(ftrace_disabled)) 2207 if (unlikely(ftrace_disabled))
2561 return -ENODEV; 2208 return -ENODEV;
2562 2209
2563 iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); 2210 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2564 if (iter) { 2211 if (!iter)
2565 iter->pg = ftrace_pages_start; 2212 return -ENOMEM;
2566 iter->flags = FTRACE_ITER_ENABLED; 2213
2567 iter->ops = &global_ops; 2214 iter->pg = ftrace_pages_start;
2215 iter->flags = FTRACE_ITER_ENABLED;
2216
2217 ret = seq_open(file, &show_ftrace_seq_ops);
2218 if (!ret) {
2219 struct seq_file *m = file->private_data;
2220
2221 m->private = iter;
2222 } else {
2223 kfree(iter);
2568 } 2224 }
2569 2225
2570 return iter ? 0 : -ENOMEM; 2226 return ret;
2571} 2227}
2572 2228
2573static void ftrace_filter_reset(struct ftrace_hash *hash) 2229static void ftrace_filter_reset(struct ftrace_hash *hash)
@@ -2577,23 +2233,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2577 mutex_unlock(&ftrace_lock); 2233 mutex_unlock(&ftrace_lock);
2578} 2234}
2579 2235
2580/** 2236static int
2581 * ftrace_regex_open - initialize function tracer filter files
2582 * @ops: The ftrace_ops that hold the hash filters
2583 * @flag: The type of filter to process
2584 * @inode: The inode, usually passed in to your open routine
2585 * @file: The file, usually passed in to your open routine
2586 *
2587 * ftrace_regex_open() initializes the filter files for the
2588 * @ops. Depending on @flag it may process the filter hash or
2589 * the notrace hash of @ops. With this called from the open
2590 * routine, you can use ftrace_filter_write() for the write
2591 * routine if @flag has FTRACE_ITER_FILTER set, or
2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2593 * ftrace_regex_lseek() should be used as the lseek routine, and
2594 * release must call ftrace_regex_release().
2595 */
2596int
2597ftrace_regex_open(struct ftrace_ops *ops, int flag, 2237ftrace_regex_open(struct ftrace_ops *ops, int flag,
2598 struct inode *inode, struct file *file) 2238 struct inode *inode, struct file *file)
2599{ 2239{
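The kernel-doc removed above spells out how ftrace_regex_open() is meant to be wired together with ftrace_filter_write(), ftrace_regex_lseek() and ftrace_regex_release(). A minimal sketch of that wiring, assuming the left-hand (non-static) versions of those helpers; my_ops, my_filter_open and my_filter_fops are illustrative names, not part of this patch:

static struct ftrace_ops my_ops;	/* callback assigned elsewhere; illustrative */

static int my_filter_open(struct inode *inode, struct file *file)
{
	/* FTRACE_ITER_FILTER selects the filter hash of my_ops */
	return ftrace_regex_open(&my_ops, FTRACE_ITER_FILTER, inode, file);
}

static const struct file_operations my_filter_fops = {
	.open    = my_filter_open,
	.read    = seq_read,
	.write   = ftrace_filter_write,
	.llseek  = ftrace_regex_lseek,
	.release = ftrace_regex_release,
};

This mirrors how ftrace.c wires up its own filter files further down; on the right-hand side of this patch the helpers become static, so such external wiring is only possible against the left-hand version.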
@@ -2662,9 +2302,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2662static int 2302static int
2663ftrace_filter_open(struct inode *inode, struct file *file) 2303ftrace_filter_open(struct inode *inode, struct file *file)
2664{ 2304{
2665 return ftrace_regex_open(&global_ops, 2305 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2666 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2306 inode, file);
2667 inode, file);
2668} 2307}
2669 2308
2670static int 2309static int
@@ -2674,13 +2313,13 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2674 inode, file); 2313 inode, file);
2675} 2314}
2676 2315
2677loff_t 2316static loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int whence) 2317ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2679{ 2318{
2680 loff_t ret; 2319 loff_t ret;
2681 2320
2682 if (file->f_mode & FMODE_READ) 2321 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, whence); 2322 ret = seq_lseek(file, offset, origin);
2684 else 2323 else
2685 file->f_pos = ret = 1; 2324 file->f_pos = ret = 1;
2686 2325
@@ -2783,6 +2422,7 @@ match_records(struct ftrace_hash *hash, char *buff,
2783 goto out_unlock; 2422 goto out_unlock;
2784 2423
2785 do_for_each_ftrace_rec(pg, rec) { 2424 do_for_each_ftrace_rec(pg, rec) {
2425
2786 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2426 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2787 ret = enter_record(hash, rec, not); 2427 ret = enter_record(hash, rec, not);
2788 if (ret < 0) { 2428 if (ret < 0) {
@@ -2868,10 +2508,10 @@ static int __init ftrace_mod_cmd_init(void)
2868{ 2508{
2869 return register_ftrace_command(&ftrace_mod_cmd); 2509 return register_ftrace_command(&ftrace_mod_cmd);
2870} 2510}
2871core_initcall(ftrace_mod_cmd_init); 2511device_initcall(ftrace_mod_cmd_init);
2872 2512
2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2513static void
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2514function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
2875{ 2515{
2876 struct ftrace_func_probe *entry; 2516 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2517 struct hlist_head *hhd;
@@ -3227,14 +2867,14 @@ out_unlock:
3227 return ret; 2867 return ret;
3228} 2868}
3229 2869
3230ssize_t 2870static ssize_t
3231ftrace_filter_write(struct file *file, const char __user *ubuf, 2871ftrace_filter_write(struct file *file, const char __user *ubuf,
3232 size_t cnt, loff_t *ppos) 2872 size_t cnt, loff_t *ppos)
3233{ 2873{
3234 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 2874 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
3235} 2875}
3236 2876
3237ssize_t 2877static ssize_t
3238ftrace_notrace_write(struct file *file, const char __user *ubuf, 2878ftrace_notrace_write(struct file *file, const char __user *ubuf,
3239 size_t cnt, loff_t *ppos) 2879 size_t cnt, loff_t *ppos)
3240{ 2880{
@@ -3242,27 +2882,8 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
3242} 2882}
3243 2883
3244static int 2884static int
3245ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) 2885ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3246{ 2886 int reset, int enable)
3247 struct ftrace_func_entry *entry;
3248
3249 if (!ftrace_location(ip))
3250 return -EINVAL;
3251
3252 if (remove) {
3253 entry = ftrace_lookup_ip(hash, ip);
3254 if (!entry)
3255 return -ENOENT;
3256 free_hash_entry(hash, entry);
3257 return 0;
3258 }
3259
3260 return add_hash_entry(hash, ip);
3261}
3262
3263static int
3264ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3265 unsigned long ip, int remove, int reset, int enable)
3266{ 2887{
3267 struct ftrace_hash **orig_hash; 2888 struct ftrace_hash **orig_hash;
3268 struct ftrace_hash *hash; 2889 struct ftrace_hash *hash;
@@ -3287,15 +2908,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3287 mutex_lock(&ftrace_regex_lock); 2908 mutex_lock(&ftrace_regex_lock);
3288 if (reset) 2909 if (reset)
3289 ftrace_filter_reset(hash); 2910 ftrace_filter_reset(hash);
3290 if (buf && !ftrace_match_records(hash, buf, len)) { 2911 if (buf)
3291 ret = -EINVAL; 2912 ftrace_match_records(hash, buf, len);
3292 goto out_regex_unlock;
3293 }
3294 if (ip) {
3295 ret = ftrace_match_addr(hash, ip, remove);
3296 if (ret < 0)
3297 goto out_regex_unlock;
3298 }
3299 2913
3300 mutex_lock(&ftrace_lock); 2914 mutex_lock(&ftrace_lock);
3301 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 2915 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3305,44 +2919,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3305 2919
3306 mutex_unlock(&ftrace_lock); 2920 mutex_unlock(&ftrace_lock);
3307 2921
3308 out_regex_unlock:
3309 mutex_unlock(&ftrace_regex_lock); 2922 mutex_unlock(&ftrace_regex_lock);
3310 2923
3311 free_ftrace_hash(hash); 2924 free_ftrace_hash(hash);
3312 return ret; 2925 return ret;
3313} 2926}
3314 2927
3315static int
3316ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
3317 int reset, int enable)
3318{
3319 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
3320}
3321
3322/**
3323 * ftrace_set_filter_ip - set a function to filter on in ftrace by address
3324 * @ops - the ops to set the filter with
3325 * @ip - the address to add to or remove from the filter.
3326 * @remove - non zero to remove the ip from the filter
3327 * @reset - non zero to reset all filters before applying this filter.
3328 *
3329 * Filters denote which functions should be enabled when tracing is enabled.
3330 * If @ip is NULL, it fails to update the filter.
3331 */
3332int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
3333 int remove, int reset)
3334{
3335 return ftrace_set_addr(ops, ip, remove, reset, 1);
3336}
3337EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
3338
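ftrace_set_filter_ip(), removed by this hunk, filters on a single resolved address instead of a glob. A hedged usage sketch against the left-hand API; resolving the address through kallsyms_lookup_name() is only an illustration and may not be available in every configuration:

static struct ftrace_ops my_ops;	/* illustrative ops, callback assigned elsewhere */

static int __init my_filter_one_ip(void)
{
	unsigned long ip = kallsyms_lookup_name("schedule");

	if (!ip)
		return -ENOENT;

	/* remove = 0: add this address; reset = 1: clear any earlier filters */
	return ftrace_set_filter_ip(&my_ops, ip, 0, 1);
}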
3339static int
3340ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3341 int reset, int enable)
3342{
3343 return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
3344}
3345
3346/** 2928/**
3347 * ftrace_set_filter - set a function to filter on in ftrace 2929 * ftrace_set_filter - set a function to filter on in ftrace
3348 * @ops - the ops to set the filter with 2930 * @ops - the ops to set the filter with
@@ -3353,10 +2935,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3353 * Filters denote which functions should be enabled when tracing is enabled. 2935 * Filters denote which functions should be enabled when tracing is enabled.
3354 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2936 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3355 */ 2937 */
3356int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 2938void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3357 int len, int reset) 2939 int len, int reset)
3358{ 2940{
3359 return ftrace_set_regex(ops, buf, len, reset, 1); 2941 ftrace_set_regex(ops, buf, len, reset, 1);
3360} 2942}
3361EXPORT_SYMBOL_GPL(ftrace_set_filter); 2943EXPORT_SYMBOL_GPL(ftrace_set_filter);
3362 2944
@@ -3371,10 +2953,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3371 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2953 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3372 * for tracing. 2954 * for tracing.
3373 */ 2955 */
3374int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 2956void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3375 int len, int reset) 2957 int len, int reset)
3376{ 2958{
3377 return ftrace_set_regex(ops, buf, len, reset, 0); 2959 ftrace_set_regex(ops, buf, len, reset, 0);
3378} 2960}
3379EXPORT_SYMBOL_GPL(ftrace_set_notrace); 2961EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3380/** 2962/**
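Both sides agree on what ftrace_set_filter() and ftrace_set_notrace() do; only the return type differs (int on the left, void on the right). A small sketch of populating both hashes before registering the ops, assuming the left-hand, int-returning variants; the names and patterns are illustrative:

static struct ftrace_ops my_ops;		/* illustrative */
static unsigned char filter_buf[] = "vfs_*";
static unsigned char notrace_buf[] = "vfs_fstat";

static int __init my_setup_filters(void)
{
	int ret;

	/* reset = 1 drops any previously installed filter entries */
	ret = ftrace_set_filter(&my_ops, filter_buf, sizeof(filter_buf) - 1, 1);
	if (ret)
		return ret;

	return ftrace_set_notrace(&my_ops, notrace_buf, sizeof(notrace_buf) - 1, 0);
}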
@@ -3459,8 +3041,8 @@ static void __init set_ftrace_early_graph(char *buf)
3459} 3041}
3460#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3042#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3461 3043
3462void __init 3044static void __init
3463ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3045set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3464{ 3046{
3465 char *func; 3047 char *func;
3466 3048
@@ -3473,16 +3055,17 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3473static void __init set_ftrace_early_filters(void) 3055static void __init set_ftrace_early_filters(void)
3474{ 3056{
3475 if (ftrace_filter_buf[0]) 3057 if (ftrace_filter_buf[0])
3476 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); 3058 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
3477 if (ftrace_notrace_buf[0]) 3059 if (ftrace_notrace_buf[0])
3478 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3060 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
3479#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3061#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3480 if (ftrace_graph_buf[0]) 3062 if (ftrace_graph_buf[0])
3481 set_ftrace_early_graph(ftrace_graph_buf); 3063 set_ftrace_early_graph(ftrace_graph_buf);
3482#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3064#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3483} 3065}
3484 3066
3485int ftrace_regex_release(struct inode *inode, struct file *file) 3067static int
3068ftrace_regex_release(struct inode *inode, struct file *file)
3486{ 3069{
3487 struct seq_file *m = (struct seq_file *)file->private_data; 3070 struct seq_file *m = (struct seq_file *)file->private_data;
3488 struct ftrace_iterator *iter; 3071 struct ftrace_iterator *iter;
@@ -3683,6 +3266,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3683 3266
3684 do_for_each_ftrace_rec(pg, rec) { 3267 do_for_each_ftrace_rec(pg, rec) {
3685 3268
3269 if (rec->flags & FTRACE_FL_FREE)
3270 continue;
3271
3686 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3272 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3687 /* if it is in the array */ 3273 /* if it is in the array */
3688 exists = false; 3274 exists = false;
@@ -3791,80 +3377,16 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3791 return 0; 3377 return 0;
3792} 3378}
3793 3379
3794static int ftrace_cmp_ips(const void *a, const void *b)
3795{
3796 const unsigned long *ipa = a;
3797 const unsigned long *ipb = b;
3798
3799 if (*ipa > *ipb)
3800 return 1;
3801 if (*ipa < *ipb)
3802 return -1;
3803 return 0;
3804}
3805
3806static void ftrace_swap_ips(void *a, void *b, int size)
3807{
3808 unsigned long *ipa = a;
3809 unsigned long *ipb = b;
3810 unsigned long t;
3811
3812 t = *ipa;
3813 *ipa = *ipb;
3814 *ipb = t;
3815}
3816
3817static int ftrace_process_locs(struct module *mod, 3380static int ftrace_process_locs(struct module *mod,
3818 unsigned long *start, 3381 unsigned long *start,
3819 unsigned long *end) 3382 unsigned long *end)
3820{ 3383{
3821 struct ftrace_page *start_pg;
3822 struct ftrace_page *pg;
3823 struct dyn_ftrace *rec;
3824 unsigned long count;
3825 unsigned long *p; 3384 unsigned long *p;
3826 unsigned long addr; 3385 unsigned long addr;
3827 unsigned long flags = 0; /* Shut up gcc */ 3386 unsigned long flags = 0; /* Shut up gcc */
3828 int ret = -ENOMEM;
3829
3830 count = end - start;
3831
3832 if (!count)
3833 return 0;
3834
3835 sort(start, count, sizeof(*start),
3836 ftrace_cmp_ips, ftrace_swap_ips);
3837
3838 start_pg = ftrace_allocate_pages(count);
3839 if (!start_pg)
3840 return -ENOMEM;
3841 3387
3842 mutex_lock(&ftrace_lock); 3388 mutex_lock(&ftrace_lock);
3843
3844 /*
3845 * Core and each module needs their own pages, as
3846 * modules will free them when they are removed.
3847 * Force a new page to be allocated for modules.
3848 */
3849 if (!mod) {
3850 WARN_ON(ftrace_pages || ftrace_pages_start);
3851 /* First initialization */
3852 ftrace_pages = ftrace_pages_start = start_pg;
3853 } else {
3854 if (!ftrace_pages)
3855 goto out;
3856
3857 if (WARN_ON(ftrace_pages->next)) {
3858 /* Hmm, we have free pages? */
3859 while (ftrace_pages->next)
3860 ftrace_pages = ftrace_pages->next;
3861 }
3862
3863 ftrace_pages->next = start_pg;
3864 }
3865
3866 p = start; 3389 p = start;
3867 pg = start_pg;
3868 while (p < end) { 3390 while (p < end) {
3869 addr = ftrace_call_adjust(*p++); 3391 addr = ftrace_call_adjust(*p++);
3870 /* 3392 /*
@@ -3875,27 +3397,9 @@ static int ftrace_process_locs(struct module *mod,
3875 */ 3397 */
3876 if (!addr) 3398 if (!addr)
3877 continue; 3399 continue;
3878 3400 ftrace_record_ip(addr);
3879 if (pg->index == pg->size) {
3880 /* We should have allocated enough */
3881 if (WARN_ON(!pg->next))
3882 break;
3883 pg = pg->next;
3884 }
3885
3886 rec = &pg->records[pg->index++];
3887 rec->ip = addr;
3888 } 3401 }
3889 3402
3890 /* We should have used all pages */
3891 WARN_ON(pg->next);
3892
3893 /* Assign the last page to ftrace_pages */
3894 ftrace_pages = pg;
3895
3896 /* These new locations need to be initialized */
3897 ftrace_new_pgs = start_pg;
3898
3899 /* 3403 /*
3900 * We only need to disable interrupts on start up 3404 * We only need to disable interrupts on start up
3901 * because we are modifying code that an interrupt 3405 * because we are modifying code that an interrupt
@@ -3909,55 +3413,32 @@ static int ftrace_process_locs(struct module *mod,
3909 ftrace_update_code(mod); 3413 ftrace_update_code(mod);
3910 if (!mod) 3414 if (!mod)
3911 local_irq_restore(flags); 3415 local_irq_restore(flags);
3912 ret = 0;
3913 out:
3914 mutex_unlock(&ftrace_lock); 3416 mutex_unlock(&ftrace_lock);
3915 3417
3916 return ret; 3418 return 0;
3917} 3419}
3918 3420
3919#ifdef CONFIG_MODULES 3421#ifdef CONFIG_MODULES
3920
3921#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3922
3923void ftrace_release_mod(struct module *mod) 3422void ftrace_release_mod(struct module *mod)
3924{ 3423{
3925 struct dyn_ftrace *rec; 3424 struct dyn_ftrace *rec;
3926 struct ftrace_page **last_pg;
3927 struct ftrace_page *pg; 3425 struct ftrace_page *pg;
3928 int order;
3929 3426
3930 mutex_lock(&ftrace_lock); 3427 mutex_lock(&ftrace_lock);
3931 3428
3932 if (ftrace_disabled) 3429 if (ftrace_disabled)
3933 goto out_unlock; 3430 goto out_unlock;
3934 3431
3935 /* 3432 do_for_each_ftrace_rec(pg, rec) {
3936 * Each module has its own ftrace_pages, remove
3937 * them from the list.
3938 */
3939 last_pg = &ftrace_pages_start;
3940 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3941 rec = &pg->records[0];
3942 if (within_module_core(rec->ip, mod)) { 3433 if (within_module_core(rec->ip, mod)) {
3943 /* 3434 /*
3944 * As core pages are first, the first 3435 * rec->ip is changed in ftrace_free_rec()
3945 * page should never be a module page. 3436 * It should not be between s and e if the record was freed.
3946 */ 3437 */
3947 if (WARN_ON(pg == ftrace_pages_start)) 3438 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
3948 goto out_unlock; 3439 ftrace_free_rec(rec);
3949 3440 }
3950 /* Check if we are deleting the last page */ 3441 } while_for_each_ftrace_rec();
3951 if (pg == ftrace_pages)
3952 ftrace_pages = next_to_ftrace_page(last_pg);
3953
3954 *last_pg = pg->next;
3955 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3956 free_pages((unsigned long)pg->records, order);
3957 kfree(pg);
3958 } else
3959 last_pg = &pg->next;
3960 }
3961 out_unlock: 3442 out_unlock:
3962 mutex_unlock(&ftrace_lock); 3443 mutex_unlock(&ftrace_lock);
3963} 3444}
@@ -4047,7 +3528,6 @@ void __init ftrace_init(void)
4047 3528
4048static struct ftrace_ops global_ops = { 3529static struct ftrace_ops global_ops = {
4049 .func = ftrace_stub, 3530 .func = ftrace_stub,
4050 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4051}; 3531};
4052 3532
4053static int __init ftrace_nodyn_init(void) 3533static int __init ftrace_nodyn_init(void)
@@ -4055,7 +3535,7 @@ static int __init ftrace_nodyn_init(void)
4055 ftrace_enabled = 1; 3535 ftrace_enabled = 1;
4056 return 0; 3536 return 0;
4057} 3537}
4058core_initcall(ftrace_nodyn_init); 3538device_initcall(ftrace_nodyn_init);
4059 3539
4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3540static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061static inline void ftrace_startup_enable(int command) { } 3541static inline void ftrace_startup_enable(int command) { }
@@ -4078,44 +3558,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
4078#endif /* CONFIG_DYNAMIC_FTRACE */ 3558#endif /* CONFIG_DYNAMIC_FTRACE */
4079 3559
4080static void 3560static void
4081ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 3561ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
4082 struct ftrace_ops *op, struct pt_regs *regs)
4083{
4084 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
4085 return;
4086
4087 /*
4088 * Some of the ops may be dynamically allocated,
4089 * they must be freed after a synchronize_sched().
4090 */
4091 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list);
4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs);
4098
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace();
4103}
4104
4105static struct ftrace_ops control_ops = {
4106 .func = ftrace_ops_control_func,
4107 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
4108};
4109
4110static inline void
4111__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 3562{
4114 struct ftrace_ops *op; 3563 struct ftrace_ops *op;
4115 3564
4116 if (function_trace_stop)
4117 return;
4118
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 3565 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
4120 return; 3566 return;
4121 3567
@@ -4128,39 +3574,13 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4128 op = rcu_dereference_raw(ftrace_ops_list); 3574 op = rcu_dereference_raw(ftrace_ops_list);
4129 while (op != &ftrace_list_end) { 3575 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 3576 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 3577 op->func(ip, parent_ip);
4132 op = rcu_dereference_raw(op->next); 3578 op = rcu_dereference_raw(op->next);
4133 }; 3579 };
4134 preempt_enable_notrace(); 3580 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 3581 trace_recursion_clear(TRACE_INTERNAL_BIT);
4136} 3582}
4137 3583
4138/*
4139 * Some archs only support passing ip and parent_ip. Even though
4140 * the list function ignores the op parameter, we do not want any
4141 * C side effects, where a function is called without the caller
4142 * sending a third parameter.
4143 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs.
4145 * If callbacks want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
4147 * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects a full pt_regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still
4149 * set ARCH_SUPPORTS_FTRACE_OPS.
4150 */
4151#if ARCH_SUPPORTS_FTRACE_OPS
4152static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4153 struct ftrace_ops *op, struct pt_regs *regs)
4154{
4155 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
4156}
4157#else
4158static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4159{
4160 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
4161}
4162#endif
4163
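The comment removed above describes the two callback conventions: architectures with ARCH_SUPPORTS_FTRACE_OPS hand the callback (ip, parent_ip, op, regs), others only (ip, parent_ip), and regs may be NULL unless full register saving is supported. A minimal sketch of a callback written for the four-argument convention; my_callback is illustrative only:

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
{
	/* regs may be NULL unless the arch saves a full pt_regs (see above) */
	trace_printk("hit %pS called from %pS\n",
		     (void *)ip, (void *)parent_ip);
}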
4164static void clear_ftrace_swapper(void) 3584static void clear_ftrace_swapper(void)
4165{ 3585{
4166 struct task_struct *p; 3586 struct task_struct *p;
@@ -4381,7 +3801,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4381 if (strlen(tmp) == 0) 3801 if (strlen(tmp) == 0)
4382 return 1; 3802 return 1;
4383 3803
4384 ret = kstrtol(tmp, 10, &val); 3804 ret = strict_strtol(tmp, 10, &val);
4385 if (ret < 0) 3805 if (ret < 0)
4386 return ret; 3806 return ret;
4387 3807
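For context, this is the handler behind the set_ftrace_pid control file; the left-hand side only switches the string-to-long conversion from strict_strtol() to kstrtol(), which has the same calling convention (string, base, result pointer, 0 or negative errno returned). A minimal sketch of the conversion on its own:

static long my_parse_pid(const char *buf)
{
	long pid;
	int ret;

	/* kstrtol() and strict_strtol() both return 0 on success or a negative errno */
	ret = kstrtol(buf, 10, &pid);
	if (ret < 0)
		return ret;
	return pid;
}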
@@ -4441,14 +3861,6 @@ void ftrace_kill(void)
4441} 3861}
4442 3862
4443/** 3863/**
4444 * Test if ftrace is dead or not.
4445 */
4446int ftrace_is_dead(void)
4447{
4448 return ftrace_disabled;
4449}
4450
4451/**
4452 * register_ftrace_function - register a function for profiling 3864 * register_ftrace_function - register a function for profiling
4453 * @ops - ops structure that holds the function for profiling. 3865 * @ops - ops structure that holds the function for profiling.
4454 * 3866 *
@@ -4465,12 +3877,16 @@ int register_ftrace_function(struct ftrace_ops *ops)
4465 3877
4466 mutex_lock(&ftrace_lock); 3878 mutex_lock(&ftrace_lock);
4467 3879
3880 if (unlikely(ftrace_disabled))
3881 goto out_unlock;
3882
4468 ret = __register_ftrace_function(ops); 3883 ret = __register_ftrace_function(ops);
4469 if (!ret) 3884 if (!ret)
4470 ret = ftrace_startup(ops, 0); 3885 ret = ftrace_startup(ops, 0);
4471 3886
4472 mutex_unlock(&ftrace_lock);
4473 3887
3888 out_unlock:
3889 mutex_unlock(&ftrace_lock);
4474 return ret; 3890 return ret;
4475} 3891}
4476EXPORT_SYMBOL_GPL(register_ftrace_function); 3892EXPORT_SYMBOL_GPL(register_ftrace_function);
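register_ftrace_function() is the entry point both versions keep: it attaches the callback in @ops to every traced function allowed by the ops' filter hashes, and unregister_ftrace_function() detaches it. A hedged end-to-end sketch against the left-hand callback convention; all names here are illustrative:

static void notrace my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
{
	trace_printk("hit %pS\n", (void *)ip);
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_tracer_init(void)
{
	/* optionally narrow the scope first with ftrace_set_filter() */
	return register_ftrace_function(&my_ops);
}

static void __exit my_tracer_exit(void)
{
	unregister_ftrace_function(&my_ops);
}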
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedc..731201bf4ac 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -23,8 +23,6 @@
23#include <asm/local.h> 23#include <asm/local.h>
24#include "trace.h" 24#include "trace.h"
25 25
26static void update_pages_handler(struct work_struct *work);
27
28/* 26/*
29 * The ring buffer header is special. We must manually up keep it. 27 * The ring buffer header is special. We must manually up keep it.
30 */ 28 */
@@ -156,12 +154,35 @@ enum {
156 154
157static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
158 156
159/* Used for individual buffers (after the counter) */
160#define RB_BUFFER_OFF (1 << 20)
161
162#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) 157#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
163 158
164/** 159/**
160 * tracing_on - enable all tracing buffers
161 *
162 * This function enables all tracing buffers that may have been
163 * disabled with tracing_off.
164 */
165void tracing_on(void)
166{
167 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
168}
169EXPORT_SYMBOL_GPL(tracing_on);
170
171/**
172 * tracing_off - turn off all tracing buffers
173 *
174 * This function stops all tracing buffers from recording data.
175 * It does not disable any overhead the tracers themselves may
176 * be causing. This function simply causes all recording to
177 * the ring buffers to fail.
178 */
179void tracing_off(void)
180{
181 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
182}
183EXPORT_SYMBOL_GPL(tracing_off);
184
185/**
165 * tracing_off_permanent - permanently disable ring buffers 186 * tracing_off_permanent - permanently disable ring buffers
166 * 187 *
167 * This function, once called, will disable all ring buffers 188 * This function, once called, will disable all ring buffers
@@ -172,6 +193,15 @@ void tracing_off_permanent(void)
172 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 193 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
173} 194}
174 195
196/**
197 * tracing_is_on - show state of ring buffers enabled
198 */
199int tracing_is_on(void)
200{
201 return ring_buffer_flags == RB_BUFFERS_ON;
202}
203EXPORT_SYMBOL_GPL(tracing_is_on);
204
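tracing_on(), tracing_off() and tracing_is_on(), re-added on the right-hand side, support flight-recorder style debugging: leave tracing running and freeze the ring buffers the instant the interesting condition fires. A minimal sketch of that pattern; my_dev_broken() is a hypothetical check standing in for whatever error condition matters:

static void my_check_and_freeze(void)
{
	if (unlikely(my_dev_broken())) {
		tracing_off();	/* stop all recording, keep the buffers for post-mortem */
		pr_info("tracing %s\n",
			tracing_is_on() ? "still enabled" : "frozen, buffers preserved");
	}
}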
175#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
176#define RB_ALIGNMENT 4U 206#define RB_ALIGNMENT 4U
177#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -448,10 +478,9 @@ struct ring_buffer_per_cpu {
448 int cpu; 478 int cpu;
449 atomic_t record_disabled; 479 atomic_t record_disabled;
450 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
451 raw_spinlock_t reader_lock; /* serialize readers */ 481 spinlock_t reader_lock; /* serialize readers */
452 arch_spinlock_t lock; 482 arch_spinlock_t lock;
453 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
454 unsigned int nr_pages;
455 struct list_head *pages; 484 struct list_head *pages;
456 struct buffer_page *head_page; /* read from head */ 485 struct buffer_page *head_page; /* read from head */
457 struct buffer_page *tail_page; /* write to tail */ 486 struct buffer_page *tail_page; /* write to tail */
@@ -459,29 +488,21 @@ struct ring_buffer_per_cpu {
459 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
460 unsigned long lost_events; 489 unsigned long lost_events;
461 unsigned long last_overrun; 490 unsigned long last_overrun;
462 local_t entries_bytes;
463 local_t entries;
464 local_t overrun;
465 local_t commit_overrun; 491 local_t commit_overrun;
466 local_t dropped_events; 492 local_t overrun;
493 local_t entries;
467 local_t committing; 494 local_t committing;
468 local_t commits; 495 local_t commits;
469 unsigned long read; 496 unsigned long read;
470 unsigned long read_bytes;
471 u64 write_stamp; 497 u64 write_stamp;
472 u64 read_stamp; 498 u64 read_stamp;
473 /* ring buffer pages to update, > 0 to add, < 0 to remove */
474 int nr_pages_to_update;
475 struct list_head new_pages; /* new pages to add */
476 struct work_struct update_pages_work;
477 struct completion update_done;
478}; 499};
479 500
480struct ring_buffer { 501struct ring_buffer {
502 unsigned pages;
481 unsigned flags; 503 unsigned flags;
482 int cpus; 504 int cpus;
483 atomic_t record_disabled; 505 atomic_t record_disabled;
484 atomic_t resize_disabled;
485 cpumask_var_t cpumask; 506 cpumask_var_t cpumask;
486 507
487 struct lock_class_key *reader_lock_key; 508 struct lock_class_key *reader_lock_key;
@@ -946,10 +967,6 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
946 struct list_head *head = cpu_buffer->pages; 967 struct list_head *head = cpu_buffer->pages;
947 struct buffer_page *bpage, *tmp; 968 struct buffer_page *bpage, *tmp;
948 969
949 /* Reset the head page if it exists */
950 if (cpu_buffer->head_page)
951 rb_set_head_page(cpu_buffer);
952
953 rb_head_page_deactivate(cpu_buffer); 970 rb_head_page_deactivate(cpu_buffer);
954 971
955 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 972 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
@@ -976,10 +993,14 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
976 return 0; 993 return 0;
977} 994}
978 995
979static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) 996static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages)
980{ 998{
981 int i;
982 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 LIST_HEAD(pages);
1001 unsigned i;
1002
1003 WARN_ON(!nr_pages);
983 1004
984 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
985 struct page *page; 1006 struct page *page;
@@ -990,13 +1011,15 @@ static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
990 */ 1011 */
991 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
992 GFP_KERNEL | __GFP_NORETRY, 1013 GFP_KERNEL | __GFP_NORETRY,
993 cpu_to_node(cpu)); 1014 cpu_to_node(cpu_buffer->cpu));
994 if (!bpage) 1015 if (!bpage)
995 goto free_pages; 1016 goto free_pages;
996 1017
997 list_add(&bpage->list, pages); 1018 rb_check_bpage(cpu_buffer, bpage);
998 1019
999 page = alloc_pages_node(cpu_to_node(cpu), 1020 list_add(&bpage->list, &pages);
1021
1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1000 GFP_KERNEL | __GFP_NORETRY, 0); 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1001 if (!page) 1024 if (!page)
1002 goto free_pages; 1025 goto free_pages;
@@ -1004,27 +1027,6 @@ static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
1004 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1005 } 1028 }
1006 1029
1007 return 0;
1008
1009free_pages:
1010 list_for_each_entry_safe(bpage, tmp, pages, list) {
1011 list_del_init(&bpage->list);
1012 free_buffer_page(bpage);
1013 }
1014
1015 return -ENOMEM;
1016}
1017
1018static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1019 unsigned nr_pages)
1020{
1021 LIST_HEAD(pages);
1022
1023 WARN_ON(!nr_pages);
1024
1025 if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
1026 return -ENOMEM;
1027
1028 /* 1030 /*
1029 * The ring buffer page list is a circular list that does not 1031 * The ring buffer page list is a circular list that does not
1030 * start and end with a list head. All page list items point to 1032 * start and end with a list head. All page list items point to
@@ -1033,15 +1035,20 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1033 cpu_buffer->pages = pages.next; 1035 cpu_buffer->pages = pages.next;
1034 list_del(&pages); 1036 list_del(&pages);
1035 1037
1036 cpu_buffer->nr_pages = nr_pages;
1037
1038 rb_check_pages(cpu_buffer); 1038 rb_check_pages(cpu_buffer);
1039 1039
1040 return 0; 1040 return 0;
1041
1042 free_pages:
1043 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1044 list_del_init(&bpage->list);
1045 free_buffer_page(bpage);
1046 }
1047 return -ENOMEM;
1041} 1048}
1042 1049
1043static struct ring_buffer_per_cpu * 1050static struct ring_buffer_per_cpu *
1044rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) 1051rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
@@ -1055,11 +1062,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1055 1062
1056 cpu_buffer->cpu = cpu; 1063 cpu_buffer->cpu = cpu;
1057 cpu_buffer->buffer = buffer; 1064 cpu_buffer->buffer = buffer;
1058 raw_spin_lock_init(&cpu_buffer->reader_lock); 1065 spin_lock_init(&cpu_buffer->reader_lock);
1059 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1060 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1061 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1062 init_completion(&cpu_buffer->update_done);
1063 1068
1064 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1069 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1065 GFP_KERNEL, cpu_to_node(cpu)); 1070 GFP_KERNEL, cpu_to_node(cpu));
@@ -1076,9 +1081,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1076 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1077 1082
1078 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1079 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1080 1084
1081 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1085 ret = rb_allocate_pages(cpu_buffer, buffer->pages);
1082 if (ret < 0) 1086 if (ret < 0)
1083 goto fail_free_reader; 1087 goto fail_free_reader;
1084 1088
@@ -1139,7 +1143,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1139{ 1143{
1140 struct ring_buffer *buffer; 1144 struct ring_buffer *buffer;
1141 int bsize; 1145 int bsize;
1142 int cpu, nr_pages; 1146 int cpu;
1143 1147
1144 /* keep it in its own cache line */ 1148 /* keep it in its own cache line */
1145 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1149 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1150,14 +1154,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1150 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) 1154 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
1151 goto fail_free_buffer; 1155 goto fail_free_buffer;
1152 1156
1153 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1157 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1154 buffer->flags = flags; 1158 buffer->flags = flags;
1155 buffer->clock = trace_clock_local; 1159 buffer->clock = trace_clock_local;
1156 buffer->reader_lock_key = key; 1160 buffer->reader_lock_key = key;
1157 1161
1158 /* need at least two pages */ 1162 /* need at least two pages */
1159 if (nr_pages < 2) 1163 if (buffer->pages < 2)
1160 nr_pages = 2; 1164 buffer->pages = 2;
1161 1165
1162 /* 1166 /*
1163 * In case of non-hotplug cpu, if the ring-buffer is allocated 1167 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -1180,7 +1184,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1180 1184
1181 for_each_buffer_cpu(buffer, cpu) { 1185 for_each_buffer_cpu(buffer, cpu) {
1182 buffer->buffers[cpu] = 1186 buffer->buffers[cpu] =
1183 rb_allocate_cpu_buffer(buffer, nr_pages, cpu); 1187 rb_allocate_cpu_buffer(buffer, cpu);
1184 if (!buffer->buffers[cpu]) 1188 if (!buffer->buffers[cpu])
1185 goto fail_free_buffers; 1189 goto fail_free_buffers;
1186 } 1190 }
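__ring_buffer_alloc() is normally reached through the ring_buffer_alloc(size, flags) wrapper, which supplies the lockdep key; both sides of this hunk keep that entry point. A hedged sketch of allocating, writing to and freeing a standalone buffer; the size and payload are illustrative:

static int my_rb_smoke_test(void)
{
	struct ring_buffer *rb;
	u64 payload = 42;

	rb = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);	/* roughly 1MB per CPU, overwrite mode */
	if (!rb)
		return -ENOMEM;

	/* write one small event on the current CPU, then tear the buffer down */
	ring_buffer_write(rb, sizeof(payload), &payload);
	ring_buffer_free(rb);
	return 0;
}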
@@ -1248,223 +1252,58 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1248 1252
1249static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1253static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1250 1254
1251static inline unsigned long rb_page_entries(struct buffer_page *bpage) 1255static void
1252{ 1256rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1253 return local_read(&bpage->entries) & RB_WRITE_MASK;
1254}
1255
1256static inline unsigned long rb_page_write(struct buffer_page *bpage)
1257{
1258 return local_read(&bpage->write) & RB_WRITE_MASK;
1259}
1260
1261static int
1262rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1263{ 1257{
1264 struct list_head *tail_page, *to_remove, *next_page; 1258 struct buffer_page *bpage;
1265 struct buffer_page *to_remove_page, *tmp_iter_page; 1259 struct list_head *p;
1266 struct buffer_page *last_page, *first_page; 1260 unsigned i;
1267 unsigned int nr_removed;
1268 unsigned long head_bit;
1269 int page_entries;
1270
1271 head_bit = 0;
1272
1273 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1274 atomic_inc(&cpu_buffer->record_disabled);
1275 /*
1276 * We don't race with the readers since we have acquired the reader
1277 * lock. We also don't race with writers after disabling recording.
1278 * This makes it easy to figure out the first and the last page to be
1279 * removed from the list. We unlink all the pages in between including
1280 * the first and last pages. This is done in a busy loop so that we
1281 * lose the least number of traces.
1282 * The pages are freed after we restart recording and unlock readers.
1283 */
1284 tail_page = &cpu_buffer->tail_page->list;
1285
1286 /*
1287 * tail page might be on reader page, we remove the next page
1288 * from the ring buffer
1289 */
1290 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
1291 tail_page = rb_list_head(tail_page->next);
1292 to_remove = tail_page;
1293 1261
1294 /* start of pages to remove */ 1262 spin_lock_irq(&cpu_buffer->reader_lock);
1295 first_page = list_entry(rb_list_head(to_remove->next), 1263 rb_head_page_deactivate(cpu_buffer);
1296 struct buffer_page, list);
1297 1264
1298 for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { 1265 for (i = 0; i < nr_pages; i++) {
1299 to_remove = rb_list_head(to_remove)->next; 1266 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1300 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; 1267 goto out;
1268 p = cpu_buffer->pages->next;
1269 bpage = list_entry(p, struct buffer_page, list);
1270 list_del_init(&bpage->list);
1271 free_buffer_page(bpage);
1301 } 1272 }
1273 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1274 goto out;
1302 1275
1303 next_page = rb_list_head(to_remove)->next; 1276 rb_reset_cpu(cpu_buffer);
1304 1277 rb_check_pages(cpu_buffer);
1305 /*
1306 * Now we remove all pages between tail_page and next_page.
1307 * Make sure that we have head_bit value preserved for the
1308 * next page
1309 */
1310 tail_page->next = (struct list_head *)((unsigned long)next_page |
1311 head_bit);
1312 next_page = rb_list_head(next_page);
1313 next_page->prev = tail_page;
1314
1315 /* make sure pages points to a valid page in the ring buffer */
1316 cpu_buffer->pages = next_page;
1317
1318 /* update head page */
1319 if (head_bit)
1320 cpu_buffer->head_page = list_entry(next_page,
1321 struct buffer_page, list);
1322
1323 /*
1324 * change read pointer to make sure any read iterators reset
1325 * themselves
1326 */
1327 cpu_buffer->read = 0;
1328
1329 /* pages are removed, resume tracing and then free the pages */
1330 atomic_dec(&cpu_buffer->record_disabled);
1331 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1332
1333 RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
1334
1335 /* last buffer page to remove */
1336 last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
1337 list);
1338 tmp_iter_page = first_page;
1339
1340 do {
1341 to_remove_page = tmp_iter_page;
1342 rb_inc_page(cpu_buffer, &tmp_iter_page);
1343
1344 /* update the counters */
1345 page_entries = rb_page_entries(to_remove_page);
1346 if (page_entries) {
1347 /*
1348 * If something was added to this page, it was full
1349 * since it is not the tail page. So we deduct the
1350 * bytes consumed in ring buffer from here.
1351 * Increment overrun to account for the lost events.
1352 */
1353 local_add(page_entries, &cpu_buffer->overrun);
1354 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1355 }
1356
1357 /*
1358 * We have already removed references to this list item, just
1359 * free up the buffer_page and its page
1360 */
1361 free_buffer_page(to_remove_page);
1362 nr_removed--;
1363
1364 } while (to_remove_page != last_page);
1365
1366 RB_WARN_ON(cpu_buffer, nr_removed);
1367 1278
1368 return nr_removed == 0; 1279out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock);
1369} 1281}
1370 1282
1371static int 1283static void
1372rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) 1284rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1285 struct list_head *pages, unsigned nr_pages)
1373{ 1286{
1374 struct list_head *pages = &cpu_buffer->new_pages; 1287 struct buffer_page *bpage;
1375 int retries, success; 1288 struct list_head *p;
1376 1289 unsigned i;
1377 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1378 /*
1379 * We are holding the reader lock, so the reader page won't be swapped
1380 * in the ring buffer. Now we are racing with the writer trying to
1381 * move head page and the tail page.
1382 * We are going to adapt the reader page update process where:
1383 * 1. We first splice the start and end of list of new pages between
1384 * the head page and its previous page.
1385 * 2. We cmpxchg the prev_page->next to point from head page to the
1386 * start of new pages list.
1387 * 3. Finally, we update the head->prev to the end of new list.
1388 *
1389 * We will try this process 10 times, to make sure that we don't keep
1390 * spinning.
1391 */
1392 retries = 10;
1393 success = 0;
1394 while (retries--) {
1395 struct list_head *head_page, *prev_page, *r;
1396 struct list_head *last_page, *first_page;
1397 struct list_head *head_page_with_bit;
1398
1399 head_page = &rb_set_head_page(cpu_buffer)->list;
1400 if (!head_page)
1401 break;
1402 prev_page = head_page->prev;
1403
1404 first_page = pages->next;
1405 last_page = pages->prev;
1406
1407 head_page_with_bit = (struct list_head *)
1408 ((unsigned long)head_page | RB_PAGE_HEAD);
1409
1410 last_page->next = head_page_with_bit;
1411 first_page->prev = prev_page;
1412
1413 r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
1414 1290
1415 if (r == head_page_with_bit) { 1291 spin_lock_irq(&cpu_buffer->reader_lock);
1416 /* 1292 rb_head_page_deactivate(cpu_buffer);
1417 * yay, we replaced the page pointer to our new list,
1418 * now, we just have to update to head page's prev
1419 * pointer to point to end of list
1420 */
1421 head_page->prev = last_page;
1422 success = 1;
1423 break;
1424 }
1425 }
1426 1293
1427 if (success) 1294 for (i = 0; i < nr_pages; i++) {
1428 INIT_LIST_HEAD(pages); 1295 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1429 /* 1296 goto out;
1430 * If we weren't successful in adding in new pages, warn and stop 1297 p = pages->next;
1431 * tracing 1298 bpage = list_entry(p, struct buffer_page, list);
1432 */ 1299 list_del_init(&bpage->list);
1433 RB_WARN_ON(cpu_buffer, !success); 1300 list_add_tail(&bpage->list, cpu_buffer->pages);
1434 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1435
1436 /* free pages if they weren't inserted */
1437 if (!success) {
1438 struct buffer_page *bpage, *tmp;
1439 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
1440 list) {
1441 list_del_init(&bpage->list);
1442 free_buffer_page(bpage);
1443 }
1444 } 1301 }
1445 return success; 1302 rb_reset_cpu(cpu_buffer);
1446} 1303 rb_check_pages(cpu_buffer);
1447
1448static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
1449{
1450 int success;
1451
1452 if (cpu_buffer->nr_pages_to_update > 0)
1453 success = rb_insert_pages(cpu_buffer);
1454 else
1455 success = rb_remove_pages(cpu_buffer,
1456 -cpu_buffer->nr_pages_to_update);
1457
1458 if (success)
1459 cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
1460}
1461 1304
1462static void update_pages_handler(struct work_struct *work) 1305out:
1463{ 1306 spin_unlock_irq(&cpu_buffer->reader_lock);
1464 struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
1465 struct ring_buffer_per_cpu, update_pages_work);
1466 rb_update_pages(cpu_buffer);
1467 complete(&cpu_buffer->update_done);
1468} 1307}
1469 1308
1470/** 1309/**
@@ -1474,14 +1313,16 @@ static void update_pages_handler(struct work_struct *work)
1474 * 1313 *
1475 * Minimum size is 2 * BUF_PAGE_SIZE. 1314 * Minimum size is 2 * BUF_PAGE_SIZE.
1476 * 1315 *
1477 * Returns 0 on success and < 0 on failure. 1316 * Returns -1 on failure.
1478 */ 1317 */
1479int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, 1318int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1480 int cpu_id)
1481{ 1319{
1482 struct ring_buffer_per_cpu *cpu_buffer; 1320 struct ring_buffer_per_cpu *cpu_buffer;
1483 unsigned nr_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1484 int cpu, err = 0; 1322 struct buffer_page *bpage, *tmp;
1323 unsigned long buffer_size;
1324 LIST_HEAD(pages);
1325 int i, cpu;
1485 1326
1486 /* 1327 /*
1487 * Always succeed at resizing a non-existent buffer: 1328 * Always succeed at resizing a non-existent buffer:
@@ -1489,165 +1330,115 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1489 if (!buffer) 1330 if (!buffer)
1490 return size; 1331 return size;
1491 1332
1492 /* Make sure the requested buffer exists */
1493 if (cpu_id != RING_BUFFER_ALL_CPUS &&
1494 !cpumask_test_cpu(cpu_id, buffer->cpumask))
1495 return size;
1496
1497 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1333 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1498 size *= BUF_PAGE_SIZE; 1334 size *= BUF_PAGE_SIZE;
1335 buffer_size = buffer->pages * BUF_PAGE_SIZE;
1499 1336
1500 /* we need a minimum of two pages */ 1337 /* we need a minimum of two pages */
1501 if (size < BUF_PAGE_SIZE * 2) 1338 if (size < BUF_PAGE_SIZE * 2)
1502 size = BUF_PAGE_SIZE * 2; 1339 size = BUF_PAGE_SIZE * 2;
1503 1340
1504 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 1341 if (size == buffer_size)
1342 return size;
1505 1343
1506 /* 1344 atomic_inc(&buffer->record_disabled);
1507 * Don't succeed if resizing is disabled, as a reader might be 1345
1508 * manipulating the ring buffer and is expecting a sane state while 1346 /* Make sure all writers are done with this buffer. */
1509 * this is true. 1347 synchronize_sched();
1510 */
1511 if (atomic_read(&buffer->resize_disabled))
1512 return -EBUSY;
1513 1348
1514 /* prevent another thread from changing buffer sizes */
1515 mutex_lock(&buffer->mutex); 1349 mutex_lock(&buffer->mutex);
1350 get_online_cpus();
1516 1351
1517 if (cpu_id == RING_BUFFER_ALL_CPUS) { 1352 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
1518 /* calculate the pages to update */
1519 for_each_buffer_cpu(buffer, cpu) {
1520 cpu_buffer = buffer->buffers[cpu];
1521 1353
1522 cpu_buffer->nr_pages_to_update = nr_pages - 1354 if (size < buffer_size) {
1523 cpu_buffer->nr_pages;
1524 /*
1525 * nothing more to do for removing pages or no update
1526 */
1527 if (cpu_buffer->nr_pages_to_update <= 0)
1528 continue;
1529 /*
1530 * to add pages, make sure all new pages can be
1531 * allocated without receiving ENOMEM
1532 */
1533 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1534 if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
1535 &cpu_buffer->new_pages, cpu)) {
1536 /* not enough memory for new pages */
1537 err = -ENOMEM;
1538 goto out_err;
1539 }
1540 }
1541 1355
1542 get_online_cpus(); 1356 /* easy case, just free pages */
1543 /* 1357 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
1544 * Fire off all the required work handlers 1358 goto out_fail;
1545 * We can't schedule on offline CPUs, but it's not necessary 1359
1546 * since we can change their buffer sizes without any race. 1360 rm_pages = buffer->pages - nr_pages;
1547 */
1548 for_each_buffer_cpu(buffer, cpu) {
1549 cpu_buffer = buffer->buffers[cpu];
1550 if (!cpu_buffer->nr_pages_to_update)
1551 continue;
1552
1553 if (cpu_online(cpu))
1554 schedule_work_on(cpu,
1555 &cpu_buffer->update_pages_work);
1556 else
1557 rb_update_pages(cpu_buffer);
1558 }
1559 1361
1560 /* wait for all the updates to complete */
1561 for_each_buffer_cpu(buffer, cpu) { 1362 for_each_buffer_cpu(buffer, cpu) {
1562 cpu_buffer = buffer->buffers[cpu]; 1363 cpu_buffer = buffer->buffers[cpu];
1563 if (!cpu_buffer->nr_pages_to_update) 1364 rb_remove_pages(cpu_buffer, rm_pages);
1564 continue;
1565
1566 if (cpu_online(cpu))
1567 wait_for_completion(&cpu_buffer->update_done);
1568 cpu_buffer->nr_pages_to_update = 0;
1569 } 1365 }
1366 goto out;
1367 }
1570 1368
1571 put_online_cpus(); 1369 /*
1572 } else { 1370 * This is a bit more difficult. We only want to add pages
1573 /* Make sure this CPU has been initialized */ 1371 * when we can allocate enough for all CPUs. We do this
1574 if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) 1372 * by allocating all the pages and storing them on a local
1575 goto out; 1373 * linked list. If we succeed in our allocation, then we
1576 1374 * add these pages to the cpu_buffers. Otherwise we just free
1577 cpu_buffer = buffer->buffers[cpu_id]; 1375 * them all and return -ENOMEM;
1578 1376 */
1579 if (nr_pages == cpu_buffer->nr_pages) 1377 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
1580 goto out; 1378 goto out_fail;
1581 1379
1582 cpu_buffer->nr_pages_to_update = nr_pages - 1380 new_pages = nr_pages - buffer->pages;
1583 cpu_buffer->nr_pages;
1584 1381
1585 INIT_LIST_HEAD(&cpu_buffer->new_pages); 1382 for_each_buffer_cpu(buffer, cpu) {
1586 if (cpu_buffer->nr_pages_to_update > 0 && 1383 for (i = 0; i < new_pages; i++) {
1587 __rb_allocate_pages(cpu_buffer->nr_pages_to_update, 1384 struct page *page;
1588 &cpu_buffer->new_pages, cpu_id)) { 1385 /*
1589 err = -ENOMEM; 1386 * __GFP_NORETRY flag makes sure that the allocation
1590 goto out_err; 1387 * fails gracefully without invoking oom-killer and
1388 * the system is not destabilized.
1389 */
1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1391 cache_line_size()),
1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1394 if (!bpage)
1395 goto free_pages;
1396 list_add(&bpage->list, &pages);
1397 page = alloc_pages_node(cpu_to_node(cpu),
1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1400 goto free_pages;
1401 bpage->page = page_address(page);
1402 rb_init_page(bpage->page);
1591 } 1403 }
1592
1593 get_online_cpus();
1594
1595 if (cpu_online(cpu_id)) {
1596 schedule_work_on(cpu_id,
1597 &cpu_buffer->update_pages_work);
1598 wait_for_completion(&cpu_buffer->update_done);
1599 } else
1600 rb_update_pages(cpu_buffer);
1601
1602 cpu_buffer->nr_pages_to_update = 0;
1603 put_online_cpus();
1604 } 1404 }
1605 1405
1606 out: 1406 for_each_buffer_cpu(buffer, cpu) {
1607 /* 1407 cpu_buffer = buffer->buffers[cpu];
1608 * The ring buffer resize can happen with the ring buffer 1408 rb_insert_pages(cpu_buffer, &pages, new_pages);
1609 * enabled, so that the update disturbs the tracing as little
1610 * as possible. But if the buffer is disabled, we do not need
1611 * to worry about that, and we can take the time to verify
1612 * that the buffer is not corrupt.
1613 */
1614 if (atomic_read(&buffer->record_disabled)) {
1615 atomic_inc(&buffer->record_disabled);
1616 /*
1617 * Even though the buffer was disabled, we must make sure
1618 * that it is truly disabled before calling rb_check_pages.
1619 * There could have been a race between checking
1620 * record_disable and incrementing it.
1621 */
1622 synchronize_sched();
1623 for_each_buffer_cpu(buffer, cpu) {
1624 cpu_buffer = buffer->buffers[cpu];
1625 rb_check_pages(cpu_buffer);
1626 }
1627 atomic_dec(&buffer->record_disabled);
1628 } 1409 }
1629 1410
1630 mutex_unlock(&buffer->mutex); 1411 if (RB_WARN_ON(buffer, !list_empty(&pages)))
1631 return size; 1412 goto out_fail;
1632 1413
1633 out_err: 1414 out:
1634 for_each_buffer_cpu(buffer, cpu) { 1415 buffer->pages = nr_pages;
1635 struct buffer_page *bpage, *tmp; 1416 put_online_cpus();
1417 mutex_unlock(&buffer->mutex);
1636 1418
1637 cpu_buffer = buffer->buffers[cpu]; 1419 atomic_dec(&buffer->record_disabled);
1638 cpu_buffer->nr_pages_to_update = 0;
1639 1420
1640 if (list_empty(&cpu_buffer->new_pages)) 1421 return size;
1641 continue;
1642 1422
1643 list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, 1423 free_pages:
1644 list) { 1424 list_for_each_entry_safe(bpage, tmp, &pages, list) {
1645 list_del_init(&bpage->list); 1425 list_del_init(&bpage->list);
1646 free_buffer_page(bpage); 1426 free_buffer_page(bpage);
1647 }
1648 } 1427 }
1428 put_online_cpus();
1649 mutex_unlock(&buffer->mutex); 1429 mutex_unlock(&buffer->mutex);
1650 return err; 1430 atomic_dec(&buffer->record_disabled);
1431 return -ENOMEM;
1432
1433 /*
1434 * Something went totally wrong, and we are too paranoid
1435 * to even clean up the mess.
1436 */
1437 out_fail:
1438 put_online_cpus();
1439 mutex_unlock(&buffer->mutex);
1440 atomic_dec(&buffer->record_disabled);
1441 return -1;
1651} 1442}
1652EXPORT_SYMBOL_GPL(ring_buffer_resize); 1443EXPORT_SYMBOL_GPL(ring_buffer_resize);
1653 1444
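Note that the two versions of ring_buffer_resize() in this hunk differ in signature: the left-hand one takes a cpu_id (or RING_BUFFER_ALL_CPUS) and reports failure with a negative errno, while the right-hand one always resizes every CPU and returns -1 or -ENOMEM on failure. A hedged sketch against the left-hand form, assuming rb was allocated earlier as above:

static int my_grow_buffer(struct ring_buffer *rb)
{
	/* grow every per-cpu buffer to roughly 4MB; left-hand signature */
	int err = ring_buffer_resize(rb, 4 << 20, RING_BUFFER_ALL_CPUS);

	if (err < 0)
		pr_warn("ring buffer resize failed: %d\n", err);
	return err;
}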
@@ -1686,11 +1477,21 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
1686 return __rb_page_index(iter->head_page, iter->head); 1477 return __rb_page_index(iter->head_page, iter->head);
1687} 1478}
1688 1479
1480static inline unsigned long rb_page_write(struct buffer_page *bpage)
1481{
1482 return local_read(&bpage->write) & RB_WRITE_MASK;
1483}
1484
1689static inline unsigned rb_page_commit(struct buffer_page *bpage) 1485static inline unsigned rb_page_commit(struct buffer_page *bpage)
1690{ 1486{
1691 return local_read(&bpage->page->commit); 1487 return local_read(&bpage->page->commit);
1692} 1488}
1693 1489
1490static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1491{
1492 return local_read(&bpage->entries) & RB_WRITE_MASK;
1493}
1494
1694/* Size is determined by what has been committed */ 1495/* Size is determined by what has been committed */
1695static inline unsigned rb_page_size(struct buffer_page *bpage) 1496static inline unsigned rb_page_size(struct buffer_page *bpage)
1696{ 1497{
@@ -1739,7 +1540,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1739 * assign the commit to the tail. 1540 * assign the commit to the tail.
1740 */ 1541 */
1741 again: 1542 again:
1742 max_count = cpu_buffer->nr_pages * 100; 1543 max_count = cpu_buffer->buffer->pages * 100;
1743 1544
1744 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1545 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1745 if (RB_WARN_ON(cpu_buffer, !(--max_count))) 1546 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
@@ -1823,7 +1624,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1823} 1624}
1824 1625
1825/** 1626/**
1826 * rb_update_event - update event type and data 1627 * ring_buffer_update_event - update event type and data
 1827 * @event: the event to update 1628 * @event: the event to update
1828 * @type: the type of event 1629 * @type: the type of event
1829 * @length: the size of the event field in the ring buffer 1630 * @length: the size of the event field in the ring buffer
@@ -1907,7 +1708,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1907 * the counters. 1708 * the counters.
1908 */ 1709 */
1909 local_add(entries, &cpu_buffer->overrun); 1710 local_add(entries, &cpu_buffer->overrun);
1910 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1911 1711
1912 /* 1712 /*
1913 * The entries will be zeroed out when we move the 1713 * The entries will be zeroed out when we move the
@@ -2063,9 +1863,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2063 event = __rb_page_index(tail_page, tail); 1863 event = __rb_page_index(tail_page, tail);
2064 kmemcheck_annotate_bitfield(event, bitfield); 1864 kmemcheck_annotate_bitfield(event, bitfield);
2065 1865
2066 /* account for padding bytes */
2067 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
2068
2069 /* 1866 /*
2070 * Save the original length to the meta data. 1867 * Save the original length to the meta data.
2071 * This will be used by the reader to add lost event 1868 * This will be used by the reader to add lost event
@@ -2158,10 +1955,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2158 * If we are not in overwrite mode, 1955 * If we are not in overwrite mode,
2159 * this is easy, just stop here. 1956 * this is easy, just stop here.
2160 */ 1957 */
2161 if (!(buffer->flags & RB_FL_OVERWRITE)) { 1958 if (!(buffer->flags & RB_FL_OVERWRITE))
2162 local_inc(&cpu_buffer->dropped_events);
2163 goto out_reset; 1959 goto out_reset;
2164 }
2165 1960
2166 ret = rb_handle_head_page(cpu_buffer, 1961 ret = rb_handle_head_page(cpu_buffer,
2167 tail_page, 1962 tail_page,
@@ -2259,9 +2054,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2259 if (!tail) 2054 if (!tail)
2260 tail_page->page->time_stamp = ts; 2055 tail_page->page->time_stamp = ts;
2261 2056
2262 /* account for these added bytes */
2263 local_add(length, &cpu_buffer->entries_bytes);
2264
2265 return event; 2057 return event;
2266} 2058}
2267 2059
@@ -2284,7 +2076,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2284 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2285 unsigned long write_mask = 2077 unsigned long write_mask =
2286 local_read(&bpage->write) & ~RB_WRITE_MASK; 2078 local_read(&bpage->write) & ~RB_WRITE_MASK;
2287 unsigned long event_length = rb_event_length(event);
2288 /* 2079 /*
2289 * This is on the tail page. It is possible that 2080 * This is on the tail page. It is possible that
2290 * a write could come in and move the tail page 2081 * a write could come in and move the tail page
@@ -2294,11 +2085,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2294 old_index += write_mask; 2085 old_index += write_mask;
2295 new_index += write_mask; 2086 new_index += write_mask;
2296 index = local_cmpxchg(&bpage->write, old_index, new_index); 2087 index = local_cmpxchg(&bpage->write, old_index, new_index);
2297 if (index == old_index) { 2088 if (index == old_index)
2298 /* update counters */
2299 local_sub(event_length, &cpu_buffer->entries_bytes);
2300 return 1; 2089 return 1;
2301 }
2302 } 2090 }
2303 2091
2304 /* could not discard */ 2092 /* could not discard */
@@ -2725,8 +2513,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2725 * and not the length of the event which would hold the header. 2513 * and not the length of the event which would hold the header.
2726 */ 2514 */
2727int ring_buffer_write(struct ring_buffer *buffer, 2515int ring_buffer_write(struct ring_buffer *buffer,
2728 unsigned long length, 2516 unsigned long length,
2729 void *data) 2517 void *data)
2730{ 2518{
2731 struct ring_buffer_per_cpu *cpu_buffer; 2519 struct ring_buffer_per_cpu *cpu_buffer;
2732 struct ring_buffer_event *event; 2520 struct ring_buffer_event *event;
@@ -2818,63 +2606,6 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
2818EXPORT_SYMBOL_GPL(ring_buffer_record_enable); 2606EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
2819 2607
2820/** 2608/**
2821 * ring_buffer_record_off - stop all writes into the buffer
2822 * @buffer: The ring buffer to stop writes to.
2823 *
2824 * This prevents all writes to the buffer. Any attempt to write
2825 * to the buffer after this will fail and return NULL.
2826 *
2827 * This is different than ring_buffer_record_disable() as
 2828 * it works like an on/off switch, whereas the disable() version
 2829 * must be paired with an enable().
2830 */
2831void ring_buffer_record_off(struct ring_buffer *buffer)
2832{
2833 unsigned int rd;
2834 unsigned int new_rd;
2835
2836 do {
2837 rd = atomic_read(&buffer->record_disabled);
2838 new_rd = rd | RB_BUFFER_OFF;
2839 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2840}
2841EXPORT_SYMBOL_GPL(ring_buffer_record_off);
2842
2843/**
2844 * ring_buffer_record_on - restart writes into the buffer
2845 * @buffer: The ring buffer to start writes to.
2846 *
2847 * This enables all writes to the buffer that was disabled by
2848 * ring_buffer_record_off().
2849 *
2850 * This is different than ring_buffer_record_enable() as
 2851 * it works like an on/off switch, whereas the enable() version
2852 * must be paired with a disable().
2853 */
2854void ring_buffer_record_on(struct ring_buffer *buffer)
2855{
2856 unsigned int rd;
2857 unsigned int new_rd;
2858
2859 do {
2860 rd = atomic_read(&buffer->record_disabled);
2861 new_rd = rd & ~RB_BUFFER_OFF;
2862 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2863}
2864EXPORT_SYMBOL_GPL(ring_buffer_record_on);
2865
2866/**
2867 * ring_buffer_record_is_on - return true if the ring buffer can write
2868 * @buffer: The ring buffer to see if write is enabled
2869 *
2870 * Returns true if the ring buffer is in a state that it accepts writes.
2871 */
2872int ring_buffer_record_is_on(struct ring_buffer *buffer)
2873{
2874 return !atomic_read(&buffer->record_disabled);
2875}
2876
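The three kernel-doc blocks above describe a switch built from an atomic cmpxchg loop on the RB_BUFFER_OFF bit of record_disabled, as opposed to the reference-counted disable()/enable() pair. A hedged usage sketch; the caller and ordering are illustrative, only the five ring_buffer_record_* functions are real:

/*
 * Sketch: _off/_on is a plain switch (idempotent, unpaired), while
 * _disable/_enable nest and must balance exactly.
 */
static void example_toggle_writes(struct ring_buffer *buffer)
{
	/* counted: every disable needs a matching enable */
	ring_buffer_record_disable(buffer);
	/* ... inspect or verify the buffer here ... */
	ring_buffer_record_enable(buffer);

	/* switch: calling _off twice is fine, a single _on flips it back */
	ring_buffer_record_off(buffer);
	if (!ring_buffer_record_is_on(buffer))
		ring_buffer_record_on(buffer);
}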
2877/**
2878 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 2609 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
2879 * @buffer: The ring buffer to stop writes to. 2610 * @buffer: The ring buffer to stop writes to.
2880 * @cpu: The CPU buffer to stop 2611 * @cpu: The CPU buffer to stop
@@ -2930,59 +2661,6 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2930} 2661}
2931 2662
2932/** 2663/**
2933 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2934 * @buffer: The ring buffer
2935 * @cpu: The per CPU buffer to read from.
2936 */
2937u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2938{
2939 unsigned long flags;
2940 struct ring_buffer_per_cpu *cpu_buffer;
2941 struct buffer_page *bpage;
2942 u64 ret = 0;
2943
2944 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2945 return 0;
2946
2947 cpu_buffer = buffer->buffers[cpu];
2948 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2949 /*
2950 * if the tail is on reader_page, oldest time stamp is on the reader
2951 * page
2952 */
2953 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2954 bpage = cpu_buffer->reader_page;
2955 else
2956 bpage = rb_set_head_page(cpu_buffer);
2957 if (bpage)
2958 ret = bpage->page->time_stamp;
2959 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2960
2961 return ret;
2962}
2963EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2964
2965/**
2966 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2967 * @buffer: The ring buffer
2968 * @cpu: The per CPU buffer to read from.
2969 */
2970unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2971{
2972 struct ring_buffer_per_cpu *cpu_buffer;
2973 unsigned long ret;
2974
2975 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2976 return 0;
2977
2978 cpu_buffer = buffer->buffers[cpu];
2979 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2980
2981 return ret;
2982}
2983EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2984
2985/**
2986 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2987 * @buffer: The ring buffer 2665 * @buffer: The ring buffer
2988 * @cpu: The per CPU buffer to get the entries from. 2666 * @cpu: The per CPU buffer to get the entries from.
@@ -3001,8 +2679,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
3001EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2679EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
3002 2680
3003/** 2681/**
3004 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring 2682 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
3005 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
3006 * @buffer: The ring buffer 2683 * @buffer: The ring buffer
3007 * @cpu: The per CPU buffer to get the number of overruns from 2684 * @cpu: The per CPU buffer to get the number of overruns from
3008 */ 2685 */
@@ -3022,9 +2699,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3022EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2699EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3023 2700
3024/** 2701/**
3025 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by 2702 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
3026 * commits failing due to the buffer wrapping around while there are uncommitted
3027 * events, such as during an interrupt storm.
3028 * @buffer: The ring buffer 2703 * @buffer: The ring buffer
3029 * @cpu: The per CPU buffer to get the number of overruns from 2704 * @cpu: The per CPU buffer to get the number of overruns from
3030 */ 2705 */
@@ -3045,28 +2720,6 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3045EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 2720EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3046 2721
3047/** 2722/**
3048 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3049 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3050 * @buffer: The ring buffer
3051 * @cpu: The per CPU buffer to get the number of overruns from
3052 */
3053unsigned long
3054ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3055{
3056 struct ring_buffer_per_cpu *cpu_buffer;
3057 unsigned long ret;
3058
3059 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3060 return 0;
3061
3062 cpu_buffer = buffer->buffers[cpu];
3063 ret = local_read(&cpu_buffer->dropped_events);
3064
3065 return ret;
3066}
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068
3069/**
3070 * ring_buffer_entries - get the number of entries in a buffer 2723 * ring_buffer_entries - get the number of entries in a buffer
3071 * @buffer: The ring buffer 2724 * @buffer: The ring buffer
3072 * 2725 *
@@ -3151,9 +2804,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
3151 2804
3152 cpu_buffer = iter->cpu_buffer; 2805 cpu_buffer = iter->cpu_buffer;
3153 2806
3154 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3155 rb_iter_reset(iter); 2808 rb_iter_reset(iter);
3156 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3157} 2810}
3158EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
3159 2812
@@ -3274,10 +2927,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3274 if (cpu_buffer->commit_page == cpu_buffer->reader_page) 2927 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
3275 goto out; 2928 goto out;
3276 2929
3277 /* Don't bother swapping if the ring buffer is empty */
3278 if (rb_num_of_entries(cpu_buffer) == 0)
3279 goto out;
3280
3281 /* 2930 /*
3282 * Reset the reader page to size zero. 2931 * Reset the reader page to size zero.
3283 */ 2932 */
@@ -3291,8 +2940,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3291 * Splice the empty reader page into the list around the head. 2940 * Splice the empty reader page into the list around the head.
3292 */ 2941 */
3293 reader = rb_set_head_page(cpu_buffer); 2942 reader = rb_set_head_page(cpu_buffer);
3294 if (!reader)
3295 goto out;
3296 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 2943 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3297 cpu_buffer->reader_page->list.prev = reader->list.prev; 2944 cpu_buffer->reader_page->list.prev = reader->list.prev;
3298 2945
@@ -3618,12 +3265,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3618 again: 3265 again:
3619 local_irq_save(flags); 3266 local_irq_save(flags);
3620 if (dolock) 3267 if (dolock)
3621 raw_spin_lock(&cpu_buffer->reader_lock); 3268 spin_lock(&cpu_buffer->reader_lock);
3622 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3623 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3270 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3624 rb_advance_reader(cpu_buffer); 3271 rb_advance_reader(cpu_buffer);
3625 if (dolock) 3272 if (dolock)
3626 raw_spin_unlock(&cpu_buffer->reader_lock); 3273 spin_unlock(&cpu_buffer->reader_lock);
3627 local_irq_restore(flags); 3274 local_irq_restore(flags);
3628 3275
3629 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3276 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3648,9 +3295,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3648 unsigned long flags; 3295 unsigned long flags;
3649 3296
3650 again: 3297 again:
3651 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3652 event = rb_iter_peek(iter, ts); 3299 event = rb_iter_peek(iter, ts);
3653 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3654 3301
3655 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3302 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3656 goto again; 3303 goto again;
@@ -3690,7 +3337,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3690 cpu_buffer = buffer->buffers[cpu]; 3337 cpu_buffer = buffer->buffers[cpu];
3691 local_irq_save(flags); 3338 local_irq_save(flags);
3692 if (dolock) 3339 if (dolock)
3693 raw_spin_lock(&cpu_buffer->reader_lock); 3340 spin_lock(&cpu_buffer->reader_lock);
3694 3341
3695 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3696 if (event) { 3343 if (event) {
@@ -3699,7 +3346,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3699 } 3346 }
3700 3347
3701 if (dolock) 3348 if (dolock)
3702 raw_spin_unlock(&cpu_buffer->reader_lock); 3349 spin_unlock(&cpu_buffer->reader_lock);
3703 local_irq_restore(flags); 3350 local_irq_restore(flags);
3704 3351
3705 out: 3352 out:
@@ -3749,7 +3396,6 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3749 3396
3750 iter->cpu_buffer = cpu_buffer; 3397 iter->cpu_buffer = cpu_buffer;
3751 3398
3752 atomic_inc(&buffer->resize_disabled);
3753 atomic_inc(&cpu_buffer->record_disabled); 3399 atomic_inc(&cpu_buffer->record_disabled);
3754 3400
3755 return iter; 3401 return iter;
@@ -3792,11 +3438,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3792 3438
3793 cpu_buffer = iter->cpu_buffer; 3439 cpu_buffer = iter->cpu_buffer;
3794 3440
3795 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3796 arch_spin_lock(&cpu_buffer->lock); 3442 arch_spin_lock(&cpu_buffer->lock);
3797 rb_iter_reset(iter); 3443 rb_iter_reset(iter);
3798 arch_spin_unlock(&cpu_buffer->lock); 3444 arch_spin_unlock(&cpu_buffer->lock);
3799 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3800} 3446}
3801EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3447EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3802 3448
@@ -3811,20 +3457,8 @@ void
3811ring_buffer_read_finish(struct ring_buffer_iter *iter) 3457ring_buffer_read_finish(struct ring_buffer_iter *iter)
3812{ 3458{
3813 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3459 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3814 unsigned long flags;
3815
3816 /*
3817 * Ring buffer is disabled from recording, here's a good place
3818 * to check the integrity of the ring buffer.
3819 * Must prevent readers from trying to read, as the check
3820 * clears the HEAD page and readers require it.
3821 */
3822 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3823 rb_check_pages(cpu_buffer);
3824 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3825 3460
3826 atomic_dec(&cpu_buffer->record_disabled); 3461 atomic_dec(&cpu_buffer->record_disabled);
3827 atomic_dec(&cpu_buffer->buffer->resize_disabled);
3828 kfree(iter); 3462 kfree(iter);
3829} 3463}
3830EXPORT_SYMBOL_GPL(ring_buffer_read_finish); 3464EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
@@ -3843,7 +3477,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3843 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3844 unsigned long flags; 3478 unsigned long flags;
3845 3479
3846 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3847 again: 3481 again:
3848 event = rb_iter_peek(iter, ts); 3482 event = rb_iter_peek(iter, ts);
3849 if (!event) 3483 if (!event)
@@ -3854,7 +3488,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3854 3488
3855 rb_advance_iter(iter); 3489 rb_advance_iter(iter);
3856 out: 3490 out:
3857 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3858 3492
3859 return event; 3493 return event;
3860} 3494}
@@ -3864,18 +3498,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_read);
3864 * ring_buffer_size - return the size of the ring buffer (in bytes) 3498 * ring_buffer_size - return the size of the ring buffer (in bytes)
3865 * @buffer: The ring buffer. 3499 * @buffer: The ring buffer.
3866 */ 3500 */
3867unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) 3501unsigned long ring_buffer_size(struct ring_buffer *buffer)
3868{ 3502{
3869 /* 3503 return BUF_PAGE_SIZE * buffer->pages;
3870 * Earlier, this method returned
3871 * BUF_PAGE_SIZE * buffer->nr_pages
3872 * Since the nr_pages field is now removed, we have converted this to
3873 * return the per cpu buffer value.
3874 */
3875 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3876 return 0;
3877
3878 return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
3879} 3504}
3880EXPORT_SYMBOL_GPL(ring_buffer_size); 3505EXPORT_SYMBOL_GPL(ring_buffer_size);
3881 3506
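For context, the left-hand ring_buffer_size() is per CPU because per-CPU buffers may hold different page counts once RING_BUFFER_ALL_CPUS resizing exists, so a whole-buffer figure has to be summed; the right-hand version returns to a single global page count. A small sketch of that summation with an invented helper name, assuming the in-file for_each_buffer_cpu() macro:

/* Illustrative: total bytes across all CPU buffers with the per-CPU API. */
static unsigned long example_total_buffer_size(struct ring_buffer *buffer)
{
	unsigned long total = 0;
	int cpu;

	for_each_buffer_cpu(buffer, cpu)
		total += ring_buffer_size(buffer, cpu);	/* per-CPU variant */

	return total;
}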
@@ -3896,21 +3521,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3896 cpu_buffer->commit_page = cpu_buffer->head_page; 3521 cpu_buffer->commit_page = cpu_buffer->head_page;
3897 3522
3898 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 3523 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
3899 INIT_LIST_HEAD(&cpu_buffer->new_pages);
3900 local_set(&cpu_buffer->reader_page->write, 0); 3524 local_set(&cpu_buffer->reader_page->write, 0);
3901 local_set(&cpu_buffer->reader_page->entries, 0); 3525 local_set(&cpu_buffer->reader_page->entries, 0);
3902 local_set(&cpu_buffer->reader_page->page->commit, 0); 3526 local_set(&cpu_buffer->reader_page->page->commit, 0);
3903 cpu_buffer->reader_page->read = 0; 3527 cpu_buffer->reader_page->read = 0;
3904 3528
3905 local_set(&cpu_buffer->entries_bytes, 0);
3906 local_set(&cpu_buffer->overrun, 0);
3907 local_set(&cpu_buffer->commit_overrun, 0); 3529 local_set(&cpu_buffer->commit_overrun, 0);
3908 local_set(&cpu_buffer->dropped_events, 0); 3530 local_set(&cpu_buffer->overrun, 0);
3909 local_set(&cpu_buffer->entries, 0); 3531 local_set(&cpu_buffer->entries, 0);
3910 local_set(&cpu_buffer->committing, 0); 3532 local_set(&cpu_buffer->committing, 0);
3911 local_set(&cpu_buffer->commits, 0); 3533 local_set(&cpu_buffer->commits, 0);
3912 cpu_buffer->read = 0; 3534 cpu_buffer->read = 0;
3913 cpu_buffer->read_bytes = 0;
3914 3535
3915 cpu_buffer->write_stamp = 0; 3536 cpu_buffer->write_stamp = 0;
3916 cpu_buffer->read_stamp = 0; 3537 cpu_buffer->read_stamp = 0;
@@ -3934,13 +3555,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3934 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3555 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3935 return; 3556 return;
3936 3557
3937 atomic_inc(&buffer->resize_disabled);
3938 atomic_inc(&cpu_buffer->record_disabled); 3558 atomic_inc(&cpu_buffer->record_disabled);
3939 3559
3940 /* Make sure all commits have finished */ 3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3941 synchronize_sched();
3942
3943 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3944 3561
3945 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3946 goto out; 3563 goto out;
@@ -3952,10 +3569,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3952 arch_spin_unlock(&cpu_buffer->lock); 3569 arch_spin_unlock(&cpu_buffer->lock);
3953 3570
3954 out: 3571 out:
3955 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3956 3573
3957 atomic_dec(&cpu_buffer->record_disabled); 3574 atomic_dec(&cpu_buffer->record_disabled);
3958 atomic_dec(&buffer->resize_disabled);
3959} 3575}
3960EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 3576EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
3961 3577
@@ -3991,10 +3607,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3991 cpu_buffer = buffer->buffers[cpu]; 3607 cpu_buffer = buffer->buffers[cpu];
3992 local_irq_save(flags); 3608 local_irq_save(flags);
3993 if (dolock) 3609 if (dolock)
3994 raw_spin_lock(&cpu_buffer->reader_lock); 3610 spin_lock(&cpu_buffer->reader_lock);
3995 ret = rb_per_cpu_empty(cpu_buffer); 3611 ret = rb_per_cpu_empty(cpu_buffer);
3996 if (dolock) 3612 if (dolock)
3997 raw_spin_unlock(&cpu_buffer->reader_lock); 3613 spin_unlock(&cpu_buffer->reader_lock);
3998 local_irq_restore(flags); 3614 local_irq_restore(flags);
3999 3615
4000 if (!ret) 3616 if (!ret)
@@ -4025,10 +3641,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
4025 cpu_buffer = buffer->buffers[cpu]; 3641 cpu_buffer = buffer->buffers[cpu];
4026 local_irq_save(flags); 3642 local_irq_save(flags);
4027 if (dolock) 3643 if (dolock)
4028 raw_spin_lock(&cpu_buffer->reader_lock); 3644 spin_lock(&cpu_buffer->reader_lock);
4029 ret = rb_per_cpu_empty(cpu_buffer); 3645 ret = rb_per_cpu_empty(cpu_buffer);
4030 if (dolock) 3646 if (dolock)
4031 raw_spin_unlock(&cpu_buffer->reader_lock); 3647 spin_unlock(&cpu_buffer->reader_lock);
4032 local_irq_restore(flags); 3648 local_irq_restore(flags);
4033 3649
4034 return ret; 3650 return ret;
@@ -4057,11 +3673,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
4057 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 3673 !cpumask_test_cpu(cpu, buffer_b->cpumask))
4058 goto out; 3674 goto out;
4059 3675
4060 cpu_buffer_a = buffer_a->buffers[cpu];
4061 cpu_buffer_b = buffer_b->buffers[cpu];
4062
4063 /* At least make sure the two buffers are somewhat the same */ 3676 /* At least make sure the two buffers are somewhat the same */
4064 if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) 3677 if (buffer_a->pages != buffer_b->pages)
4065 goto out; 3678 goto out;
4066 3679
4067 ret = -EAGAIN; 3680 ret = -EAGAIN;
@@ -4075,6 +3688,9 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
4075 if (atomic_read(&buffer_b->record_disabled)) 3688 if (atomic_read(&buffer_b->record_disabled))
4076 goto out; 3689 goto out;
4077 3690
3691 cpu_buffer_a = buffer_a->buffers[cpu];
3692 cpu_buffer_b = buffer_b->buffers[cpu];
3693
4078 if (atomic_read(&cpu_buffer_a->record_disabled)) 3694 if (atomic_read(&cpu_buffer_a->record_disabled))
4079 goto out; 3695 goto out;
4080 3696
@@ -4225,7 +3841,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4225 if (!bpage) 3841 if (!bpage)
4226 goto out; 3842 goto out;
4227 3843
4228 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
4229 3845
4230 reader = rb_get_reader_page(cpu_buffer); 3846 reader = rb_get_reader_page(cpu_buffer);
4231 if (!reader) 3847 if (!reader)
@@ -4302,7 +3918,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4302 } else { 3918 } else {
4303 /* update the entry counter */ 3919 /* update the entry counter */
4304 cpu_buffer->read += rb_page_entries(reader); 3920 cpu_buffer->read += rb_page_entries(reader);
4305 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
4306 3921
4307 /* swap the pages */ 3922 /* swap the pages */
4308 rb_init_page(bpage); 3923 rb_init_page(bpage);
@@ -4349,13 +3964,75 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4349 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
4350 3965
4351 out_unlock: 3966 out_unlock:
4352 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
4353 3968
4354 out: 3969 out:
4355 return ret; 3970 return ret;
4356} 3971}
4357EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3972EXPORT_SYMBOL_GPL(ring_buffer_read_page);
4358 3973
3974#ifdef CONFIG_TRACING
3975static ssize_t
3976rb_simple_read(struct file *filp, char __user *ubuf,
3977 size_t cnt, loff_t *ppos)
3978{
3979 unsigned long *p = filp->private_data;
3980 char buf[64];
3981 int r;
3982
3983 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
3984 r = sprintf(buf, "permanently disabled\n");
3985 else
3986 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
3987
3988 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3989}
3990
3991static ssize_t
3992rb_simple_write(struct file *filp, const char __user *ubuf,
3993 size_t cnt, loff_t *ppos)
3994{
3995 unsigned long *p = filp->private_data;
3996 unsigned long val;
3997 int ret;
3998
3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4000 if (ret)
4001 return ret;
4002
4003 if (val)
4004 set_bit(RB_BUFFERS_ON_BIT, p);
4005 else
4006 clear_bit(RB_BUFFERS_ON_BIT, p);
4007
4008 (*ppos)++;
4009
4010 return cnt;
4011}
4012
4013static const struct file_operations rb_simple_fops = {
4014 .open = tracing_open_generic,
4015 .read = rb_simple_read,
4016 .write = rb_simple_write,
4017 .llseek = default_llseek,
4018};
4019
4020
4021static __init int rb_init_debugfs(void)
4022{
4023 struct dentry *d_tracer;
4024
4025 d_tracer = tracing_init_dentry();
4026
4027 trace_create_file("tracing_on", 0644, d_tracer,
4028 &ring_buffer_flags, &rb_simple_fops);
4029
4030 return 0;
4031}
4032
4033fs_initcall(rb_init_debugfs);
4034#endif
4035
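The rb_simple_* handlers above back a debugfs file called tracing_on: reads report the current bit (or "permanently disabled"), writes parse a decimal value with kstrtoul_from_user() and set or clear RB_BUFFERS_ON_BIT. A userspace sketch, assuming debugfs is mounted at the usual /sys/kernel/debug and the file sits in the tracing directory created by tracing_init_dentry():

/* Illustrative userspace helper: report the current state, then flip the
 * ring-buffer switch off.  Error handling is kept minimal on purpose. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/sys/kernel/debug/tracing/tracing_on", O_RDWR);

	if (fd < 0) {
		perror("open tracing_on");
		return 1;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("tracing_on reads back: %s", buf);	/* e.g. "1\n" */
	}

	if (write(fd, "0", 1) != 1)	/* clears RB_BUFFERS_ON_BIT */
		perror("write tracing_on");

	close(fd);
	return 0;
}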
4359#ifdef CONFIG_HOTPLUG_CPU 4036#ifdef CONFIG_HOTPLUG_CPU
4360static int rb_cpu_notify(struct notifier_block *self, 4037static int rb_cpu_notify(struct notifier_block *self,
4361 unsigned long action, void *hcpu) 4038 unsigned long action, void *hcpu)
@@ -4363,8 +4040,6 @@ static int rb_cpu_notify(struct notifier_block *self,
4363 struct ring_buffer *buffer = 4040 struct ring_buffer *buffer =
4364 container_of(self, struct ring_buffer, cpu_notify); 4041 container_of(self, struct ring_buffer, cpu_notify);
4365 long cpu = (long)hcpu; 4042 long cpu = (long)hcpu;
4366 int cpu_i, nr_pages_same;
4367 unsigned int nr_pages;
4368 4043
4369 switch (action) { 4044 switch (action) {
4370 case CPU_UP_PREPARE: 4045 case CPU_UP_PREPARE:
@@ -4372,23 +4047,8 @@ static int rb_cpu_notify(struct notifier_block *self,
4372 if (cpumask_test_cpu(cpu, buffer->cpumask)) 4047 if (cpumask_test_cpu(cpu, buffer->cpumask))
4373 return NOTIFY_OK; 4048 return NOTIFY_OK;
4374 4049
4375 nr_pages = 0;
4376 nr_pages_same = 1;
4377 /* check if all cpu sizes are same */
4378 for_each_buffer_cpu(buffer, cpu_i) {
4379 /* fill in the size from first enabled cpu */
4380 if (nr_pages == 0)
4381 nr_pages = buffer->buffers[cpu_i]->nr_pages;
4382 if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
4383 nr_pages_same = 0;
4384 break;
4385 }
4386 }
4387 /* allocate minimum pages, user can later expand it */
4388 if (!nr_pages_same)
4389 nr_pages = 2;
4390 buffer->buffers[cpu] = 4050 buffer->buffers[cpu] =
4391 rb_allocate_cpu_buffer(buffer, nr_pages, cpu); 4051 rb_allocate_cpu_buffer(buffer, cpu);
4392 if (!buffer->buffers[cpu]) { 4052 if (!buffer->buffers[cpu]) {
4393 WARN(1, "failed to allocate ring buffer on CPU %ld\n", 4053 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
4394 cpu); 4054 cpu);
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
deleted file mode 100644
index 4b3b5eaf94d..00000000000
--- a/kernel/trace/rpm-traces.c
+++ /dev/null
@@ -1,20 +0,0 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d2..17a2d44e1af 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 Nadia Yvette Chambers 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -37,7 +36,6 @@
37#include <linux/ctype.h> 36#include <linux/ctype.h>
38#include <linux/init.h> 37#include <linux/init.h>
39#include <linux/poll.h> 38#include <linux/poll.h>
40#include <linux/nmi.h>
41#include <linux/fs.h> 39#include <linux/fs.h>
42 40
43#include "trace.h" 41#include "trace.h"
@@ -79,21 +77,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
79} 77}
80 78
81/* 79/*
82 * To prevent the comm cache from being overwritten when no
83 * tracing is active, only save the comm when a trace event
84 * occurred.
85 */
86static DEFINE_PER_CPU(bool, trace_cmdline_save);
87
88/*
89 * When a reader is waiting for data, then this variable is
90 * set to true.
91 */
92static bool trace_wakeup_needed;
93
94static struct irq_work trace_work_wakeup;
95
96/*
97 * Kill all tracing for good (never come back). 80 * Kill all tracing for good (never come back).
98 * It is initialized to 1 but will turn to zero if the initialization 81 * It is initialized to 1 but will turn to zero if the initialization
99 * of the tracer is successful. But that is the only place that sets 82 * of the tracer is successful. But that is the only place that sets
@@ -103,6 +86,18 @@ static int tracing_disabled = 1;
103 86
104DEFINE_PER_CPU(int, ftrace_cpu_disabled); 87DEFINE_PER_CPU(int, ftrace_cpu_disabled);
105 88
89static inline void ftrace_disable_cpu(void)
90{
91 preempt_disable();
92 __this_cpu_inc(ftrace_cpu_disabled);
93}
94
95static inline void ftrace_enable_cpu(void)
96{
97 __this_cpu_dec(ftrace_cpu_disabled);
98 preempt_enable();
99}
100
106cpumask_var_t __read_mostly tracing_buffer_mask; 101cpumask_var_t __read_mostly tracing_buffer_mask;
107 102
108/* 103/*
@@ -155,18 +150,6 @@ static int __init set_ftrace_dump_on_oops(char *str)
155} 150}
156__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 151__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
157 152
158
159static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
160static char *trace_boot_options __initdata;
161
162static int __init set_trace_boot_options(char *str)
163{
164 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
165 trace_boot_options = trace_boot_options_buf;
166 return 0;
167}
168__setup("trace_options=", set_trace_boot_options);
169
170unsigned long long ns2usecs(cycle_t nsec) 153unsigned long long ns2usecs(cycle_t nsec)
171{ 154{
172 nsec += 500; 155 nsec += 500;
@@ -226,9 +209,20 @@ static struct trace_array max_tr;
226 209
227static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 210static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
228 211
212/* tracer_enabled is used to toggle activation of a tracer */
213static int tracer_enabled = 1;
214
215/**
216 * tracing_is_enabled - return tracer_enabled status
217 *
218 * This function is used by other tracers to know the status
219 * of the tracer_enabled flag. Tracers may use this function
220 * to know if it should enable their features when starting
221 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
222 */
229int tracing_is_enabled(void) 223int tracing_is_enabled(void)
230{ 224{
231 return tracing_is_on(); 225 return tracer_enabled;
232} 226}
233 227
234/* 228/*
@@ -344,77 +338,33 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
345unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
346 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
347 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
348 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
349 342
350static int trace_stop_count; 343static int trace_stop_count;
351static DEFINE_RAW_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
352 345
353/** 346static void wakeup_work_handler(struct work_struct *work)
354 * trace_wake_up - wake up tasks waiting for trace input
355 *
356 * Schedules a delayed work to wake up any task that is blocked on the
 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
359 */
360static void trace_wake_up(struct irq_work *work)
361{ 347{
362 wake_up_all(&trace_wait); 348 wake_up(&trace_wait);
363
364} 349}
365 350
366/** 351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
367 * tracing_on - enable tracing buffers
368 *
369 * This function enables tracing buffers that may have been
370 * disabled with tracing_off.
371 */
372void tracing_on(void)
373{
374 if (global_trace.buffer)
375 ring_buffer_record_on(global_trace.buffer);
376 /*
377 * This flag is only looked at when buffers haven't been
378 * allocated yet. We don't really care about the race
379 * between setting this flag and actually turning
380 * on the buffer.
381 */
382 global_trace.buffer_disabled = 0;
383}
384EXPORT_SYMBOL_GPL(tracing_on);
385 352
386/** 353/**
387 * tracing_off - turn off tracing buffers 354 * trace_wake_up - wake up tasks waiting for trace input
388 * 355 *
389 * This function stops the tracing buffers from recording data. 356 * Schedules a delayed work to wake up any task that is blocked on the
 390 * It does not disable any overhead the tracers themselves may 357 * trace_wait queue. This is used with trace_poll for tasks polling the
391 * be causing. This function simply causes all recording to 358 * trace.
392 * the ring buffers to fail.
393 */ 359 */
394void tracing_off(void) 360void trace_wake_up(void)
395{ 361{
396 if (global_trace.buffer) 362 const unsigned long delay = msecs_to_jiffies(2);
397 ring_buffer_record_off(global_trace.buffer);
398 /*
399 * This flag is only looked at when buffers haven't been
400 * allocated yet. We don't really care about the race
401 * between setting this flag and actually turning
402 * on the buffer.
403 */
404 global_trace.buffer_disabled = 1;
405}
406EXPORT_SYMBOL_GPL(tracing_off);
407 363
408/** 364 if (trace_flags & TRACE_ITER_BLOCK)
409 * tracing_is_on - show state of ring buffers enabled 365 return;
410 */ 366 schedule_delayed_work(&wakeup_work, delay);
411int tracing_is_on(void)
412{
413 if (global_trace.buffer)
414 return ring_buffer_record_is_on(global_trace.buffer);
415 return !global_trace.buffer_disabled;
416} 367}
417EXPORT_SYMBOL_GPL(tracing_is_on);
418 368
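In the right-hand column the irq_work based wakeup is replaced by a delayed workqueue item: trace_wake_up() only schedules wakeup_work about two milliseconds out, so the wake_up() itself never runs on the tracing hot path. A minimal sketch of that deferred-wakeup pattern with invented names; only the waitqueue and workqueue APIs are real:

#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

static DECLARE_WAIT_QUEUE_HEAD(example_readers);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_readers);		/* runs later, in process context */
}

static DECLARE_DELAYED_WORK(example_wakeup, example_wakeup_fn);

static void example_poke_readers(void)
{
	/* cheap to call from the write path; the wakeup happens ~2ms later */
	schedule_delayed_work(&example_wakeup, msecs_to_jiffies(2));
}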
419static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
420{ 370{
@@ -433,15 +383,15 @@ __setup("trace_buf_size=", set_buf_size);
433 383
434static int __init set_tracing_thresh(char *str) 384static int __init set_tracing_thresh(char *str)
435{ 385{
436 unsigned long threshold; 386 unsigned long threshhold;
437 int ret; 387 int ret;
438 388
439 if (!str) 389 if (!str)
440 return 0; 390 return 0;
441 ret = kstrtoul(str, 0, &threshold); 391 ret = strict_strtoul(str, 0, &threshhold);
442 if (ret < 0) 392 if (ret < 0)
443 return 0; 393 return 0;
444 tracing_thresh = threshold * 1000; 394 tracing_thresh = threshhold * 1000;
445 return 1; 395 return 1;
446} 396}
447__setup("tracing_thresh=", set_tracing_thresh); 397__setup("tracing_thresh=", set_tracing_thresh);
@@ -476,20 +426,15 @@ static const char *trace_options[] = {
476 "record-cmd", 426 "record-cmd",
477 "overwrite", 427 "overwrite",
478 "disable_on_free", 428 "disable_on_free",
479 "irq-info",
480 "markers",
481 NULL 429 NULL
482}; 430};
483 431
484static struct { 432static struct {
485 u64 (*func)(void); 433 u64 (*func)(void);
486 const char *name; 434 const char *name;
487 int in_ns; /* is this clock in nanoseconds? */
488} trace_clocks[] = { 435} trace_clocks[] = {
489 { trace_clock_local, "local", 1 }, 436 { trace_clock_local, "local" },
490 { trace_clock_global, "global", 1 }, 437 { trace_clock_global, "global" },
491 { trace_clock_counter, "counter", 0 },
492 ARCH_TRACE_CLOCKS
493}; 438};
494 439
495int trace_clock_id; 440int trace_clock_id;
@@ -627,6 +572,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
627static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 572static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
628{ 573{
629 int len; 574 int len;
575 void *ret;
630 576
631 if (s->len <= s->readpos) 577 if (s->len <= s->readpos)
632 return -EBUSY; 578 return -EBUSY;
@@ -634,7 +580,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
634 len = s->len - s->readpos; 580 len = s->len - s->readpos;
635 if (cnt > len) 581 if (cnt > len)
636 cnt = len; 582 cnt = len;
637 memcpy(buf, s->buffer + s->readpos, cnt); 583 ret = memcpy(buf, s->buffer + s->readpos, cnt);
584 if (!ret)
585 return -EFAULT;
638 586
639 s->readpos += cnt; 587 s->readpos += cnt;
640 return cnt; 588 return cnt;
@@ -746,6 +694,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
746 694
747 arch_spin_lock(&ftrace_max_lock); 695 arch_spin_lock(&ftrace_max_lock);
748 696
697 ftrace_disable_cpu();
698
749 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 699 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
750 700
751 if (ret == -EBUSY) { 701 if (ret == -EBUSY) {
@@ -759,6 +709,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
759 "Failed to swap buffers due to commit in progress\n"); 709 "Failed to swap buffers due to commit in progress\n");
760 } 710 }
761 711
712 ftrace_enable_cpu();
713
762 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 714 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
763 715
764 __update_max_tr(tr, tsk, cpu); 716 __update_max_tr(tr, tsk, cpu);
@@ -766,40 +718,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
766} 718}
767#endif /* CONFIG_TRACER_MAX_TRACE */ 719#endif /* CONFIG_TRACER_MAX_TRACE */
768 720
769static void default_wait_pipe(struct trace_iterator *iter)
770{
771 DEFINE_WAIT(wait);
772
773 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
774
775 /*
776 * The events can happen in critical sections where
777 * checking a work queue can cause deadlocks.
778 * After adding a task to the queue, this flag is set
779 * only to notify events to try to wake up the queue
780 * using irq_work.
781 *
782 * We don't clear it even if the buffer is no longer
783 * empty. The flag only causes the next event to run
784 * irq_work to do the work queue wake up. The worse
785 * that can happen if we race with !trace_empty() is that
786 * an event will cause an irq_work to try to wake up
787 * an empty queue.
788 *
789 * There's no reason to protect this flag either, as
790 * the work queue and irq_work logic will do the necessary
791 * synchronization for the wake ups. The only thing
792 * that is necessary is that the wake up happens after
793 * a task has been queued. It's OK for spurious wake ups.
794 */
795 trace_wakeup_needed = true;
796
797 if (trace_empty(iter))
798 schedule();
799
800 finish_wait(&trace_wait, &wait);
801}
802
803/** 721/**
804 * register_tracer - register a tracer with the ftrace system. 722 * register_tracer - register a tracer with the ftrace system.
805 * @type - the plugin for the tracer 723 * @type - the plugin for the tracer
@@ -807,6 +725,8 @@ static void default_wait_pipe(struct trace_iterator *iter)
807 * Register a new plugin tracer. 725 * Register a new plugin tracer.
808 */ 726 */
809int register_tracer(struct tracer *type) 727int register_tracer(struct tracer *type)
728__releases(kernel_lock)
729__acquires(kernel_lock)
810{ 730{
811 struct tracer *t; 731 struct tracer *t;
812 int ret = 0; 732 int ret = 0;
@@ -864,8 +784,7 @@ int register_tracer(struct tracer *type)
864 784
865 /* If we expanded the buffers, make sure the max is expanded too */ 785 /* If we expanded the buffers, make sure the max is expanded too */
866 if (ring_buffer_expanded && type->use_max_tr) 786 if (ring_buffer_expanded && type->use_max_tr)
867 ring_buffer_resize(max_tr.buffer, trace_buf_size, 787 ring_buffer_resize(max_tr.buffer, trace_buf_size);
868 RING_BUFFER_ALL_CPUS);
869 788
870 /* the test is responsible for initializing and enabling */ 789 /* the test is responsible for initializing and enabling */
871 pr_info("Testing tracer %s: ", type->name); 790 pr_info("Testing tracer %s: ", type->name);
@@ -874,8 +793,6 @@ int register_tracer(struct tracer *type)
874 current_trace = saved_tracer; 793 current_trace = saved_tracer;
875 if (ret) { 794 if (ret) {
876 printk(KERN_CONT "FAILED!\n"); 795 printk(KERN_CONT "FAILED!\n");
877 /* Add the warning after printing 'FAILED' */
878 WARN_ON(1);
879 goto out; 796 goto out;
880 } 797 }
881 /* Only reset on passing, to avoid touching corrupted buffers */ 798 /* Only reset on passing, to avoid touching corrupted buffers */
@@ -883,8 +800,7 @@ int register_tracer(struct tracer *type)
883 800
884 /* Shrink the max buffer again */ 801 /* Shrink the max buffer again */
885 if (ring_buffer_expanded && type->use_max_tr) 802 if (ring_buffer_expanded && type->use_max_tr)
886 ring_buffer_resize(max_tr.buffer, 1, 803 ring_buffer_resize(max_tr.buffer, 1);
887 RING_BUFFER_ALL_CPUS);
888 804
889 printk(KERN_CONT "PASSED\n"); 805 printk(KERN_CONT "PASSED\n");
890 } 806 }
@@ -918,6 +834,39 @@ int register_tracer(struct tracer *type)
918 return ret; 834 return ret;
919} 835}
920 836
837void unregister_tracer(struct tracer *type)
838{
839 struct tracer **t;
840
841 mutex_lock(&trace_types_lock);
842 for (t = &trace_types; *t; t = &(*t)->next) {
843 if (*t == type)
844 goto found;
845 }
846 pr_info("Tracer %s not registered\n", type->name);
847 goto out;
848
849 found:
850 *t = (*t)->next;
851
852 if (type == current_trace && tracer_enabled) {
853 tracer_enabled = 0;
854 tracing_stop();
855 if (current_trace->stop)
856 current_trace->stop(&global_trace);
857 current_trace = &nop_trace;
858 }
859out:
860 mutex_unlock(&trace_types_lock);
861}
862
863static void __tracing_reset(struct ring_buffer *buffer, int cpu)
864{
865 ftrace_disable_cpu();
866 ring_buffer_reset_cpu(buffer, cpu);
867 ftrace_enable_cpu();
868}
869
921void tracing_reset(struct trace_array *tr, int cpu) 870void tracing_reset(struct trace_array *tr, int cpu)
922{ 871{
923 struct ring_buffer *buffer = tr->buffer; 872 struct ring_buffer *buffer = tr->buffer;
@@ -926,7 +875,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
926 875
927 /* Make sure all commits have finished */ 876 /* Make sure all commits have finished */
928 synchronize_sched(); 877 synchronize_sched();
929 ring_buffer_reset_cpu(buffer, cpu); 878 __tracing_reset(buffer, cpu);
930 879
931 ring_buffer_record_enable(buffer); 880 ring_buffer_record_enable(buffer);
932} 881}
@@ -944,7 +893,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
944 tr->time_start = ftrace_now(tr->cpu); 893 tr->time_start = ftrace_now(tr->cpu);
945 894
946 for_each_online_cpu(cpu) 895 for_each_online_cpu(cpu)
947 ring_buffer_reset_cpu(buffer, cpu); 896 __tracing_reset(buffer, cpu);
948 897
949 ring_buffer_record_enable(buffer); 898 ring_buffer_record_enable(buffer);
950} 899}
@@ -1011,7 +960,7 @@ void tracing_start(void)
1011 if (tracing_disabled) 960 if (tracing_disabled)
1012 return; 961 return;
1013 962
1014 raw_spin_lock_irqsave(&tracing_start_lock, flags); 963 spin_lock_irqsave(&tracing_start_lock, flags);
1015 if (--trace_stop_count) { 964 if (--trace_stop_count) {
1016 if (trace_stop_count < 0) { 965 if (trace_stop_count < 0) {
1017 /* Someone screwed up their debugging */ 966 /* Someone screwed up their debugging */
@@ -1036,7 +985,7 @@ void tracing_start(void)
1036 985
1037 ftrace_start(); 986 ftrace_start();
1038 out: 987 out:
1039 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 988 spin_unlock_irqrestore(&tracing_start_lock, flags);
1040} 989}
1041 990
1042/** 991/**
@@ -1051,7 +1000,7 @@ void tracing_stop(void)
1051 unsigned long flags; 1000 unsigned long flags;
1052 1001
1053 ftrace_stop(); 1002 ftrace_stop();
1054 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1003 spin_lock_irqsave(&tracing_start_lock, flags);
1055 if (trace_stop_count++) 1004 if (trace_stop_count++)
1056 goto out; 1005 goto out;
1057 1006
@@ -1069,7 +1018,7 @@ void tracing_stop(void)
1069 arch_spin_unlock(&ftrace_max_lock); 1018 arch_spin_unlock(&ftrace_max_lock);
1070 1019
1071 out: 1020 out:
1072 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1021 spin_unlock_irqrestore(&tracing_start_lock, flags);
1073} 1022}
1074 1023
1075void trace_stop_cmdline_recording(void); 1024void trace_stop_cmdline_recording(void);
@@ -1148,14 +1097,10 @@ void trace_find_cmdline(int pid, char comm[])
1148 1097
1149void tracing_record_cmdline(struct task_struct *tsk) 1098void tracing_record_cmdline(struct task_struct *tsk)
1150{ 1099{
1151 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) 1100 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
1101 !tracing_is_on())
1152 return; 1102 return;
1153 1103
1154 if (!__this_cpu_read(trace_cmdline_save))
1155 return;
1156
1157 __this_cpu_write(trace_cmdline_save, false);
1158
1159 trace_save_cmdline(tsk); 1104 trace_save_cmdline(tsk);
1160} 1105}
1161 1106
@@ -1199,36 +1144,27 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1199 return event; 1144 return event;
1200} 1145}
1201 1146
1202void
1203__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1204{
1205 __this_cpu_write(trace_cmdline_save, true);
1206 if (trace_wakeup_needed) {
1207 trace_wakeup_needed = false;
1208 /* irq_work_queue() supplies it's own memory barriers */
1209 irq_work_queue(&trace_work_wakeup);
1210 }
1211 ring_buffer_unlock_commit(buffer, event);
1212}
1213
1214static inline void 1147static inline void
1215__trace_buffer_unlock_commit(struct ring_buffer *buffer, 1148__trace_buffer_unlock_commit(struct ring_buffer *buffer,
1216 struct ring_buffer_event *event, 1149 struct ring_buffer_event *event,
1217 unsigned long flags, int pc) 1150 unsigned long flags, int pc,
1151 int wake)
1218{ 1152{
1219 __buffer_unlock_commit(buffer, event); 1153 ring_buffer_unlock_commit(buffer, event);
1220 1154
1221 ftrace_trace_stack(buffer, flags, 6, pc); 1155 ftrace_trace_stack(buffer, flags, 6, pc);
1222 ftrace_trace_userstack(buffer, flags, pc); 1156 ftrace_trace_userstack(buffer, flags, pc);
1157
1158 if (wake)
1159 trace_wake_up();
1223} 1160}
1224 1161
1225void trace_buffer_unlock_commit(struct ring_buffer *buffer, 1162void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1226 struct ring_buffer_event *event, 1163 struct ring_buffer_event *event,
1227 unsigned long flags, int pc) 1164 unsigned long flags, int pc)
1228{ 1165{
1229 __trace_buffer_unlock_commit(buffer, event, flags, pc); 1166 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
1230} 1167}
1231EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1232 1168
1233struct ring_buffer_event * 1169struct ring_buffer_event *
1234trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1170trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1245,21 +1181,29 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1245 struct ring_buffer_event *event, 1181 struct ring_buffer_event *event,
1246 unsigned long flags, int pc) 1182 unsigned long flags, int pc)
1247{ 1183{
1248 __trace_buffer_unlock_commit(buffer, event, flags, pc); 1184 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
1249} 1185}
1250EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1186EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1251 1187
1252void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, 1188void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1253 struct ring_buffer_event *event, 1189 struct ring_buffer_event *event,
1254 unsigned long flags, int pc, 1190 unsigned long flags, int pc)
1255 struct pt_regs *regs) 1191{
1192 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
1193}
1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1256{ 1200{
1257 __buffer_unlock_commit(buffer, event); 1201 ring_buffer_unlock_commit(buffer, event);
1258 1202
1259 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); 1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1260 ftrace_trace_userstack(buffer, flags, pc); 1204 ftrace_trace_userstack(buffer, flags, pc);
1261} 1205}
1262EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); 1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1263 1207
1264void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1265 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
@@ -1291,7 +1235,7 @@ trace_function(struct trace_array *tr,
1291 entry->parent_ip = parent_ip; 1235 entry->parent_ip = parent_ip;
1292 1236
1293 if (!filter_check_discard(call, entry, buffer, event)) 1237 if (!filter_check_discard(call, entry, buffer, event))
1294 __buffer_unlock_commit(buffer, event); 1238 ring_buffer_unlock_commit(buffer, event);
1295} 1239}
1296 1240
1297void 1241void
@@ -1384,7 +1328,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1384 entry->size = trace.nr_entries; 1328 entry->size = trace.nr_entries;
1385 1329
1386 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1387 __buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1388 1332
1389 out: 1333 out:
1390 /* Again, don't let gcc optimize things here */ 1334 /* Again, don't let gcc optimize things here */
@@ -1480,7 +1424,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1480 1424
1481 save_stack_trace_user(&trace); 1425 save_stack_trace_user(&trace);
1482 if (!filter_check_discard(call, entry, buffer, event)) 1426 if (!filter_check_discard(call, entry, buffer, event))
1483 __buffer_unlock_commit(buffer, event); 1427 ring_buffer_unlock_commit(buffer, event);
1484 1428
1485 out_drop_count: 1429 out_drop_count:
1486 __this_cpu_dec(user_stack_count); 1430 __this_cpu_dec(user_stack_count);
@@ -1497,150 +1441,25 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1497 1441
1498#endif /* CONFIG_STACKTRACE */ 1442#endif /* CONFIG_STACKTRACE */
1499 1443
1500/* created for use with alloc_percpu */
1501struct trace_buffer_struct {
1502 char buffer[TRACE_BUF_SIZE];
1503};
1504
1505static struct trace_buffer_struct *trace_percpu_buffer;
1506static struct trace_buffer_struct *trace_percpu_sirq_buffer;
1507static struct trace_buffer_struct *trace_percpu_irq_buffer;
1508static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1509
1510/*
1511 * The buffer used is dependent on the context. There is a per cpu
 1512 * buffer for normal context, softirq context, hard irq context and
 1513 * for NMI context. This allows for lockless recording.
1514 *
1515 * Note, if the buffers failed to be allocated, then this returns NULL
1516 */
1517static char *get_trace_buf(void)
1518{
1519 struct trace_buffer_struct *percpu_buffer;
1520 struct trace_buffer_struct *buffer;
1521
1522 /*
1523 * If we have allocated per cpu buffers, then we do not
1524 * need to do any locking.
1525 */
1526 if (in_nmi())
1527 percpu_buffer = trace_percpu_nmi_buffer;
1528 else if (in_irq())
1529 percpu_buffer = trace_percpu_irq_buffer;
1530 else if (in_softirq())
1531 percpu_buffer = trace_percpu_sirq_buffer;
1532 else
1533 percpu_buffer = trace_percpu_buffer;
1534
1535 if (!percpu_buffer)
1536 return NULL;
1537
1538 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
1539
1540 return buffer->buffer;
1541}
1542
1543static int alloc_percpu_trace_buffer(void)
1544{
1545 struct trace_buffer_struct *buffers;
1546 struct trace_buffer_struct *sirq_buffers;
1547 struct trace_buffer_struct *irq_buffers;
1548 struct trace_buffer_struct *nmi_buffers;
1549
1550 buffers = alloc_percpu(struct trace_buffer_struct);
1551 if (!buffers)
1552 goto err_warn;
1553
1554 sirq_buffers = alloc_percpu(struct trace_buffer_struct);
1555 if (!sirq_buffers)
1556 goto err_sirq;
1557
1558 irq_buffers = alloc_percpu(struct trace_buffer_struct);
1559 if (!irq_buffers)
1560 goto err_irq;
1561
1562 nmi_buffers = alloc_percpu(struct trace_buffer_struct);
1563 if (!nmi_buffers)
1564 goto err_nmi;
1565
1566 trace_percpu_buffer = buffers;
1567 trace_percpu_sirq_buffer = sirq_buffers;
1568 trace_percpu_irq_buffer = irq_buffers;
1569 trace_percpu_nmi_buffer = nmi_buffers;
1570
1571 return 0;
1572
1573 err_nmi:
1574 free_percpu(irq_buffers);
1575 err_irq:
1576 free_percpu(sirq_buffers);
1577 err_sirq:
1578 free_percpu(buffers);
1579 err_warn:
1580 WARN(1, "Could not allocate percpu trace_printk buffer");
1581 return -ENOMEM;
1582}
1583
1584static int buffers_allocated;
1585
1586void trace_printk_init_buffers(void)
1587{
1588 if (buffers_allocated)
1589 return;
1590
1591 if (alloc_percpu_trace_buffer())
1592 return;
1593
1594 pr_info("ftrace: Allocated trace_printk buffers\n");
1595
1596 /* Expand the buffers to set size */
1597 tracing_update_buffers();
1598
1599 buffers_allocated = 1;
1600
1601 /*
1602 * trace_printk_init_buffers() can be called by modules.
1603 * If that happens, then we need to start cmdline recording
1604 * directly here. If the global_trace.buffer is already
1605 * allocated here, then this was called by module code.
1606 */
1607 if (global_trace.buffer)
1608 tracing_start_cmdline_record();
1609}
1610
1611void trace_printk_start_comm(void)
1612{
1613 /* Start tracing comms if trace printk is set */
1614 if (!buffers_allocated)
1615 return;
1616 tracing_start_cmdline_record();
1617}
1618
1619static void trace_printk_start_stop_comm(int enabled)
1620{
1621 if (!buffers_allocated)
1622 return;
1623
1624 if (enabled)
1625 tracing_start_cmdline_record();
1626 else
1627 tracing_stop_cmdline_record();
1628}
1629
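
The block removed in this hunk gave trace_printk() a dedicated scratch buffer per CPU and per context (normal, softirq, hard irq, NMI), so the formatting step never needs a lock. A rough userspace analogue of that idea, with per-thread storage standing in for the kernel's per-CPU, per-context buffers (illustrative sketch only, hypothetical names, build with cc -pthread):

#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>

#define SCRATCH_SIZE 1024

/* One private buffer per thread: no two concurrent writers ever share it. */
static __thread char scratch[SCRATCH_SIZE];

static int demo_printk(const char *fmt, ...)
{
        va_list args;
        int len;

        va_start(args, fmt);
        len = vsnprintf(scratch, SCRATCH_SIZE, fmt, args);
        va_end(args);

        /* The kernel would copy this into the ring buffer; just print it here. */
        fputs(scratch, stdout);
        return len;
}

static void *worker(void *arg)
{
        demo_printk("hello from thread %ld\n", (long)arg);
        return NULL;
}

int main(void)
{
        pthread_t t[4];

        for (long i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}

The design point carries over directly: give every writer that can run concurrently its own buffer, and the formatting itself requires no synchronization at all.
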
1630/** 1444/**
1631 * trace_vbprintk - write binary msg to tracing buffer 1445 * trace_vbprintk - write binary msg to tracing buffer
1632 * 1446 *
1633 */ 1447 */
1634int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1448int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1635{ 1449{
1450 static arch_spinlock_t trace_buf_lock =
1451 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1452 static u32 trace_buf[TRACE_BUF_SIZE];
1453
1636 struct ftrace_event_call *call = &event_bprint; 1454 struct ftrace_event_call *call = &event_bprint;
1637 struct ring_buffer_event *event; 1455 struct ring_buffer_event *event;
1638 struct ring_buffer *buffer; 1456 struct ring_buffer *buffer;
1639 struct trace_array *tr = &global_trace; 1457 struct trace_array *tr = &global_trace;
1458 struct trace_array_cpu *data;
1640 struct bprint_entry *entry; 1459 struct bprint_entry *entry;
1641 unsigned long flags; 1460 unsigned long flags;
1642 char *tbuffer; 1461 int disable;
1643 int len = 0, size, pc; 1462 int cpu, len = 0, size, pc;
1644 1463
1645 if (unlikely(tracing_selftest_running || tracing_disabled)) 1464 if (unlikely(tracing_selftest_running || tracing_disabled))
1646 return 0; 1465 return 0;
@@ -1650,36 +1469,43 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1650 1469
1651 pc = preempt_count(); 1470 pc = preempt_count();
1652 preempt_disable_notrace(); 1471 preempt_disable_notrace();
1472 cpu = raw_smp_processor_id();
1473 data = tr->data[cpu];
1653 1474
1654 tbuffer = get_trace_buf(); 1475 disable = atomic_inc_return(&data->disabled);
1655 if (!tbuffer) { 1476 if (unlikely(disable != 1))
1656 len = 0;
1657 goto out; 1477 goto out;
1658 }
1659 1478
1660 len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); 1479 /* Lockdep uses trace_printk for lock tracing */
1480 local_irq_save(flags);
1481 arch_spin_lock(&trace_buf_lock);
1482 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1661 1483
1662 if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) 1484 if (len > TRACE_BUF_SIZE || len < 0)
1663 goto out; 1485 goto out_unlock;
1664 1486
1665 local_save_flags(flags);
1666 size = sizeof(*entry) + sizeof(u32) * len; 1487 size = sizeof(*entry) + sizeof(u32) * len;
1667 buffer = tr->buffer; 1488 buffer = tr->buffer;
1668 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1489 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1669 flags, pc); 1490 flags, pc);
1670 if (!event) 1491 if (!event)
1671 goto out; 1492 goto out_unlock;
1672 entry = ring_buffer_event_data(event); 1493 entry = ring_buffer_event_data(event);
1673 entry->ip = ip; 1494 entry->ip = ip;
1674 entry->fmt = fmt; 1495 entry->fmt = fmt;
1675 1496
1676 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1497 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1677 if (!filter_check_discard(call, entry, buffer, event)) { 1498 if (!filter_check_discard(call, entry, buffer, event)) {
1678 __buffer_unlock_commit(buffer, event); 1499 ring_buffer_unlock_commit(buffer, event);
1679 ftrace_trace_stack(buffer, flags, 6, pc); 1500 ftrace_trace_stack(buffer, flags, 6, pc);
1680 } 1501 }
1681 1502
1503out_unlock:
1504 arch_spin_unlock(&trace_buf_lock);
1505 local_irq_restore(flags);
1506
1682out: 1507out:
1508 atomic_dec_return(&data->disabled);
1683 preempt_enable_notrace(); 1509 preempt_enable_notrace();
1684 unpause_graph_tracing(); 1510 unpause_graph_tracing();
1685 1511
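
In the restored version of trace_vbprintk() above, re-entrancy is handled with the per-cpu "disabled" counter: the first writer's atomic_inc_return() yields 1 and proceeds, while any nested writer (for example lockdep calling back into trace_printk) sees a larger value and drops its event rather than recursing into trace_buf_lock. A minimal userspace sketch of that guard, assuming C11 atomics and hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int disabled;

static int record_event(const char *msg)
{
        /* First writer brings the count to 1; nested writers see more and bail. */
        if (atomic_fetch_add(&disabled, 1) + 1 != 1) {
                atomic_fetch_sub(&disabled, 1);
                return 0;                       /* event dropped */
        }

        printf("event: %s\n", msg);             /* stands in for the buffer write */

        atomic_fetch_sub(&disabled, 1);
        return 1;
}

int main(void)
{
        record_event("normal path");
        return 0;
}
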
@@ -1705,53 +1531,58 @@ int trace_array_printk(struct trace_array *tr,
1705int trace_array_vprintk(struct trace_array *tr, 1531int trace_array_vprintk(struct trace_array *tr,
1706 unsigned long ip, const char *fmt, va_list args) 1532 unsigned long ip, const char *fmt, va_list args)
1707{ 1533{
1534 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1535 static char trace_buf[TRACE_BUF_SIZE];
1536
1708 struct ftrace_event_call *call = &event_print; 1537 struct ftrace_event_call *call = &event_print;
1709 struct ring_buffer_event *event; 1538 struct ring_buffer_event *event;
1710 struct ring_buffer *buffer; 1539 struct ring_buffer *buffer;
1711 int len = 0, size, pc; 1540 struct trace_array_cpu *data;
1541 int cpu, len = 0, size, pc;
1712 struct print_entry *entry; 1542 struct print_entry *entry;
1713 unsigned long flags; 1543 unsigned long irq_flags;
1714 char *tbuffer; 1544 int disable;
1715 1545
1716 if (tracing_disabled || tracing_selftest_running) 1546 if (tracing_disabled || tracing_selftest_running)
1717 return 0; 1547 return 0;
1718 1548
1719 /* Don't pollute graph traces with trace_vprintk internals */
1720 pause_graph_tracing();
1721
1722 pc = preempt_count(); 1549 pc = preempt_count();
1723 preempt_disable_notrace(); 1550 preempt_disable_notrace();
1551 cpu = raw_smp_processor_id();
1552 data = tr->data[cpu];
1724 1553
1725 1554 disable = atomic_inc_return(&data->disabled);
1726 tbuffer = get_trace_buf(); 1555 if (unlikely(disable != 1))
1727 if (!tbuffer) {
1728 len = 0;
1729 goto out; 1556 goto out;
1730 }
1731 1557
1732 len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); 1558 pause_graph_tracing();
1733 if (len > TRACE_BUF_SIZE) 1559 raw_local_irq_save(irq_flags);
1734 goto out; 1560 arch_spin_lock(&trace_buf_lock);
1561 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1735 1562
1736 local_save_flags(flags);
1737 size = sizeof(*entry) + len + 1; 1563 size = sizeof(*entry) + len + 1;
1738 buffer = tr->buffer; 1564 buffer = tr->buffer;
1739 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1565 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1740 flags, pc); 1566 irq_flags, pc);
1741 if (!event) 1567 if (!event)
1742 goto out; 1568 goto out_unlock;
1743 entry = ring_buffer_event_data(event); 1569 entry = ring_buffer_event_data(event);
1744 entry->ip = ip; 1570 entry->ip = ip;
1745 1571
1746 memcpy(&entry->buf, tbuffer, len); 1572 memcpy(&entry->buf, trace_buf, len);
1747 entry->buf[len] = '\0'; 1573 entry->buf[len] = '\0';
1748 if (!filter_check_discard(call, entry, buffer, event)) { 1574 if (!filter_check_discard(call, entry, buffer, event)) {
1749 __buffer_unlock_commit(buffer, event); 1575 ring_buffer_unlock_commit(buffer, event);
1750 ftrace_trace_stack(buffer, flags, 6, pc); 1576 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1751 } 1577 }
1578
1579 out_unlock:
1580 arch_spin_unlock(&trace_buf_lock);
1581 raw_local_irq_restore(irq_flags);
1582 unpause_graph_tracing();
1752 out: 1583 out:
1584 atomic_dec_return(&data->disabled);
1753 preempt_enable_notrace(); 1585 preempt_enable_notrace();
1754 unpause_graph_tracing();
1755 1586
1756 return len; 1587 return len;
1757} 1588}
@@ -1764,11 +1595,14 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1764 1595
1765static void trace_iterator_increment(struct trace_iterator *iter) 1596static void trace_iterator_increment(struct trace_iterator *iter)
1766{ 1597{
1767 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); 1598 /* Don't allow ftrace to trace into the ring buffers */
1599 ftrace_disable_cpu();
1768 1600
1769 iter->idx++; 1601 iter->idx++;
1770 if (buf_iter) 1602 if (iter->buffer_iter[iter->cpu])
1771 ring_buffer_read(buf_iter, NULL); 1603 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1604
1605 ftrace_enable_cpu();
1772} 1606}
1773 1607
1774static struct trace_entry * 1608static struct trace_entry *
@@ -1776,7 +1610,10 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1776 unsigned long *lost_events) 1610 unsigned long *lost_events)
1777{ 1611{
1778 struct ring_buffer_event *event; 1612 struct ring_buffer_event *event;
1779 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); 1613 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1614
1615 /* Don't allow ftrace to trace into the ring buffers */
1616 ftrace_disable_cpu();
1780 1617
1781 if (buf_iter) 1618 if (buf_iter)
1782 event = ring_buffer_iter_peek(buf_iter, ts); 1619 event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1784,6 +1621,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1784 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 1621 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1785 lost_events); 1622 lost_events);
1786 1623
1624 ftrace_enable_cpu();
1625
1787 if (event) { 1626 if (event) {
1788 iter->ent_size = ring_buffer_event_length(event); 1627 iter->ent_size = ring_buffer_event_length(event);
1789 return ring_buffer_event_data(event); 1628 return ring_buffer_event_data(event);
@@ -1802,7 +1641,6 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1802 int cpu_file = iter->cpu_file; 1641 int cpu_file = iter->cpu_file;
1803 u64 next_ts = 0, ts; 1642 u64 next_ts = 0, ts;
1804 int next_cpu = -1; 1643 int next_cpu = -1;
1805 int next_size = 0;
1806 int cpu; 1644 int cpu;
1807 1645
1808 /* 1646 /*
@@ -1834,12 +1672,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1834 next_cpu = cpu; 1672 next_cpu = cpu;
1835 next_ts = ts; 1673 next_ts = ts;
1836 next_lost = lost_events; 1674 next_lost = lost_events;
1837 next_size = iter->ent_size;
1838 } 1675 }
1839 } 1676 }
1840 1677
1841 iter->ent_size = next_size;
1842
1843 if (ent_cpu) 1678 if (ent_cpu)
1844 *ent_cpu = next_cpu; 1679 *ent_cpu = next_cpu;
1845 1680
@@ -1873,8 +1708,11 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1873 1708
1874static void trace_consume(struct trace_iterator *iter) 1709static void trace_consume(struct trace_iterator *iter)
1875{ 1710{
1711 /* Don't allow ftrace to trace into the ring buffers */
1712 ftrace_disable_cpu();
1876 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 1713 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1877 &iter->lost_events); 1714 &iter->lost_events);
1715 ftrace_enable_cpu();
1878} 1716}
1879 1717
1880static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1718static void *s_next(struct seq_file *m, void *v, loff_t *pos)
@@ -1914,10 +1752,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1914 1752
1915 tr->data[cpu]->skipped_entries = 0; 1753 tr->data[cpu]->skipped_entries = 0;
1916 1754
1917 buf_iter = trace_buffer_iter(iter, cpu); 1755 if (!iter->buffer_iter[cpu])
1918 if (!buf_iter)
1919 return; 1756 return;
1920 1757
1758 buf_iter = iter->buffer_iter[cpu];
1921 ring_buffer_iter_reset(buf_iter); 1759 ring_buffer_iter_reset(buf_iter);
1922 1760
1923 /* 1761 /*
@@ -1963,12 +1801,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1963 iter->cpu = 0; 1801 iter->cpu = 0;
1964 iter->idx = -1; 1802 iter->idx = -1;
1965 1803
1804 ftrace_disable_cpu();
1805
1966 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1806 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1967 for_each_tracing_cpu(cpu) 1807 for_each_tracing_cpu(cpu)
1968 tracing_iter_reset(iter, cpu); 1808 tracing_iter_reset(iter, cpu);
1969 } else 1809 } else
1970 tracing_iter_reset(iter, cpu_file); 1810 tracing_iter_reset(iter, cpu_file);
1971 1811
1812 ftrace_enable_cpu();
1813
1972 iter->leftover = 0; 1814 iter->leftover = 0;
1973 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1815 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1974 ; 1816 ;
@@ -2000,33 +1842,6 @@ static void s_stop(struct seq_file *m, void *p)
2000 trace_event_read_unlock(); 1842 trace_event_read_unlock();
2001} 1843}
2002 1844
2003static void
2004get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
2005{
2006 unsigned long count;
2007 int cpu;
2008
2009 *total = 0;
2010 *entries = 0;
2011
2012 for_each_tracing_cpu(cpu) {
2013 count = ring_buffer_entries_cpu(tr->buffer, cpu);
2014 /*
2015 * If this buffer has skipped entries, then we hold all
2016 * entries for the trace and we need to ignore the
2017 * ones before the time stamp.
2018 */
2019 if (tr->data[cpu]->skipped_entries) {
2020 count -= tr->data[cpu]->skipped_entries;
2021 /* total is the same as the entries */
2022 *total += count;
2023 } else
2024 *total += count +
2025 ring_buffer_overrun_cpu(tr->buffer, cpu);
2026 *entries += count;
2027 }
2028}
2029
2030static void print_lat_help_header(struct seq_file *m) 1845static void print_lat_help_header(struct seq_file *m)
2031{ 1846{
2032 seq_puts(m, "# _------=> CPU# \n"); 1847 seq_puts(m, "# _------=> CPU# \n");
@@ -2039,35 +1854,12 @@ static void print_lat_help_header(struct seq_file *m)
2039 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1854 seq_puts(m, "# \\ / ||||| \\ | / \n");
2040} 1855}
2041 1856
2042static void print_event_info(struct trace_array *tr, struct seq_file *m) 1857static void print_func_help_header(struct seq_file *m)
2043{
2044 unsigned long total;
2045 unsigned long entries;
2046
2047 get_total_entries(tr, &total, &entries);
2048 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2049 entries, total, num_online_cpus());
2050 seq_puts(m, "#\n");
2051}
2052
2053static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
2054{ 1858{
2055 print_event_info(tr, m); 1859 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2056 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2057 seq_puts(m, "# | | | | |\n"); 1860 seq_puts(m, "# | | | | |\n");
2058} 1861}
2059 1862
2060static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
2061{
2062 print_event_info(tr, m);
2063 seq_puts(m, "# _-----=> irqs-off\n");
2064 seq_puts(m, "# / _----=> need-resched\n");
2065 seq_puts(m, "# | / _---=> hardirq/softirq\n");
2066 seq_puts(m, "# || / _--=> preempt-depth\n");
2067 seq_puts(m, "# ||| / delay\n");
2068 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
2069 seq_puts(m, "# | | | |||| | |\n");
2070}
2071 1863
2072void 1864void
2073print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1865print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -2076,14 +1868,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2076 struct trace_array *tr = iter->tr; 1868 struct trace_array *tr = iter->tr;
2077 struct trace_array_cpu *data = tr->data[tr->cpu]; 1869 struct trace_array_cpu *data = tr->data[tr->cpu];
2078 struct tracer *type = current_trace; 1870 struct tracer *type = current_trace;
2079 unsigned long entries; 1871 unsigned long entries = 0;
2080 unsigned long total; 1872 unsigned long total = 0;
1873 unsigned long count;
2081 const char *name = "preemption"; 1874 const char *name = "preemption";
1875 int cpu;
2082 1876
2083 if (type) 1877 if (type)
2084 name = type->name; 1878 name = type->name;
2085 1879
2086 get_total_entries(tr, &total, &entries); 1880
1881 for_each_tracing_cpu(cpu) {
1882 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1883 /*
1884 * If this buffer has skipped entries, then we hold all
1885 * entries for the trace and we need to ignore the
1886 * ones before the time stamp.
1887 */
1888 if (tr->data[cpu]->skipped_entries) {
1889 count -= tr->data[cpu]->skipped_entries;
1890 /* total is the same as the entries */
1891 total += count;
1892 } else
1893 total += count +
1894 ring_buffer_overrun_cpu(tr->buffer, cpu);
1895 entries += count;
1896 }
2087 1897
2088 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1898 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2089 name, UTS_RELEASE); 1899 name, UTS_RELEASE);
@@ -2114,8 +1924,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2114 seq_puts(m, "# -----------------\n"); 1924 seq_puts(m, "# -----------------\n");
2115 seq_printf(m, "# | task: %.16s-%d " 1925 seq_printf(m, "# | task: %.16s-%d "
2116 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 1926 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
2117 data->comm, data->pid, 1927 data->comm, data->pid, data->uid, data->nice,
2118 from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
2119 data->policy, data->rt_priority); 1928 data->policy, data->rt_priority);
2120 seq_puts(m, "# -----------------\n"); 1929 seq_puts(m, "# -----------------\n");
2121 1930
@@ -2264,15 +2073,13 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2264 2073
2265int trace_empty(struct trace_iterator *iter) 2074int trace_empty(struct trace_iterator *iter)
2266{ 2075{
2267 struct ring_buffer_iter *buf_iter;
2268 int cpu; 2076 int cpu;
2269 2077
2270 /* If we are looking at one CPU buffer, only check that one */ 2078 /* If we are looking at one CPU buffer, only check that one */
2271 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2079 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
2272 cpu = iter->cpu_file; 2080 cpu = iter->cpu_file;
2273 buf_iter = trace_buffer_iter(iter, cpu); 2081 if (iter->buffer_iter[cpu]) {
2274 if (buf_iter) { 2082 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
2275 if (!ring_buffer_iter_empty(buf_iter))
2276 return 0; 2083 return 0;
2277 } else { 2084 } else {
2278 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2085 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2282,9 +2089,8 @@ int trace_empty(struct trace_iterator *iter)
2282 } 2089 }
2283 2090
2284 for_each_tracing_cpu(cpu) { 2091 for_each_tracing_cpu(cpu) {
2285 buf_iter = trace_buffer_iter(iter, cpu); 2092 if (iter->buffer_iter[cpu]) {
2286 if (buf_iter) { 2093 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
2287 if (!ring_buffer_iter_empty(buf_iter))
2288 return 0; 2094 return 0;
2289 } else { 2095 } else {
2290 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2096 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2333,21 +2139,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2333 return print_trace_fmt(iter); 2139 return print_trace_fmt(iter);
2334} 2140}
2335 2141
2336void trace_latency_header(struct seq_file *m)
2337{
2338 struct trace_iterator *iter = m->private;
2339
2340 /* print nothing if the buffers are empty */
2341 if (trace_empty(iter))
2342 return;
2343
2344 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2345 print_trace_header(m, iter);
2346
2347 if (!(trace_flags & TRACE_ITER_VERBOSE))
2348 print_lat_help_header(m);
2349}
2350
2351void trace_default_header(struct seq_file *m) 2142void trace_default_header(struct seq_file *m)
2352{ 2143{
2353 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
@@ -2363,23 +2154,11 @@ void trace_default_header(struct seq_file *m)
2363 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2154 if (!(trace_flags & TRACE_ITER_VERBOSE))
2364 print_lat_help_header(m); 2155 print_lat_help_header(m);
2365 } else { 2156 } else {
2366 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2157 if (!(trace_flags & TRACE_ITER_VERBOSE))
2367 if (trace_flags & TRACE_ITER_IRQ_INFO) 2158 print_func_help_header(m);
2368 print_func_help_header_irq(iter->tr, m);
2369 else
2370 print_func_help_header(iter->tr, m);
2371 }
2372 } 2159 }
2373} 2160}
2374 2161
2375static void test_ftrace_alive(struct seq_file *m)
2376{
2377 if (!ftrace_is_dead())
2378 return;
2379 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
2380 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2381}
2382
2383static int s_show(struct seq_file *m, void *v) 2162static int s_show(struct seq_file *m, void *v)
2384{ 2163{
2385 struct trace_iterator *iter = v; 2164 struct trace_iterator *iter = v;
@@ -2389,7 +2168,6 @@ static int s_show(struct seq_file *m, void *v)
2389 if (iter->tr) { 2168 if (iter->tr) {
2390 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2169 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2391 seq_puts(m, "#\n"); 2170 seq_puts(m, "#\n");
2392 test_ftrace_alive(m);
2393 } 2171 }
2394 if (iter->trace && iter->trace->print_header) 2172 if (iter->trace && iter->trace->print_header)
2395 iter->trace->print_header(m); 2173 iter->trace->print_header(m);
@@ -2433,21 +2211,18 @@ static struct trace_iterator *
2433__tracing_open(struct inode *inode, struct file *file) 2211__tracing_open(struct inode *inode, struct file *file)
2434{ 2212{
2435 long cpu_file = (long) inode->i_private; 2213 long cpu_file = (long) inode->i_private;
2214 void *fail_ret = ERR_PTR(-ENOMEM);
2436 struct trace_iterator *iter; 2215 struct trace_iterator *iter;
2437 int cpu; 2216 struct seq_file *m;
2217 int cpu, ret;
2438 2218
2439 if (tracing_disabled) 2219 if (tracing_disabled)
2440 return ERR_PTR(-ENODEV); 2220 return ERR_PTR(-ENODEV);
2441 2221
2442 iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); 2222 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2443 if (!iter) 2223 if (!iter)
2444 return ERR_PTR(-ENOMEM); 2224 return ERR_PTR(-ENOMEM);
2445 2225
2446 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
2447 GFP_KERNEL);
2448 if (!iter->buffer_iter)
2449 goto release;
2450
2451 /* 2226 /*
2452 * We make a copy of the current tracer to avoid concurrent 2227 * We make a copy of the current tracer to avoid concurrent
2453 * changes on it while we are reading. 2228 * changes on it while we are reading.
@@ -2479,10 +2254,6 @@ __tracing_open(struct inode *inode, struct file *file)
2479 if (ring_buffer_overruns(iter->tr->buffer)) 2254 if (ring_buffer_overruns(iter->tr->buffer))
2480 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2255 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2481 2256
2482 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2483 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485
2486 /* stop the trace while dumping */ 2257 /* stop the trace while dumping */
2487 tracing_stop(); 2258 tracing_stop();
2488 2259
@@ -2505,17 +2276,32 @@ __tracing_open(struct inode *inode, struct file *file)
2505 tracing_iter_reset(iter, cpu); 2276 tracing_iter_reset(iter, cpu);
2506 } 2277 }
2507 2278
2279 ret = seq_open(file, &tracer_seq_ops);
2280 if (ret < 0) {
2281 fail_ret = ERR_PTR(ret);
2282 goto fail_buffer;
2283 }
2284
2285 m = file->private_data;
2286 m->private = iter;
2287
2508 mutex_unlock(&trace_types_lock); 2288 mutex_unlock(&trace_types_lock);
2509 2289
2510 return iter; 2290 return iter;
2511 2291
2292 fail_buffer:
2293 for_each_tracing_cpu(cpu) {
2294 if (iter->buffer_iter[cpu])
2295 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2296 }
2297 free_cpumask_var(iter->started);
2298 tracing_start();
2512 fail: 2299 fail:
2513 mutex_unlock(&trace_types_lock); 2300 mutex_unlock(&trace_types_lock);
2514 kfree(iter->trace); 2301 kfree(iter->trace);
2515 kfree(iter->buffer_iter); 2302 kfree(iter);
2516release: 2303
2517 seq_release_private(inode, file); 2304 return fail_ret;
2518 return ERR_PTR(-ENOMEM);
2519} 2305}
2520 2306
2521int tracing_open_generic(struct inode *inode, struct file *filp) 2307int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2551,11 +2337,11 @@ static int tracing_release(struct inode *inode, struct file *file)
2551 tracing_start(); 2337 tracing_start();
2552 mutex_unlock(&trace_types_lock); 2338 mutex_unlock(&trace_types_lock);
2553 2339
2340 seq_release(inode, file);
2554 mutex_destroy(&iter->mutex); 2341 mutex_destroy(&iter->mutex);
2555 free_cpumask_var(iter->started); 2342 free_cpumask_var(iter->started);
2556 kfree(iter->trace); 2343 kfree(iter->trace);
2557 kfree(iter->buffer_iter); 2344 kfree(iter);
2558 seq_release_private(inode, file);
2559 return 0; 2345 return 0;
2560} 2346}
2561 2347
@@ -2741,12 +2527,10 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2741 if (cpumask_test_cpu(cpu, tracing_cpumask) && 2527 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2742 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2528 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2743 atomic_inc(&global_trace.data[cpu]->disabled); 2529 atomic_inc(&global_trace.data[cpu]->disabled);
2744 ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
2745 } 2530 }
2746 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 2531 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2747 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 2532 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2748 atomic_dec(&global_trace.data[cpu]->disabled); 2533 atomic_dec(&global_trace.data[cpu]->disabled);
2749 ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
2750 } 2534 }
2751 } 2535 }
2752 arch_spin_unlock(&ftrace_max_lock); 2536 arch_spin_unlock(&ftrace_max_lock);
@@ -2851,19 +2635,26 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2851 2635
2852 if (mask == TRACE_ITER_OVERWRITE) 2636 if (mask == TRACE_ITER_OVERWRITE)
2853 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2637 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2854
2855 if (mask == TRACE_ITER_PRINTK)
2856 trace_printk_start_stop_comm(enabled);
2857} 2638}
2858 2639
2859static int trace_set_options(char *option) 2640static ssize_t
2641tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2642 size_t cnt, loff_t *ppos)
2860{ 2643{
2644 char buf[64];
2861 char *cmp; 2645 char *cmp;
2862 int neg = 0; 2646 int neg = 0;
2863 int ret = 0; 2647 int ret;
2864 int i; 2648 int i;
2865 2649
2866 cmp = strstrip(option); 2650 if (cnt >= sizeof(buf))
2651 return -EINVAL;
2652
2653 if (copy_from_user(&buf, ubuf, cnt))
2654 return -EFAULT;
2655
2656 buf[cnt] = 0;
2657 cmp = strstrip(buf);
2867 2658
2868 if (strncmp(cmp, "no", 2) == 0) { 2659 if (strncmp(cmp, "no", 2) == 0) {
2869 neg = 1; 2660 neg = 1;
@@ -2882,27 +2673,10 @@ static int trace_set_options(char *option)
2882 mutex_lock(&trace_types_lock); 2673 mutex_lock(&trace_types_lock);
2883 ret = set_tracer_option(current_trace, cmp, neg); 2674 ret = set_tracer_option(current_trace, cmp, neg);
2884 mutex_unlock(&trace_types_lock); 2675 mutex_unlock(&trace_types_lock);
2676 if (ret)
2677 return ret;
2885 } 2678 }
2886 2679
2887 return ret;
2888}
2889
2890static ssize_t
2891tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2892 size_t cnt, loff_t *ppos)
2893{
2894 char buf[64];
2895
2896 if (cnt >= sizeof(buf))
2897 return -EINVAL;
2898
2899 if (copy_from_user(&buf, ubuf, cnt))
2900 return -EFAULT;
2901
2902 buf[cnt] = 0;
2903
2904 trace_set_options(buf);
2905
2906 *ppos += cnt; 2680 *ppos += cnt;
2907 2681
2908 return cnt; 2682 return cnt;
@@ -2927,18 +2701,18 @@ static const char readme_msg[] =
2927 "tracing mini-HOWTO:\n\n" 2701 "tracing mini-HOWTO:\n\n"
2928 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 2702 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2929 "# cat /sys/kernel/debug/tracing/available_tracers\n" 2703 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2930 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 2704 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2931 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2705 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2932 "nop\n" 2706 "nop\n"
2933 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 2707 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2934 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2708 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2935 "wakeup\n" 2709 "sched_switch\n"
2936 "# cat /sys/kernel/debug/tracing/trace_options\n" 2710 "# cat /sys/kernel/debug/tracing/trace_options\n"
2937 "noprint-parent nosym-offset nosym-addr noverbose\n" 2711 "noprint-parent nosym-offset nosym-addr noverbose\n"
2938 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2939 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2940 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2941 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2942; 2716;
2943 2717
2944static ssize_t 2718static ssize_t
@@ -3007,6 +2781,56 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
3007}; 2781};
3008 2782
3009static ssize_t 2783static ssize_t
2784tracing_ctrl_read(struct file *filp, char __user *ubuf,
2785 size_t cnt, loff_t *ppos)
2786{
2787 char buf[64];
2788 int r;
2789
2790 r = sprintf(buf, "%u\n", tracer_enabled);
2791 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2792}
2793
2794static ssize_t
2795tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2796 size_t cnt, loff_t *ppos)
2797{
2798 struct trace_array *tr = filp->private_data;
2799 unsigned long val;
2800 int ret;
2801
2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2803 if (ret)
2804 return ret;
2805
2806 val = !!val;
2807
2808 mutex_lock(&trace_types_lock);
2809 if (tracer_enabled ^ val) {
2810
2811 /* Only need to warn if this is used to change the state */
2812 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2813
2814 if (val) {
2815 tracer_enabled = 1;
2816 if (current_trace->start)
2817 current_trace->start(tr);
2818 tracing_start();
2819 } else {
2820 tracer_enabled = 0;
2821 tracing_stop();
2822 if (current_trace->stop)
2823 current_trace->stop(tr);
2824 }
2825 }
2826 mutex_unlock(&trace_types_lock);
2827
2828 *ppos += cnt;
2829
2830 return cnt;
2831}
2832
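
The tracing_ctrl_read()/tracing_ctrl_write() pair added back here implements the debugfs file tracing_enabled, which this version keeps even though the write handler itself warns that it is deprecated in favour of tracing_on. A small userspace sketch of toggling it, assuming debugfs is mounted at /sys/kernel/debug and the caller has the required privileges:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_tracing_enabled(int on)
{
        int fd = open("/sys/kernel/debug/tracing/tracing_enabled", O_WRONLY);

        if (fd < 0) {
                perror("open tracing_enabled");
                return -1;
        }
        if (write(fd, on ? "1" : "0", 1) != 1)
                perror("write tracing_enabled");
        close(fd);
        return 0;
}

int main(void)
{
        return set_tracing_enabled(1) ? 1 : 0;
}
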
2833static ssize_t
3010tracing_set_trace_read(struct file *filp, char __user *ubuf, 2834tracing_set_trace_read(struct file *filp, char __user *ubuf,
3011 size_t cnt, loff_t *ppos) 2835 size_t cnt, loff_t *ppos)
3012{ 2836{
@@ -3029,39 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
3029 return t->init(tr); 2853 return t->init(tr);
3030} 2854}
3031 2855
3032static void set_buffer_entries(struct trace_array *tr, unsigned long val) 2856static int __tracing_resize_ring_buffer(unsigned long size)
3033{
3034 int cpu;
3035 for_each_tracing_cpu(cpu)
3036 tr->data[cpu]->entries = val;
3037}
3038
3039/* resize @tr's buffer to the size of @size_tr's entries */
3040static int resize_buffer_duplicate_size(struct trace_array *tr,
3041 struct trace_array *size_tr, int cpu_id)
3042{
3043 int cpu, ret = 0;
3044
3045 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3046 for_each_tracing_cpu(cpu) {
3047 ret = ring_buffer_resize(tr->buffer,
3048 size_tr->data[cpu]->entries, cpu);
3049 if (ret < 0)
3050 break;
3051 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3052 }
3053 } else {
3054 ret = ring_buffer_resize(tr->buffer,
3055 size_tr->data[cpu_id]->entries, cpu_id);
3056 if (ret == 0)
3057 tr->data[cpu_id]->entries =
3058 size_tr->data[cpu_id]->entries;
3059 }
3060
3061 return ret;
3062}
3063
3064static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3065{ 2857{
3066 int ret; 2858 int ret;
3067 2859
@@ -3072,21 +2864,19 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3072 */ 2864 */
3073 ring_buffer_expanded = 1; 2865 ring_buffer_expanded = 1;
3074 2866
3075 /* May be called before buffers are initialized */ 2867 ret = ring_buffer_resize(global_trace.buffer, size);
3076 if (!global_trace.buffer)
3077 return 0;
3078
3079 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3080 if (ret < 0) 2868 if (ret < 0)
3081 return ret; 2869 return ret;
3082 2870
3083 if (!current_trace->use_max_tr) 2871 if (!current_trace->use_max_tr)
3084 goto out; 2872 goto out;
3085 2873
3086 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 2874 ret = ring_buffer_resize(max_tr.buffer, size);
3087 if (ret < 0) { 2875 if (ret < 0) {
3088 int r = resize_buffer_duplicate_size(&global_trace, 2876 int r;
3089 &global_trace, cpu); 2877
2878 r = ring_buffer_resize(global_trace.buffer,
2879 global_trace.entries);
3090 if (r < 0) { 2880 if (r < 0) {
3091 /* 2881 /*
3092 * AARGH! We are left with different 2882 * AARGH! We are left with different
@@ -3108,39 +2898,43 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3108 return ret; 2898 return ret;
3109 } 2899 }
3110 2900
3111 if (cpu == RING_BUFFER_ALL_CPUS) 2901 max_tr.entries = size;
3112 set_buffer_entries(&max_tr, size);
3113 else
3114 max_tr.data[cpu]->entries = size;
3115
3116 out: 2902 out:
3117 if (cpu == RING_BUFFER_ALL_CPUS) 2903 global_trace.entries = size;
3118 set_buffer_entries(&global_trace, size);
3119 else
3120 global_trace.data[cpu]->entries = size;
3121 2904
3122 return ret; 2905 return ret;
3123} 2906}
3124 2907
3125static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
3126{ 2909{
3127 int ret = size; 2910 int cpu, ret = size;
3128 2911
3129 mutex_lock(&trace_types_lock); 2912 mutex_lock(&trace_types_lock);
3130 2913
3131 if (cpu_id != RING_BUFFER_ALL_CPUS) { 2914 tracing_stop();
3132 /* make sure this cpu is enabled in the mask */ 2915
3133 if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { 2916 /* disable all cpu buffers */
3134 ret = -EINVAL; 2917 for_each_tracing_cpu(cpu) {
3135 goto out; 2918 if (global_trace.data[cpu])
3136 } 2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
3137 } 2922 }
3138 2923
3139 ret = __tracing_resize_ring_buffer(size, cpu_id); 2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
3140 if (ret < 0) 2927 if (ret < 0)
3141 ret = -ENOMEM; 2928 ret = -ENOMEM;
3142 2929
3143out: 2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
3144 mutex_unlock(&trace_types_lock); 2938 mutex_unlock(&trace_types_lock);
3145 2939
3146 return ret; 2940 return ret;
@@ -3163,8 +2957,7 @@ int tracing_update_buffers(void)
3163 2957
3164 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
3165 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
3166 ret = __tracing_resize_ring_buffer(trace_buf_size, 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
3167 RING_BUFFER_ALL_CPUS);
3168 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
3169 2962
3170 return ret; 2963 return ret;
@@ -3188,8 +2981,7 @@ static int tracing_set_tracer(const char *buf)
3188 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
3189 2982
3190 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
3191 ret = __tracing_resize_ring_buffer(trace_buf_size, 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
3192 RING_BUFFER_ALL_CPUS);
3193 if (ret < 0) 2985 if (ret < 0)
3194 goto out; 2986 goto out;
3195 ret = 0; 2987 ret = 0;
@@ -3215,20 +3007,19 @@ static int tracing_set_tracer(const char *buf)
3215 * The max_tr ring buffer has some state (e.g. ring->clock) and 3007 * The max_tr ring buffer has some state (e.g. ring->clock) and
3216 * we want to preserve it. 3008
3217 */ 3009 */
3218 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3010 ring_buffer_resize(max_tr.buffer, 1);
3219 set_buffer_entries(&max_tr, 1); 3011 max_tr.entries = 1;
3220 } 3012 }
3221 destroy_trace_option_files(topts); 3013 destroy_trace_option_files(topts);
3222 3014
3223 current_trace = &nop_trace; 3015 current_trace = t;
3224 3016
3225 topts = create_trace_option_files(t); 3017 topts = create_trace_option_files(current_trace);
3226 if (t->use_max_tr) { 3018 if (current_trace->use_max_tr) {
3227 /* we need to make per cpu buffer sizes equivalent */ 3019 ret = ring_buffer_resize(max_tr.buffer, global_trace.entries);
3228 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3229 RING_BUFFER_ALL_CPUS);
3230 if (ret < 0) 3020 if (ret < 0)
3231 goto out; 3021 goto out;
3022 max_tr.entries = global_trace.entries;
3232 } 3023 }
3233 3024
3234 if (t->init) { 3025 if (t->init) {
@@ -3237,7 +3028,6 @@ static int tracing_set_tracer(const char *buf)
3237 goto out; 3028 goto out;
3238 } 3029 }
3239 3030
3240 current_trace = t;
3241 trace_branch_enable(tr); 3031 trace_branch_enable(tr);
3242 out: 3032 out:
3243 mutex_unlock(&trace_types_lock); 3033 mutex_unlock(&trace_types_lock);
@@ -3350,10 +3140,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3350 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3140 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3351 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3141 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3352 3142
3353 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3354 if (trace_clocks[trace_clock_id].in_ns)
3355 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3356
3357 iter->cpu_file = cpu_file; 3143 iter->cpu_file = cpu_file;
3358 iter->tr = &global_trace; 3144 iter->tr = &global_trace;
3359 mutex_init(&iter->mutex); 3145 mutex_init(&iter->mutex);
@@ -3414,6 +3200,19 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3414 } 3200 }
3415} 3201}
3416 3202
3203
3204void default_wait_pipe(struct trace_iterator *iter)
3205{
3206 DEFINE_WAIT(wait);
3207
3208 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3209
3210 if (trace_empty(iter))
3211 schedule();
3212
3213 finish_wait(&trace_wait, &wait);
3214}
3215
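
default_wait_pipe(), restored just above, is the usual sleep-until-data pattern: register on the wait queue with prepare_to_wait(), re-check trace_empty(), and only then schedule(), so a wake-up arriving between the check and the sleep cannot be lost. A userspace analogue with POSIX condition variables (hypothetical names; the while loop is the standard condvar idiom that plays the same role as the prepare/check/sleep sequence):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool have_data;

static void wait_for_data(void)                 /* default_wait_pipe() */
{
        pthread_mutex_lock(&lock);
        while (!have_data)                      /* trace_empty() check */
                pthread_cond_wait(&cond, &lock);        /* schedule() */
        pthread_mutex_unlock(&lock);
}

static void data_arrived(void)                  /* the producer's wake-up */
{
        pthread_mutex_lock(&lock);
        have_data = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
}

static void *producer(void *arg)
{
        (void)arg;
        data_arrived();
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        wait_for_data();
        pthread_join(t, NULL);
        return 0;
}
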
3417/* 3216/*
3418 * This is a make-shift waitqueue. 3217 * This is a make-shift waitqueue.
3419 * A tracer might use this callback on some rare cases: 3218 * A tracer might use this callback on some rare cases:
@@ -3462,7 +3261,7 @@ static int tracing_wait_pipe(struct file *filp)
3462 * 3261 *
3463 * iter->pos will be 0 if we haven't read anything. 3262 * iter->pos will be 0 if we haven't read anything.
3464 */ 3263 */
3465 if (!tracing_is_enabled() && iter->pos) 3264 if (!tracer_enabled && iter->pos)
3466 break; 3265 break;
3467 } 3266 }
3468 3267
@@ -3643,7 +3442,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3643 .pages = pages_def, 3442 .pages = pages_def,
3644 .partial = partial_def, 3443 .partial = partial_def,
3645 .nr_pages = 0, /* This gets updated below. */ 3444 .nr_pages = 0, /* This gets updated below. */
3646 .nr_pages_max = PIPE_DEF_BUFFERS,
3647 .flags = flags, 3445 .flags = flags,
3648 .ops = &tracing_pipe_buf_ops, 3446 .ops = &tracing_pipe_buf_ops,
3649 .spd_release = tracing_spd_release_pipe, 3447 .spd_release = tracing_spd_release_pipe,
@@ -3715,7 +3513,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3715 3513
3716 ret = splice_to_pipe(pipe, &spd); 3514 ret = splice_to_pipe(pipe, &spd);
3717out: 3515out:
3718 splice_shrink_spd(&spd); 3516 splice_shrink_spd(pipe, &spd);
3719 return ret; 3517 return ret;
3720 3518
3721out_err: 3519out_err:
@@ -3723,82 +3521,30 @@ out_err:
3723 goto out; 3521 goto out;
3724} 3522}
3725 3523
3726struct ftrace_entries_info {
3727 struct trace_array *tr;
3728 int cpu;
3729};
3730
3731static int tracing_entries_open(struct inode *inode, struct file *filp)
3732{
3733 struct ftrace_entries_info *info;
3734
3735 if (tracing_disabled)
3736 return -ENODEV;
3737
3738 info = kzalloc(sizeof(*info), GFP_KERNEL);
3739 if (!info)
3740 return -ENOMEM;
3741
3742 info->tr = &global_trace;
3743 info->cpu = (unsigned long)inode->i_private;
3744
3745 filp->private_data = info;
3746
3747 return 0;
3748}
3749
3750static ssize_t 3524static ssize_t
3751tracing_entries_read(struct file *filp, char __user *ubuf, 3525tracing_entries_read(struct file *filp, char __user *ubuf,
3752 size_t cnt, loff_t *ppos) 3526 size_t cnt, loff_t *ppos)
3753{ 3527{
3754 struct ftrace_entries_info *info = filp->private_data; 3528 struct trace_array *tr = filp->private_data;
3755 struct trace_array *tr = info->tr; 3529 char buf[96];
3756 char buf[64]; 3530 int r;
3757 int r = 0;
3758 ssize_t ret;
3759 3531
3760 mutex_lock(&trace_types_lock); 3532 mutex_lock(&trace_types_lock);
3761 3533 if (!ring_buffer_expanded)
3762 if (info->cpu == RING_BUFFER_ALL_CPUS) { 3534 r = sprintf(buf, "%lu (expanded: %lu)\n",
3763 int cpu, buf_size_same; 3535 tr->entries >> 10,
3764 unsigned long size; 3536 trace_buf_size >> 10);
3765 3537 else
3766 size = 0; 3538 r = sprintf(buf, "%lu\n", tr->entries >> 10);
3767 buf_size_same = 1;
3768 /* check if all cpu sizes are same */
3769 for_each_tracing_cpu(cpu) {
3770 /* fill in the size from first enabled cpu */
3771 if (size == 0)
3772 size = tr->data[cpu]->entries;
3773 if (size != tr->data[cpu]->entries) {
3774 buf_size_same = 0;
3775 break;
3776 }
3777 }
3778
3779 if (buf_size_same) {
3780 if (!ring_buffer_expanded)
3781 r = sprintf(buf, "%lu (expanded: %lu)\n",
3782 size >> 10,
3783 trace_buf_size >> 10);
3784 else
3785 r = sprintf(buf, "%lu\n", size >> 10);
3786 } else
3787 r = sprintf(buf, "X\n");
3788 } else
3789 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
3790
3791 mutex_unlock(&trace_types_lock); 3539 mutex_unlock(&trace_types_lock);
3792 3540
3793 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3541 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3794 return ret;
3795} 3542}
3796 3543
3797static ssize_t 3544static ssize_t
3798tracing_entries_write(struct file *filp, const char __user *ubuf, 3545tracing_entries_write(struct file *filp, const char __user *ubuf,
3799 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3800{ 3547{
3801 struct ftrace_entries_info *info = filp->private_data;
3802 unsigned long val; 3548 unsigned long val;
3803 int ret; 3549 int ret;
3804 3550
@@ -3813,7 +3559,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3813 /* value is in KB */ 3559 /* value is in KB */
3814 val <<= 10; 3560 val <<= 10;
3815 3561
3816 ret = tracing_resize_ring_buffer(val, info->cpu); 3562 ret = tracing_resize_ring_buffer(val);
3817 if (ret < 0) 3563 if (ret < 0)
3818 return ret; 3564 return ret;
3819 3565
@@ -3822,40 +3568,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3822 return cnt; 3568 return cnt;
3823} 3569}
3824 3570
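
tracing_entries_write() above interprets what userspace writes to buffer_size_kb as a size in kilobytes and shifts it left by 10 before resizing the ring buffer; reading the file back may report "(expanded: N)" until the buffers have actually been expanded. A userspace sketch of driving that interface, where the path and the 4096 KB figure are only examples and debugfs is assumed at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char buf[96];
        ssize_t n;
        int fd = open("/sys/kernel/debug/tracing/buffer_size_kb", O_RDWR);

        if (fd < 0) {
                perror("open buffer_size_kb");
                return 1;
        }
        if (write(fd, "4096\n", 5) < 0)         /* request 4096 KB */
                perror("write buffer_size_kb");
        if (lseek(fd, 0, SEEK_SET) == 0 &&
            (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
                buf[n] = '\0';
                printf("buffer_size_kb now reads: %s", buf);
        }
        close(fd);
        return 0;
}
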
3825static int
3826tracing_entries_release(struct inode *inode, struct file *filp)
3827{
3828 struct ftrace_entries_info *info = filp->private_data;
3829
3830 kfree(info);
3831
3832 return 0;
3833}
3834
3835static ssize_t
3836tracing_total_entries_read(struct file *filp, char __user *ubuf,
3837 size_t cnt, loff_t *ppos)
3838{
3839 struct trace_array *tr = filp->private_data;
3840 char buf[64];
3841 int r, cpu;
3842 unsigned long size = 0, expanded_size = 0;
3843
3844 mutex_lock(&trace_types_lock);
3845 for_each_tracing_cpu(cpu) {
3846 size += tr->data[cpu]->entries >> 10;
3847 if (!ring_buffer_expanded)
3848 expanded_size += trace_buf_size >> 10;
3849 }
3850 if (ring_buffer_expanded)
3851 r = sprintf(buf, "%lu\n", size);
3852 else
3853 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3854 mutex_unlock(&trace_types_lock);
3855
3856 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3857}
3858
3859static ssize_t 3571static ssize_t
3860tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3861 size_t cnt, loff_t *ppos) 3573 size_t cnt, loff_t *ppos)
@@ -3877,112 +3589,56 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3877 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3878 tracing_off(); 3590 tracing_off();
3879 /* resize the ring buffer to 0 */ 3591 /* resize the ring buffer to 0 */
3880 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 3592 tracing_resize_ring_buffer(0);
3881 3593
3882 return 0; 3594 return 0;
3883} 3595}
3884 3596
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3885static ssize_t 3607static ssize_t
3886tracing_mark_write(struct file *filp, const char __user *ubuf, 3608tracing_mark_write(struct file *filp, const char __user *ubuf,
3887 size_t cnt, loff_t *fpos) 3609 size_t cnt, loff_t *fpos)
3888{ 3610{
3889 unsigned long addr = (unsigned long)ubuf; 3611 char *buf;
3890 struct ring_buffer_event *event; 3612 size_t written;
3891 struct ring_buffer *buffer;
3892 struct print_entry *entry;
3893 unsigned long irq_flags;
3894 struct page *pages[2];
3895 void *map_page[2];
3896 int nr_pages = 1;
3897 ssize_t written;
3898 int offset;
3899 int size;
3900 int len;
3901 int ret;
3902 int i;
3903 3613
3904 if (tracing_disabled) 3614 if (tracing_disabled)
3905 return -EINVAL; 3615 return -EINVAL;
3906 3616
3907 if (!(trace_flags & TRACE_ITER_MARKERS))
3908 return -EINVAL;
3909
3910 if (cnt > TRACE_BUF_SIZE) 3617 if (cnt > TRACE_BUF_SIZE)
3911 cnt = TRACE_BUF_SIZE; 3618 cnt = TRACE_BUF_SIZE;
3912 3619
3913 /* 3620 buf = kmalloc(cnt + 2, GFP_KERNEL);
3914 * Userspace is injecting traces into the kernel trace buffer. 3621 if (buf == NULL)
3915 * We want to be as non intrusive as possible. 3622 return -ENOMEM;
3916 * To do so, we do not want to allocate any special buffers
3917 * or take any locks, but instead write the userspace data
3918 * straight into the ring buffer.
3919 *
3920 * First we need to pin the userspace buffer into memory,
3921 * which it most likely already is, because the caller just referenced it.
3922 * But there's no guarantee that it is. By using get_user_pages_fast()
3923 * and kmap_atomic/kunmap_atomic() we can get access to the
3924 * pages directly. We then write the data directly into the
3925 * ring buffer.
3926 */
3927 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3928
3929 /* check if we cross pages */
3930 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3931 nr_pages = 2;
3932
3933 offset = addr & (PAGE_SIZE - 1);
3934 addr &= PAGE_MASK;
3935
3936 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3937 if (ret < nr_pages) {
3938 while (--ret >= 0)
3939 put_page(pages[ret]);
3940 written = -EFAULT;
3941 goto out;
3942 }
3943
3944 for (i = 0; i < nr_pages; i++)
3945 map_page[i] = kmap_atomic(pages[i]);
3946 3623
3947 local_save_flags(irq_flags); 3624 if (copy_from_user(buf, ubuf, cnt)) {
3948 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 3625 kfree(buf);
3949 buffer = global_trace.buffer; 3626 return -EFAULT;
3950 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3951 irq_flags, preempt_count());
3952 if (!event) {
3953 /* Ring buffer disabled, return as if not open for write */
3954 written = -EBADF;
3955 goto out_unlock;
3956 } 3627 }
3957 3628 if (buf[cnt-1] != '\n') {
3958 entry = ring_buffer_event_data(event); 3629 buf[cnt] = '\n';
3959 entry->ip = _THIS_IP_; 3630 buf[cnt+1] = '\0';
3960
3961 if (nr_pages == 2) {
3962 len = PAGE_SIZE - offset;
3963 memcpy(&entry->buf, map_page[0] + offset, len);
3964 memcpy(&entry->buf[len], map_page[1], cnt - len);
3965 } else
3966 memcpy(&entry->buf, map_page[0] + offset, cnt);
3967
3968 if (entry->buf[cnt - 1] != '\n') {
3969 entry->buf[cnt] = '\n';
3970 entry->buf[cnt + 1] = '\0';
3971 } else 3631 } else
3972 entry->buf[cnt] = '\0'; 3632 buf[cnt] = '\0';
3973
3974 __buffer_unlock_commit(buffer, event);
3975
3976 written = cnt;
3977 3633
3634 written = mark_printk("%s", buf);
3635 kfree(buf);
3978 *fpos += written; 3636 *fpos += written;
3979 3637
3980 out_unlock: 3638 /* don't tell userspace we wrote more - it might confuse them */
3981 for (i = 0; i < nr_pages; i++){ 3639 if (written > cnt)
3982 kunmap_atomic(map_page[i]); 3640 written = cnt;
3983 put_page(pages[i]); 3641
3984 }
3985 out:
3986 return written; 3642 return written;
3987} 3643}
3988 3644
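
tracing_mark_write() services writes to the trace_marker file: the restored version copies the user string into a temporary kernel buffer and feeds it through trace_vprintk(), while the removed version pinned the user pages and wrote them straight into the ring buffer. From userspace the interface is the same either way; a minimal sketch, assuming debugfs at /sys/kernel/debug and sufficient privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *msg = "hello from userspace\n";
        int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

        if (fd < 0) {
                perror("open trace_marker");
                return 1;
        }
        if (write(fd, msg, strlen(msg)) < 0)
                perror("write trace_marker");
        close(fd);
        return 0;
}
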
@@ -4032,14 +3688,6 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4032 if (max_tr.buffer) 3688 if (max_tr.buffer)
4033 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 3689 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4034 3690
4035 /*
4036 * New clock may not be consistent with the previous clock.
4037 * Reset the buffer so that it doesn't have incomparable timestamps.
4038 */
4039 tracing_reset_online_cpus(&global_trace);
4040 if (max_tr.buffer)
4041 tracing_reset_online_cpus(&max_tr);
4042
4043 mutex_unlock(&trace_types_lock); 3691 mutex_unlock(&trace_types_lock);
4044 3692
4045 *fpos += cnt; 3693 *fpos += cnt;
@@ -4061,6 +3709,13 @@ static const struct file_operations tracing_max_lat_fops = {
4061 .llseek = generic_file_llseek, 3709 .llseek = generic_file_llseek,
4062}; 3710};
4063 3711
3712static const struct file_operations tracing_ctrl_fops = {
3713 .open = tracing_open_generic,
3714 .read = tracing_ctrl_read,
3715 .write = tracing_ctrl_write,
3716 .llseek = generic_file_llseek,
3717};
3718
4064static const struct file_operations set_tracer_fops = { 3719static const struct file_operations set_tracer_fops = {
4065 .open = tracing_open_generic, 3720 .open = tracing_open_generic,
4066 .read = tracing_set_trace_read, 3721 .read = tracing_set_trace_read,
@@ -4078,16 +3733,9 @@ static const struct file_operations tracing_pipe_fops = {
4078}; 3733};
4079 3734
4080static const struct file_operations tracing_entries_fops = { 3735static const struct file_operations tracing_entries_fops = {
4081 .open = tracing_entries_open, 3736 .open = tracing_open_generic,
4082 .read = tracing_entries_read, 3737 .read = tracing_entries_read,
4083 .write = tracing_entries_write, 3738 .write = tracing_entries_write,
4084 .release = tracing_entries_release,
4085 .llseek = generic_file_llseek,
4086};
4087
4088static const struct file_operations tracing_total_entries_fops = {
4089 .open = tracing_open_generic,
4090 .read = tracing_total_entries_read,
4091 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
4092}; 3740};
4093 3741
@@ -4217,6 +3865,12 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
4217 buf->private = 0; 3865 buf->private = 0;
4218} 3866}
4219 3867
3868static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
3869 struct pipe_buffer *buf)
3870{
3871 return 1;
3872}
3873
4220static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 3874static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
4221 struct pipe_buffer *buf) 3875 struct pipe_buffer *buf)
4222{ 3876{
@@ -4232,7 +3886,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
4232 .unmap = generic_pipe_buf_unmap, 3886 .unmap = generic_pipe_buf_unmap,
4233 .confirm = generic_pipe_buf_confirm, 3887 .confirm = generic_pipe_buf_confirm,
4234 .release = buffer_pipe_buf_release, 3888 .release = buffer_pipe_buf_release,
4235 .steal = generic_pipe_buf_steal, 3889 .steal = buffer_pipe_buf_steal,
4236 .get = buffer_pipe_buf_get, 3890 .get = buffer_pipe_buf_get,
4237}; 3891};
4238 3892
@@ -4264,7 +3918,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4264 struct splice_pipe_desc spd = { 3918 struct splice_pipe_desc spd = {
4265 .pages = pages_def, 3919 .pages = pages_def,
4266 .partial = partial_def, 3920 .partial = partial_def,
4267 .nr_pages_max = PIPE_DEF_BUFFERS,
4268 .flags = flags, 3921 .flags = flags,
4269 .ops = &buffer_pipe_buf_ops, 3922 .ops = &buffer_pipe_buf_ops,
4270 .spd_release = buffer_spd_release, 3923 .spd_release = buffer_spd_release,
@@ -4277,11 +3930,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4277 return -ENOMEM; 3930 return -ENOMEM;
4278 3931
4279 if (*ppos & (PAGE_SIZE - 1)) { 3932 if (*ppos & (PAGE_SIZE - 1)) {
3933 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4280 ret = -EINVAL; 3934 ret = -EINVAL;
4281 goto out; 3935 goto out;
4282 } 3936 }
4283 3937
4284 if (len & (PAGE_SIZE - 1)) { 3938 if (len & (PAGE_SIZE - 1)) {
3939 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4285 if (len < PAGE_SIZE) { 3940 if (len < PAGE_SIZE) {
4286 ret = -EINVAL; 3941 ret = -EINVAL;
4287 goto out; 3942 goto out;
@@ -4350,7 +4005,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4350 } 4005 }
4351 4006
4352 ret = splice_to_pipe(pipe, &spd); 4007 ret = splice_to_pipe(pipe, &spd);
4353 splice_shrink_spd(&spd); 4008 splice_shrink_spd(pipe, &spd);
4354out: 4009out:
4355 return ret; 4010 return ret;
4356} 4011}
@@ -4371,8 +4026,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4371 struct trace_array *tr = &global_trace; 4026 struct trace_array *tr = &global_trace;
4372 struct trace_seq *s; 4027 struct trace_seq *s;
4373 unsigned long cnt; 4028 unsigned long cnt;
4374 unsigned long long t;
4375 unsigned long usec_rem;
4376 4029
4377 s = kmalloc(sizeof(*s), GFP_KERNEL); 4030 s = kmalloc(sizeof(*s), GFP_KERNEL);
4378 if (!s) 4031 if (!s)
@@ -4389,31 +4042,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4389 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4390 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4391 4044
4392 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4393 trace_seq_printf(s, "bytes: %ld\n", cnt);
4394
4395 if (trace_clocks[trace_clock_id].in_ns) {
4396 /* local or global for trace_clock */
4397 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4398 usec_rem = do_div(t, USEC_PER_SEC);
4399 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4400 t, usec_rem);
4401
4402 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4403 usec_rem = do_div(t, USEC_PER_SEC);
4404 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4405 } else {
4406 /* counter or tsc mode for trace_clock */
4407 trace_seq_printf(s, "oldest event ts: %llu\n",
4408 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4409
4410 trace_seq_printf(s, "now ts: %llu\n",
4411 ring_buffer_time_stamp(tr->buffer, cpu));
4412 }
4413
4414 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4415 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4416
4417 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4418 4046
4419 kfree(s); 4047 kfree(s);
@@ -4520,9 +4148,6 @@ static void tracing_init_debugfs_percpu(long cpu)
4520 struct dentry *d_cpu; 4148 struct dentry *d_cpu;
4521 char cpu_dir[30]; /* 30 characters should be more than enough */ 4149 char cpu_dir[30]; /* 30 characters should be more than enough */
4522 4150
4523 if (!d_percpu)
4524 return;
4525
4526 snprintf(cpu_dir, 30, "cpu%ld", cpu); 4151 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4527 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4152 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4528 if (!d_cpu) { 4153 if (!d_cpu) {
@@ -4543,9 +4168,6 @@ static void tracing_init_debugfs_percpu(long cpu)
4543 4168
4544 trace_create_file("stats", 0444, d_cpu, 4169 trace_create_file("stats", 0444, d_cpu,
4545 (void *) cpu, &tracing_stats_fops); 4170 (void *) cpu, &tracing_stats_fops);
4546
4547 trace_create_file("buffer_size_kb", 0444, d_cpu,
4548 (void *) cpu, &tracing_entries_fops);
4549} 4171}
4550 4172
4551#ifdef CONFIG_FTRACE_SELFTEST 4173#ifdef CONFIG_FTRACE_SELFTEST
@@ -4655,7 +4277,7 @@ static const struct file_operations trace_options_core_fops = {
4655}; 4277};
4656 4278
4657struct dentry *trace_create_file(const char *name, 4279struct dentry *trace_create_file(const char *name,
4658 umode_t mode, 4280 mode_t mode,
4659 struct dentry *parent, 4281 struct dentry *parent,
4660 void *data, 4282 void *data,
4661 const struct file_operations *fops) 4283 const struct file_operations *fops)
@@ -4784,64 +4406,6 @@ static __init void create_trace_options_dir(void)
4784 create_trace_option_core_file(trace_options[i], i); 4406 create_trace_option_core_file(trace_options[i], i);
4785} 4407}
4786 4408
4787static ssize_t
4788rb_simple_read(struct file *filp, char __user *ubuf,
4789 size_t cnt, loff_t *ppos)
4790{
4791 struct trace_array *tr = filp->private_data;
4792 struct ring_buffer *buffer = tr->buffer;
4793 char buf[64];
4794 int r;
4795
4796 if (buffer)
4797 r = ring_buffer_record_is_on(buffer);
4798 else
4799 r = 0;
4800
4801 r = sprintf(buf, "%d\n", r);
4802
4803 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4804}
4805
4806static ssize_t
4807rb_simple_write(struct file *filp, const char __user *ubuf,
4808 size_t cnt, loff_t *ppos)
4809{
4810 struct trace_array *tr = filp->private_data;
4811 struct ring_buffer *buffer = tr->buffer;
4812 unsigned long val;
4813 int ret;
4814
4815 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4816 if (ret)
4817 return ret;
4818
4819 if (buffer) {
4820 mutex_lock(&trace_types_lock);
4821 if (val) {
4822 ring_buffer_record_on(buffer);
4823 if (current_trace->start)
4824 current_trace->start(tr);
4825 } else {
4826 ring_buffer_record_off(buffer);
4827 if (current_trace->stop)
4828 current_trace->stop(tr);
4829 }
4830 mutex_unlock(&trace_types_lock);
4831 }
4832
4833 (*ppos)++;
4834
4835 return cnt;
4836}
4837
4838static const struct file_operations rb_simple_fops = {
4839 .open = tracing_open_generic,
4840 .read = rb_simple_read,
4841 .write = rb_simple_write,
4842 .llseek = default_llseek,
4843};
4844
4845static __init int tracer_init_debugfs(void) 4409static __init int tracer_init_debugfs(void)
4846{ 4410{
4847 struct dentry *d_tracer; 4411 struct dentry *d_tracer;
@@ -4851,6 +4415,9 @@ static __init int tracer_init_debugfs(void)
4851 4415
4852 d_tracer = tracing_init_dentry(); 4416 d_tracer = tracing_init_dentry();
4853 4417
4418 trace_create_file("tracing_enabled", 0644, d_tracer,
4419 &global_trace, &tracing_ctrl_fops);
4420
4854 trace_create_file("trace_options", 0644, d_tracer, 4421 trace_create_file("trace_options", 0644, d_tracer,
4855 NULL, &tracing_iter_fops); 4422 NULL, &tracing_iter_fops);
4856 4423
@@ -4881,10 +4448,7 @@ static __init int tracer_init_debugfs(void)
4881 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4448 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
4882 4449
4883 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4884 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4885
4886 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4887 &global_trace, &tracing_total_entries_fops);
4888 4452
4889 trace_create_file("free_buffer", 0644, d_tracer, 4453 trace_create_file("free_buffer", 0644, d_tracer,
4890 &global_trace, &tracing_free_buffer_fops); 4454 &global_trace, &tracing_free_buffer_fops);
@@ -4898,9 +4462,6 @@ static __init int tracer_init_debugfs(void)
4898 trace_create_file("trace_clock", 0644, d_tracer, NULL, 4462 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4899 &trace_clock_fops); 4463 &trace_clock_fops);
4900 4464
4901 trace_create_file("tracing_on", 0644, d_tracer,
4902 &global_trace, &rb_simple_fops);
4903
4904#ifdef CONFIG_DYNAMIC_FTRACE 4465#ifdef CONFIG_DYNAMIC_FTRACE
4905 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4466 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4906 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4467 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -5005,12 +4566,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5005 4566
5006 tracing_off(); 4567 tracing_off();
5007 4568
5008 /* Did function tracer already get disabled? */
5009 if (ftrace_is_dead()) {
5010 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5011 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5012 }
5013
5014 if (disable_tracing) 4569 if (disable_tracing)
5015 ftrace_kill(); 4570 ftrace_kill();
5016 4571
@@ -5073,7 +4628,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5073 if (ret != TRACE_TYPE_NO_CONSUME) 4628 if (ret != TRACE_TYPE_NO_CONSUME)
5074 trace_consume(&iter); 4629 trace_consume(&iter);
5075 } 4630 }
5076 touch_nmi_watchdog();
5077 4631
5078 trace_printk_seq(&iter.seq); 4632 trace_printk_seq(&iter.seq);
5079 } 4633 }
@@ -5104,7 +4658,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5104{ 4658{
5105 __ftrace_dump(true, oops_dump_mode); 4659 __ftrace_dump(true, oops_dump_mode);
5106} 4660}
5107EXPORT_SYMBOL_GPL(ftrace_dump);
5108 4661
5109__init static int tracer_alloc_buffers(void) 4662__init static int tracer_alloc_buffers(void)
5110{ 4663{
@@ -5120,11 +4673,6 @@ __init static int tracer_alloc_buffers(void)
5120 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4673 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
5121 goto out_free_buffer_mask; 4674 goto out_free_buffer_mask;
5122 4675
5123 /* Only allocate trace_printk buffers if a trace_printk exists */
5124 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5125 /* Must be called before global_trace.buffer is allocated */
5126 trace_printk_init_buffers();
5127
5128 /* To save memory, keep the ring buffer size to its minimum */ 4676 /* To save memory, keep the ring buffer size to its minimum */
5129 if (ring_buffer_expanded) 4677 if (ring_buffer_expanded)
5130 ring_buf_size = trace_buf_size; 4678 ring_buf_size = trace_buf_size;
@@ -5143,8 +4691,7 @@ __init static int tracer_alloc_buffers(void)
5143 WARN_ON(1); 4691 WARN_ON(1);
5144 goto out_free_cpumask; 4692 goto out_free_cpumask;
5145 } 4693 }
5146 if (global_trace.buffer_disabled) 4694 global_trace.entries = ring_buffer_size(global_trace.buffer);
5147 tracing_off();
5148 4695
5149 4696
5150#ifdef CONFIG_TRACER_MAX_TRACE 4697#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5155,6 +4702,7 @@ __init static int tracer_alloc_buffers(void)
5155 ring_buffer_free(global_trace.buffer); 4702 ring_buffer_free(global_trace.buffer);
5156 goto out_free_cpumask; 4703 goto out_free_cpumask;
5157 } 4704 }
4705 max_tr.entries = 1;
5158#endif 4706#endif
5159 4707
5160 /* Allocate the first page for all buffers */ 4708 /* Allocate the first page for all buffers */
@@ -5163,14 +4711,7 @@ __init static int tracer_alloc_buffers(void)
5163 max_tr.data[i] = &per_cpu(max_tr_data, i); 4711 max_tr.data[i] = &per_cpu(max_tr_data, i);
5164 } 4712 }
5165 4713
5166 set_buffer_entries(&global_trace,
5167 ring_buffer_size(global_trace.buffer, 0));
5168#ifdef CONFIG_TRACER_MAX_TRACE
5169 set_buffer_entries(&max_tr, 1);
5170#endif
5171
5172 trace_init_cmdlines(); 4714 trace_init_cmdlines();
5173 init_irq_work(&trace_work_wakeup, trace_wake_up);
5174 4715
5175 register_tracer(&nop_trace); 4716 register_tracer(&nop_trace);
5176 current_trace = &nop_trace; 4717 current_trace = &nop_trace;
@@ -5182,13 +4723,6 @@ __init static int tracer_alloc_buffers(void)
5182 4723
5183 register_die_notifier(&trace_die_notifier); 4724 register_die_notifier(&trace_die_notifier);
5184 4725
5185 while (trace_boot_options) {
5186 char *option;
5187
5188 option = strsep(&trace_boot_options, ",");
5189 trace_set_options(option);
5190 }
5191
5192 return 0; 4726 return 0;
5193 4727
5194out_free_cpumask: 4728out_free_cpumask:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902..616846bcfee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,23 +56,17 @@ enum trace_type {
56#define F_STRUCT(args...) args 56#define F_STRUCT(args...) args
57 57
58#undef FTRACE_ENTRY 58#undef FTRACE_ENTRY
59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
60 struct struct_name { \ 60 struct struct_name { \
61 struct trace_entry ent; \ 61 struct trace_entry ent; \
62 tstruct \ 62 tstruct \
63 } 63 }
64 64
65#undef TP_ARGS 65#undef TP_ARGS
66#define TP_ARGS(args...) args 66#define TP_ARGS(args...) args
67 67
68#undef FTRACE_ENTRY_DUP 68#undef FTRACE_ENTRY_DUP
69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) 69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
70
71#undef FTRACE_ENTRY_REG
72#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
73 filter, regfn) \
74 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
75 filter)
76 70
77#include "trace_entries.h" 71#include "trace_entries.h"
78 72
@@ -103,11 +97,6 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 97 unsigned long ret_ip;
104}; 98};
105 99
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
111/* 100/*
112 * trace_flag_type is an enumeration that holds different 101 * trace_flag_type is an enumeration that holds different
113 * states when a trace occurs. These are: 102 * states when a trace occurs. These are:
@@ -136,7 +125,6 @@ struct trace_array_cpu {
136 atomic_t disabled; 125 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 126 void *buffer_page; /* ring buffer spare */
138 127
139 unsigned long entries;
140 unsigned long saved_latency; 128 unsigned long saved_latency;
141 unsigned long critical_start; 129 unsigned long critical_start;
142 unsigned long critical_end; 130 unsigned long critical_end;
@@ -147,7 +135,7 @@ struct trace_array_cpu {
147 unsigned long skipped_entries; 135 unsigned long skipped_entries;
148 cycle_t preempt_timestamp; 136 cycle_t preempt_timestamp;
149 pid_t pid; 137 pid_t pid;
150 kuid_t uid; 138 uid_t uid;
151 char comm[TASK_COMM_LEN]; 139 char comm[TASK_COMM_LEN];
152}; 140};
153 141
@@ -158,8 +146,8 @@ struct trace_array_cpu {
158 */ 146 */
159struct trace_array { 147struct trace_array {
160 struct ring_buffer *buffer; 148 struct ring_buffer *buffer;
149 unsigned long entries;
161 int cpu; 150 int cpu;
162 int buffer_disabled;
163 cycle_t time_start; 151 cycle_t time_start;
164 struct task_struct *waiter; 152 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 153 struct trace_array_cpu *data[NR_CPUS];
@@ -285,8 +273,8 @@ struct tracer {
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 struct tracer *next; 274 struct tracer *next;
287 struct tracer_flags *flags; 275 struct tracer_flags *flags;
288 bool print_max; 276 int print_max;
289 bool use_max_tr; 277 int use_max_tr;
290}; 278};
291 279
292 280
@@ -300,8 +288,6 @@ struct tracer {
300/* for function tracing recursion */ 288/* for function tracing recursion */
301#define TRACE_INTERNAL_BIT (1<<11) 289#define TRACE_INTERNAL_BIT (1<<11)
302#define TRACE_GLOBAL_BIT (1<<12) 290#define TRACE_GLOBAL_BIT (1<<12)
303#define TRACE_CONTROL_BIT (1<<13)
304
305/* 291/*
306 * Abuse of the trace_recursion. 292 * Abuse of the trace_recursion.
307 * As we need a way to maintain state if we are tracing the function 293 * As we need a way to maintain state if we are tracing the function
@@ -317,23 +303,16 @@ struct tracer {
317 303
318#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
319 305
320static inline struct ring_buffer_iter *
321trace_buffer_iter(struct trace_iterator *iter, int cpu)
322{
323 if (iter->buffer_iter && iter->buffer_iter[cpu])
324 return iter->buffer_iter[cpu];
325 return NULL;
326}
327
328int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 307int tracing_is_enabled(void);
308void trace_wake_up(void);
330void tracing_reset(struct trace_array *tr, int cpu); 309void tracing_reset(struct trace_array *tr, int cpu);
331void tracing_reset_online_cpus(struct trace_array *tr); 310void tracing_reset_online_cpus(struct trace_array *tr);
332void tracing_reset_current(int cpu); 311void tracing_reset_current(int cpu);
333void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
334int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
335struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
336 umode_t mode, 315 mode_t mode,
337 struct dentry *parent, 316 struct dentry *parent,
338 void *data, 317 void *data,
339 const struct file_operations *fops); 318 const struct file_operations *fops);
@@ -348,6 +327,9 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
348 unsigned long len, 327 unsigned long len,
349 unsigned long flags, 328 unsigned long flags,
350 int pc); 329 int pc);
330void trace_buffer_unlock_commit(struct ring_buffer *buffer,
331 struct ring_buffer_event *event,
332 unsigned long flags, int pc);
351 333
352struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 334struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
353 struct trace_array_cpu *data); 335 struct trace_array_cpu *data);
@@ -355,9 +337,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
355struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 337struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
356 int *ent_cpu, u64 *ent_ts); 338 int *ent_cpu, u64 *ent_ts);
357 339
358void __buffer_unlock_commit(struct ring_buffer *buffer,
359 struct ring_buffer_event *event);
360
361int trace_empty(struct trace_iterator *iter); 340int trace_empty(struct trace_iterator *iter);
362 341
363void *trace_find_next_entry_inc(struct trace_iterator *iter); 342void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -366,6 +345,7 @@ void trace_init_global_iter(struct trace_iterator *iter);
366 345
367void tracing_iter_reset(struct trace_iterator *iter, int cpu); 346void tracing_iter_reset(struct trace_iterator *iter, int cpu);
368 347
348void default_wait_pipe(struct trace_iterator *iter);
369void poll_wait_pipe(struct trace_iterator *iter); 349void poll_wait_pipe(struct trace_iterator *iter);
370 350
371void ftrace(struct trace_array *tr, 351void ftrace(struct trace_array *tr,
@@ -390,7 +370,6 @@ void trace_graph_function(struct trace_array *tr,
390 unsigned long ip, 370 unsigned long ip,
391 unsigned long parent_ip, 371 unsigned long parent_ip,
392 unsigned long flags, int pc); 372 unsigned long flags, int pc);
393void trace_latency_header(struct seq_file *m);
394void trace_default_header(struct seq_file *m); 373void trace_default_header(struct seq_file *m);
395void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 374void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
396int trace_empty(struct trace_iterator *iter); 375int trace_empty(struct trace_iterator *iter);
@@ -405,7 +384,12 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
405void tracing_stop_sched_switch_record(void); 384void tracing_stop_sched_switch_record(void);
406void tracing_start_sched_switch_record(void); 385void tracing_start_sched_switch_record(void);
407int register_tracer(struct tracer *type); 386int register_tracer(struct tracer *type);
387void unregister_tracer(struct tracer *type);
408int is_tracing_stopped(void); 388int is_tracing_stopped(void);
389enum trace_file_type {
390 TRACE_FILE_LAT_FMT = 1,
391 TRACE_FILE_ANNOTATE = 2,
392};
409 393
410extern cpumask_var_t __read_mostly tracing_buffer_mask; 394extern cpumask_var_t __read_mostly tracing_buffer_mask;
411 395
@@ -465,11 +449,11 @@ extern void trace_find_cmdline(int pid, char comm[]);
465 449
466#ifdef CONFIG_DYNAMIC_FTRACE 450#ifdef CONFIG_DYNAMIC_FTRACE
467extern unsigned long ftrace_update_tot_cnt; 451extern unsigned long ftrace_update_tot_cnt;
468#endif
469#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 452#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
470extern int DYN_FTRACE_TEST_NAME(void); 453extern int DYN_FTRACE_TEST_NAME(void);
471#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 454#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
472extern int DYN_FTRACE_TEST_NAME2(void); 455extern int DYN_FTRACE_TEST_NAME2(void);
456#endif
473 457
474extern int ring_buffer_expanded; 458extern int ring_buffer_expanded;
475extern bool tracing_selftest_disabled; 459extern bool tracing_selftest_disabled;
@@ -595,17 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
595 579
596 return test_tsk_trace_trace(task); 580 return test_tsk_trace_trace(task);
597} 581}
598extern int ftrace_is_dead(void);
599#else 582#else
600static inline int ftrace_trace_task(struct task_struct *task) 583static inline int ftrace_trace_task(struct task_struct *task)
601{ 584{
602 return 1; 585 return 1;
603} 586}
604static inline int ftrace_is_dead(void) { return 0; }
605#endif 587#endif
606 588
607int ftrace_event_is_function(struct ftrace_event_call *call);
608
609/* 589/*
610 * struct trace_parser - servers for reading the user input separated by spaces 590 * struct trace_parser - servers for reading the user input separated by spaces
611 * @cont: set if the input is not complete - no final space char was found 591 * @cont: set if the input is not complete - no final space char was found
@@ -672,8 +652,6 @@ enum trace_iterator_flags {
672 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
673 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
674 TRACE_ITER_STOP_ON_FREE = 0x400000, 654 TRACE_ITER_STOP_ON_FREE = 0x400000,
675 TRACE_ITER_IRQ_INFO = 0x800000,
676 TRACE_ITER_MARKERS = 0x1000000,
677}; 655};
678 656
679/* 657/*
@@ -783,8 +761,16 @@ struct filter_pred {
783 filter_pred_fn_t fn; 761 filter_pred_fn_t fn;
784 u64 val; 762 u64 val;
785 struct regex regex; 763 struct regex regex;
786 unsigned short *ops; 764 /*
787 struct ftrace_event_field *field; 765 * Leaf nodes use field_name, ops is used by AND and OR
766 * nodes. The field_name is always freed when freeing a pred.
767 * We can overload field_name for ops and have it freed
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
788 int offset; 774 int offset;
789 int not; 775 int not;
790 int op; 776 int op;
@@ -833,24 +819,13 @@ extern struct list_head ftrace_events;
833extern const char *__start___trace_bprintk_fmt[]; 819extern const char *__start___trace_bprintk_fmt[];
834extern const char *__stop___trace_bprintk_fmt[]; 820extern const char *__stop___trace_bprintk_fmt[];
835 821
836void trace_printk_init_buffers(void);
837void trace_printk_start_comm(void);
838
839#undef FTRACE_ENTRY 822#undef FTRACE_ENTRY
840#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 823#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
841 extern struct ftrace_event_call \ 824 extern struct ftrace_event_call \
842 __attribute__((__aligned__(4))) event_##call; 825 __attribute__((__aligned__(4))) event_##call;
843#undef FTRACE_ENTRY_DUP 826#undef FTRACE_ENTRY_DUP
844#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ 827#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
845 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
846 filter)
847#include "trace_entries.h" 829#include "trace_entries.h"
848 830
849#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
850int perf_ftrace_event_register(struct ftrace_event_call *call,
851 enum trace_reg type, void *data);
852#else
853#define perf_ftrace_event_register NULL
854#endif
855
856#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed2..8d3538b4ea5 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 __buffer_unlock_commit(buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202core_initcall(init_branch_tracer); 202device_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cb..6302747a139 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,15 +113,3 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca..93365907f21 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
55/* 55/*
56 * Function trace entry - function address and parent function address: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY_REG(function, ftrace_entry, 58FTRACE_ENTRY(function, ftrace_entry,
59 59
60 TRACE_FN, 60 TRACE_FN,
61 61
@@ -64,11 +64,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
64 __field( unsigned long, parent_ip ) 64 __field( unsigned long, parent_ip )
65 ), 65 ),
66 66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), 67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68
69 FILTER_TRACE_FN,
70
71 perf_ftrace_event_register
72); 68);
73 69
74/* Function call entry */ 70/* Function call entry */
@@ -82,9 +78,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
82 __field_desc( int, graph_ent, depth ) 78 __field_desc( int, graph_ent, depth )
83 ), 79 ),
84 80
85 F_printk("--> %lx (%d)", __entry->func, __entry->depth), 81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
86
87 FILTER_OTHER
88); 82);
89 83
90/* Function return entry */ 84/* Function return entry */
@@ -104,9 +98,7 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
104 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", 98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
105 __entry->func, __entry->depth, 99 __entry->func, __entry->depth,
106 __entry->calltime, __entry->rettime, 100 __entry->calltime, __entry->rettime,
107 __entry->depth), 101 __entry->depth)
108
109 FILTER_OTHER
110); 102);
111 103
112/* 104/*
@@ -135,9 +127,8 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
135 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", 127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
136 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
137 __entry->next_pid, __entry->next_prio, __entry->next_state, 129 __entry->next_pid, __entry->next_prio, __entry->next_state,
138 __entry->next_cpu), 130 __entry->next_cpu
139 131 )
140 FILTER_OTHER
141); 132);
142 133
143/* 134/*
@@ -155,9 +146,8 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
155 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", 146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
156 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
157 __entry->next_pid, __entry->next_prio, __entry->next_state, 148 __entry->next_pid, __entry->next_prio, __entry->next_state,
158 __entry->next_cpu), 149 __entry->next_cpu
159 150 )
160 FILTER_OTHER
161); 151);
162 152
163/* 153/*
@@ -166,12 +156,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
166 156
167#define FTRACE_STACK_ENTRIES 8 157#define FTRACE_STACK_ENTRIES 8
168 158
169#ifndef CONFIG_64BIT
170# define IP_FMT "%08lx"
171#else
172# define IP_FMT "%016lx"
173#endif
174
175FTRACE_ENTRY(kernel_stack, stack_entry, 159FTRACE_ENTRY(kernel_stack, stack_entry,
176 160
177 TRACE_STACK, 161 TRACE_STACK,
@@ -181,14 +165,11 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
181 __dynamic_array(unsigned long, caller ) 165 __dynamic_array(unsigned long, caller )
182 ), 166 ),
183 167
184 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 169 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
187 __entry->caller[0], __entry->caller[1], __entry->caller[2], 170 __entry->caller[0], __entry->caller[1], __entry->caller[2],
188 __entry->caller[3], __entry->caller[4], __entry->caller[5], 171 __entry->caller[3], __entry->caller[4], __entry->caller[5],
189 __entry->caller[6], __entry->caller[7]), 172 __entry->caller[6], __entry->caller[7])
190
191 FILTER_OTHER
192); 173);
193 174
194FTRACE_ENTRY(user_stack, userstack_entry, 175FTRACE_ENTRY(user_stack, userstack_entry,
@@ -200,14 +181,11 @@ FTRACE_ENTRY(user_stack, userstack_entry,
200 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
201 ), 182 ),
202 183
203 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
204 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" 185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
205 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
206 __entry->caller[0], __entry->caller[1], __entry->caller[2], 186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
207 __entry->caller[3], __entry->caller[4], __entry->caller[5], 187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
208 __entry->caller[6], __entry->caller[7]), 188 __entry->caller[6], __entry->caller[7])
209
210 FILTER_OTHER
211); 189);
212 190
213/* 191/*
@@ -224,9 +202,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
224 ), 202 ),
225 203
226 F_printk("%08lx fmt:%p", 204 F_printk("%08lx fmt:%p",
227 __entry->ip, __entry->fmt), 205 __entry->ip, __entry->fmt)
228
229 FILTER_OTHER
230); 206);
231 207
232FTRACE_ENTRY(print, print_entry, 208FTRACE_ENTRY(print, print_entry,
@@ -239,9 +215,7 @@ FTRACE_ENTRY(print, print_entry,
239 ), 215 ),
240 216
241 F_printk("%08lx %s", 217 F_printk("%08lx %s",
242 __entry->ip, __entry->buf), 218 __entry->ip, __entry->buf)
243
244 FILTER_OTHER
245); 219);
246 220
247FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, 221FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -260,9 +234,7 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
260 234
261 F_printk("%lx %lx %lx %d %x %x", 235 F_printk("%lx %lx %lx %d %x %x",
262 (unsigned long)__entry->phys, __entry->value, __entry->pc, 236 (unsigned long)__entry->phys, __entry->value, __entry->pc,
263 __entry->map_id, __entry->opcode, __entry->width), 237 __entry->map_id, __entry->opcode, __entry->width)
264
265 FILTER_OTHER
266); 238);
267 239
268FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, 240FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -280,9 +252,7 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
280 252
281 F_printk("%lx %lx %lx %d %x", 253 F_printk("%lx %lx %lx %d %x",
282 (unsigned long)__entry->phys, __entry->virt, __entry->len, 254 (unsigned long)__entry->phys, __entry->virt, __entry->len,
283 __entry->map_id, __entry->opcode), 255 __entry->map_id, __entry->opcode)
284
285 FILTER_OTHER
286); 256);
287 257
288 258
@@ -302,8 +272,6 @@ FTRACE_ENTRY(branch, trace_branch,
302 272
303 F_printk("%u:%s:%s (%u)", 273 F_printk("%u:%s:%s (%u)",
304 __entry->line, 274 __entry->line,
305 __entry->func, __entry->file, __entry->correct), 275 __entry->func, __entry->file, __entry->correct)
306
307 FILTER_OTHER
308); 276);
309 277
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045fab..19a359d5e6d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,11 +24,6 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
30 return -EPERM;
31
32 /* No tracing, just counting, so no obvious leak */ 27 /* No tracing, just counting, so no obvious leak */
33 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
34 return 0; 29 return 0;
@@ -49,17 +44,23 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
49 return 0; 44 return 0;
50} 45}
51 46
52static int perf_trace_event_reg(struct ftrace_event_call *tp_event, 47static int perf_trace_event_init(struct ftrace_event_call *tp_event,
53 struct perf_event *p_event) 48 struct perf_event *p_event)
54{ 49{
55 struct hlist_head __percpu *list; 50 struct hlist_head __percpu *list;
56 int ret = -ENOMEM; 51 int ret;
57 int cpu; 52 int cpu;
58 53
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
59 p_event->tp_event = tp_event; 58 p_event->tp_event = tp_event;
60 if (tp_event->perf_refcount++ > 0) 59 if (tp_event->perf_refcount++ > 0)
61 return 0; 60 return 0;
62 61
62 ret = -ENOMEM;
63
63 list = alloc_percpu(struct hlist_head); 64 list = alloc_percpu(struct hlist_head);
64 if (!list) 65 if (!list)
65 goto fail; 66 goto fail;
@@ -82,7 +83,7 @@ static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
82 } 83 }
83 } 84 }
84 85
85 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); 86 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
86 if (ret) 87 if (ret)
87 goto fail; 88 goto fail;
88 89
@@ -107,69 +108,6 @@ fail:
107 return ret; 108 return ret;
108} 109}
109 110
110static void perf_trace_event_unreg(struct perf_event *p_event)
111{
112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 int i;
114
115 if (--tp_event->perf_refcount > 0)
116 goto out;
117
118 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
119
120 /*
121 * Ensure our callback won't be called anymore. The buffers
122 * will be freed after that.
123 */
124 tracepoint_synchronize_unregister();
125
126 free_percpu(tp_event->perf_events);
127 tp_event->perf_events = NULL;
128
129 if (!--total_ref_count) {
130 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
131 free_percpu(perf_trace_buf[i]);
132 perf_trace_buf[i] = NULL;
133 }
134 }
135out:
136 module_put(tp_event->mod);
137}
138
139static int perf_trace_event_open(struct perf_event *p_event)
140{
141 struct ftrace_event_call *tp_event = p_event->tp_event;
142 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
143}
144
145static void perf_trace_event_close(struct perf_event *p_event)
146{
147 struct ftrace_event_call *tp_event = p_event->tp_event;
148 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
149}
150
151static int perf_trace_event_init(struct ftrace_event_call *tp_event,
152 struct perf_event *p_event)
153{
154 int ret;
155
156 ret = perf_trace_event_perm(tp_event, p_event);
157 if (ret)
158 return ret;
159
160 ret = perf_trace_event_reg(tp_event, p_event);
161 if (ret)
162 return ret;
163
164 ret = perf_trace_event_open(p_event);
165 if (ret) {
166 perf_trace_event_unreg(p_event);
167 return ret;
168 }
169
170 return 0;
171}
172
173int perf_trace_init(struct perf_event *p_event) 111int perf_trace_init(struct perf_event *p_event)
174{ 112{
175 struct ftrace_event_call *tp_event; 113 struct ftrace_event_call *tp_event;
@@ -192,14 +130,6 @@ int perf_trace_init(struct perf_event *p_event)
192 return ret; 130 return ret;
193} 131}
194 132
195void perf_trace_destroy(struct perf_event *p_event)
196{
197 mutex_lock(&event_mutex);
198 perf_trace_event_close(p_event);
199 perf_trace_event_unreg(p_event);
200 mutex_unlock(&event_mutex);
201}
202
203int perf_trace_add(struct perf_event *p_event, int flags) 133int perf_trace_add(struct perf_event *p_event, int flags)
204{ 134{
205 struct ftrace_event_call *tp_event = p_event->tp_event; 135 struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -216,14 +146,43 @@ int perf_trace_add(struct perf_event *p_event, int flags)
216 list = this_cpu_ptr(pcpu_list); 146 list = this_cpu_ptr(pcpu_list);
217 hlist_add_head_rcu(&p_event->hlist_entry, list); 147 hlist_add_head_rcu(&p_event->hlist_entry, list);
218 148
219 return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); 149 return 0;
220} 150}
221 151
222void perf_trace_del(struct perf_event *p_event, int flags) 152void perf_trace_del(struct perf_event *p_event, int flags)
223{ 153{
224 struct ftrace_event_call *tp_event = p_event->tp_event;
225 hlist_del_rcu(&p_event->hlist_entry); 154 hlist_del_rcu(&p_event->hlist_entry);
226 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); 155}
156
157void perf_trace_destroy(struct perf_event *p_event)
158{
159 struct ftrace_event_call *tp_event = p_event->tp_event;
160 int i;
161
162 mutex_lock(&event_mutex);
163 if (--tp_event->perf_refcount > 0)
164 goto out;
165
166 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
167
168 /*
169 * Ensure our callback won't be called anymore. The buffers
170 * will be freed after that.
171 */
172 tracepoint_synchronize_unregister();
173
174 free_percpu(tp_event->perf_events);
175 tp_event->perf_events = NULL;
176
177 if (!--total_ref_count) {
178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
179 free_percpu(perf_trace_buf[i]);
180 perf_trace_buf[i] = NULL;
181 }
182 }
183out:
184 module_put(tp_event->mod);
185 mutex_unlock(&event_mutex);
227} 186}
228 187
229__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 188__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -255,87 +214,3 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
255 return raw_data; 214 return raw_data;
256} 215}
257EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 216EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258
259#ifdef CONFIG_FUNCTION_TRACER
260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
262 struct ftrace_ops *ops, struct pt_regs *pt_regs)
263{
264 struct ftrace_entry *entry;
265 struct hlist_head *head;
266 struct pt_regs regs;
267 int rctx;
268
269#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
270 sizeof(u64)) - sizeof(u32))
271
272 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
273
274 perf_fetch_caller_regs(&regs);
275
276 entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
277 if (!entry)
278 return;
279
280 entry->ip = ip;
281 entry->parent_ip = parent_ip;
282
283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head, NULL);
286
287#undef ENTRY_SIZE
288}
289
290static int perf_ftrace_function_register(struct perf_event *event)
291{
292 struct ftrace_ops *ops = &event->ftrace_ops;
293
294 ops->flags |= FTRACE_OPS_FL_CONTROL;
295 ops->func = perf_ftrace_function_call;
296 return register_ftrace_function(ops);
297}
298
299static int perf_ftrace_function_unregister(struct perf_event *event)
300{
301 struct ftrace_ops *ops = &event->ftrace_ops;
302 int ret = unregister_ftrace_function(ops);
303 ftrace_free_filter(ops);
304 return ret;
305}
306
307static void perf_ftrace_function_enable(struct perf_event *event)
308{
309 ftrace_function_local_enable(&event->ftrace_ops);
310}
311
312static void perf_ftrace_function_disable(struct perf_event *event)
313{
314 ftrace_function_local_disable(&event->ftrace_ops);
315}
316
317int perf_ftrace_event_register(struct ftrace_event_call *call,
318 enum trace_reg type, void *data)
319{
320 switch (type) {
321 case TRACE_REG_REGISTER:
322 case TRACE_REG_UNREGISTER:
323 break;
324 case TRACE_REG_PERF_REGISTER:
325 case TRACE_REG_PERF_UNREGISTER:
326 return 0;
327 case TRACE_REG_PERF_OPEN:
328 return perf_ftrace_function_register(data);
329 case TRACE_REG_PERF_CLOSE:
330 return perf_ftrace_function_unregister(data);
331 case TRACE_REG_PERF_ADD:
332 perf_ftrace_function_enable(data);
333 return 0;
334 case TRACE_REG_PERF_DEL:
335 perf_ftrace_function_disable(data);
336 return 0;
337 }
338
339 return -EINVAL;
340}
341#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b94..c212a7f934e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -147,8 +147,7 @@ int trace_event_raw_init(struct ftrace_event_call *call)
147} 147}
148EXPORT_SYMBOL_GPL(trace_event_raw_init); 148EXPORT_SYMBOL_GPL(trace_event_raw_init);
149 149
150int ftrace_event_reg(struct ftrace_event_call *call, 150int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
151 enum trace_reg type, void *data)
152{ 151{
153 switch (type) { 152 switch (type) {
154 case TRACE_REG_REGISTER: 153 case TRACE_REG_REGISTER:
@@ -171,11 +170,6 @@ int ftrace_event_reg(struct ftrace_event_call *call,
171 call->class->perf_probe, 170 call->class->perf_probe,
172 call); 171 call);
173 return 0; 172 return 0;
174 case TRACE_REG_PERF_OPEN:
175 case TRACE_REG_PERF_CLOSE:
176 case TRACE_REG_PERF_ADD:
177 case TRACE_REG_PERF_DEL:
178 return 0;
179#endif 173#endif
180 } 174 }
181 return 0; 175 return 0;
@@ -215,7 +209,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
215 tracing_stop_cmdline_record(); 209 tracing_stop_cmdline_record();
216 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 210 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
217 } 211 }
218 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 212 call->class->reg(call, TRACE_REG_UNREGISTER);
219 } 213 }
220 break; 214 break;
221 case 1: 215 case 1:
@@ -224,7 +218,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
224 tracing_start_cmdline_record(); 218 tracing_start_cmdline_record();
225 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 219 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
226 } 220 }
227 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 221 ret = call->class->reg(call, TRACE_REG_REGISTER);
228 if (ret) { 222 if (ret) {
229 tracing_stop_cmdline_record(); 223 tracing_stop_cmdline_record();
230 pr_info("event trace: Could not enable event " 224 pr_info("event trace: Could not enable event "
@@ -294,9 +288,6 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
294 if (!call->name || !call->class || !call->class->reg) 288 if (!call->name || !call->class || !call->class->reg)
295 continue; 289 continue;
296 290
297 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
298 continue;
299
300 if (match && 291 if (match &&
301 strcmp(match, call->name) != 0 && 292 strcmp(match, call->name) != 0 &&
302 strcmp(match, call->class->system) != 0) 293 strcmp(match, call->class->system) != 0)
@@ -491,6 +482,19 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 482 mutex_unlock(&event_mutex);
492} 483}
493 484
485static int
486ftrace_event_seq_open(struct inode *inode, struct file *file)
487{
488 const struct seq_operations *seq_ops;
489
490 if ((file->f_mode & FMODE_WRITE) &&
491 (file->f_flags & O_TRUNC))
492 ftrace_clear_events();
493
494 seq_ops = inode->i_private;
495 return seq_open(file, seq_ops);
496}
497
494static ssize_t 498static ssize_t
495event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 499event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
496 loff_t *ppos) 500 loff_t *ppos)
@@ -967,9 +971,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
967 return r; 971 return r;
968} 972}
969 973
970static int ftrace_event_avail_open(struct inode *inode, struct file *file);
971static int ftrace_event_set_open(struct inode *inode, struct file *file);
972
973static const struct seq_operations show_event_seq_ops = { 974static const struct seq_operations show_event_seq_ops = {
974 .start = t_start, 975 .start = t_start,
975 .next = t_next, 976 .next = t_next,
@@ -985,14 +986,14 @@ static const struct seq_operations show_set_event_seq_ops = {
985}; 986};
986 987
987static const struct file_operations ftrace_avail_fops = { 988static const struct file_operations ftrace_avail_fops = {
988 .open = ftrace_event_avail_open, 989 .open = ftrace_event_seq_open,
989 .read = seq_read, 990 .read = seq_read,
990 .llseek = seq_lseek, 991 .llseek = seq_lseek,
991 .release = seq_release, 992 .release = seq_release,
992}; 993};
993 994
994static const struct file_operations ftrace_set_event_fops = { 995static const struct file_operations ftrace_set_event_fops = {
995 .open = ftrace_event_set_open, 996 .open = ftrace_event_seq_open,
996 .read = seq_read, 997 .read = seq_read,
997 .write = ftrace_event_write, 998 .write = ftrace_event_write,
998 .llseek = seq_lseek, 999 .llseek = seq_lseek,
@@ -1068,26 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1068 return d_events; 1069 return d_events;
1069} 1070}
1070 1071
1071static int
1072ftrace_event_avail_open(struct inode *inode, struct file *file)
1073{
1074 const struct seq_operations *seq_ops = &show_event_seq_ops;
1075
1076 return seq_open(file, seq_ops);
1077}
1078
1079static int
1080ftrace_event_set_open(struct inode *inode, struct file *file)
1081{
1082 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1083
1084 if ((file->f_mode & FMODE_WRITE) &&
1085 (file->f_flags & O_TRUNC))
1086 ftrace_clear_events();
1087
1088 return seq_open(file, seq_ops);
1089}
1090
1091static struct dentry * 1072static struct dentry *
1092event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1093{ 1074{
@@ -1177,7 +1158,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1177 return -1; 1158 return -1;
1178 } 1159 }
1179 1160
1180 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1161 if (call->class->reg)
1181 trace_create_file("enable", 0644, call->dir, call, 1162 trace_create_file("enable", 0644, call->dir, call,
1182 enable); 1163 enable);
1183 1164
@@ -1209,31 +1190,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1209 return 0; 1190 return 0;
1210} 1191}
1211 1192
1212static void event_remove(struct ftrace_event_call *call)
1213{
1214 ftrace_event_enable_disable(call, 0);
1215 if (call->event.funcs)
1216 __unregister_ftrace_event(&call->event);
1217 list_del(&call->list);
1218}
1219
1220static int event_init(struct ftrace_event_call *call)
1221{
1222 int ret = 0;
1223
1224 if (WARN_ON(!call->name))
1225 return -EINVAL;
1226
1227 if (call->class->raw_init) {
1228 ret = call->class->raw_init(call);
1229 if (ret < 0 && ret != -ENOSYS)
1230 pr_warn("Could not initialize trace events/%s\n",
1231 call->name);
1232 }
1233
1234 return ret;
1235}
1236
1237static int 1193static int
1238__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1194__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1239 const struct file_operations *id, 1195 const struct file_operations *id,
@@ -1244,9 +1200,19 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
1244 struct dentry *d_events; 1200 struct dentry *d_events;
1245 int ret; 1201 int ret;
1246 1202
1247 ret = event_init(call); 1203 /* The linker may leave blanks */
1248 if (ret < 0) 1204 if (!call->name)
1249 return ret; 1205 return -EINVAL;
1206
1207 if (call->class->raw_init) {
1208 ret = call->class->raw_init(call);
1209 if (ret < 0) {
1210 if (ret != -ENOSYS)
1211 pr_warning("Could not initialize trace events/%s\n",
1212 call->name);
1213 return ret;
1214 }
1215 }
1250 1216
1251 d_events = event_trace_events_dir(); 1217 d_events = event_trace_events_dir();
1252 if (!d_events) 1218 if (!d_events)
@@ -1297,10 +1263,13 @@ static void remove_subsystem_dir(const char *name)
1297 */ 1263 */
1298static void __trace_remove_event_call(struct ftrace_event_call *call) 1264static void __trace_remove_event_call(struct ftrace_event_call *call)
1299{ 1265{
1300 event_remove(call); 1266 ftrace_event_enable_disable(call, 0);
1267 if (call->event.funcs)
1268 __unregister_ftrace_event(&call->event);
1269 debugfs_remove_recursive(call->dir);
1270 list_del(&call->list);
1301 trace_destroy_fields(call); 1271 trace_destroy_fields(call);
1302 destroy_preds(call); 1272 destroy_preds(call);
1303 debugfs_remove_recursive(call->dir);
1304 remove_subsystem_dir(call->class->system); 1273 remove_subsystem_dir(call->class->system);
1305} 1274}
1306 1275
@@ -1472,59 +1441,30 @@ static __init int setup_trace_event(char *str)
1472} 1441}
1473__setup("trace_event=", setup_trace_event); 1442__setup("trace_event=", setup_trace_event);
1474 1443
1475static __init int event_trace_enable(void)
1476{
1477 struct ftrace_event_call **iter, *call;
1478 char *buf = bootup_event_buf;
1479 char *token;
1480 int ret;
1481
1482 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
1483
1484 call = *iter;
1485 ret = event_init(call);
1486 if (!ret)
1487 list_add(&call->list, &ftrace_events);
1488 }
1489
1490 while (true) {
1491 token = strsep(&buf, ",");
1492
1493 if (!token)
1494 break;
1495 if (!*token)
1496 continue;
1497
1498 ret = ftrace_set_clr_event(token, 1);
1499 if (ret)
1500 pr_warn("Failed to enable trace event: %s\n", token);
1501 }
1502
1503 trace_printk_start_comm();
1504
1505 return 0;
1506}
1507
1508static __init int event_trace_init(void) 1444static __init int event_trace_init(void)
1509{ 1445{
1510 struct ftrace_event_call *call; 1446 struct ftrace_event_call **call;
1511 struct dentry *d_tracer; 1447 struct dentry *d_tracer;
1512 struct dentry *entry; 1448 struct dentry *entry;
1513 struct dentry *d_events; 1449 struct dentry *d_events;
1514 int ret; 1450 int ret;
1451 char *buf = bootup_event_buf;
1452 char *token;
1515 1453
1516 d_tracer = tracing_init_dentry(); 1454 d_tracer = tracing_init_dentry();
1517 if (!d_tracer) 1455 if (!d_tracer)
1518 return 0; 1456 return 0;
1519 1457
1520 entry = debugfs_create_file("available_events", 0444, d_tracer, 1458 entry = debugfs_create_file("available_events", 0444, d_tracer,
1521 NULL, &ftrace_avail_fops); 1459 (void *)&show_event_seq_ops,
1460 &ftrace_avail_fops);
1522 if (!entry) 1461 if (!entry)
1523 pr_warning("Could not create debugfs " 1462 pr_warning("Could not create debugfs "
1524 "'available_events' entry\n"); 1463 "'available_events' entry\n");
1525 1464
1526 entry = debugfs_create_file("set_event", 0644, d_tracer, 1465 entry = debugfs_create_file("set_event", 0644, d_tracer,
1527 NULL, &ftrace_set_event_fops); 1466 (void *)&show_set_event_seq_ops,
1467 &ftrace_set_event_fops);
1528 if (!entry) 1468 if (!entry)
1529 pr_warning("Could not create debugfs " 1469 pr_warning("Could not create debugfs "
1530 "'set_event' entry\n"); 1470 "'set_event' entry\n");
@@ -1548,19 +1488,24 @@ static __init int event_trace_init(void)
1548 if (trace_define_common_fields()) 1488 if (trace_define_common_fields())
1549 pr_warning("tracing: Failed to allocate common fields"); 1489 pr_warning("tracing: Failed to allocate common fields");
1550 1490
1551 /* 1491 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1552 * Early initialization already enabled ftrace event. 1492 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1553 * Now it's only necessary to create the event directory.
1554 */
1555 list_for_each_entry(call, &ftrace_events, list) {
1556
1557 ret = event_create_dir(call, d_events,
1558 &ftrace_event_id_fops,
1559 &ftrace_enable_fops, 1493 &ftrace_enable_fops,
1560 &ftrace_event_filter_fops, 1494 &ftrace_event_filter_fops,
1561 &ftrace_event_format_fops); 1495 &ftrace_event_format_fops);
1562 if (ret < 0) 1496 }
1563 event_remove(call); 1497
1498 while (true) {
1499 token = strsep(&buf, ",");
1500
1501 if (!token)
1502 break;
1503 if (!*token)
1504 continue;
1505
1506 ret = ftrace_set_clr_event(token, 1);
1507 if (ret)
1508 pr_warning("Failed to enable trace event: %s\n", token);
1564 } 1509 }
1565 1510
1566 ret = register_module_notifier(&trace_module_nb); 1511 ret = register_module_notifier(&trace_module_nb);
@@ -1569,7 +1514,6 @@ static __init int event_trace_init(void)
1569 1514
1570 return 0; 1515 return 0;
1571} 1516}
1572core_initcall(event_trace_enable);
1573fs_initcall(event_trace_init); 1517fs_initcall(event_trace_init);
1574 1518
1575#ifdef CONFIG_FTRACE_STARTUP_TEST 1519#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1693,11 +1637,9 @@ static __init void event_trace_self_tests(void)
1693 event_test_stuff(); 1637 event_test_stuff();
1694 1638
1695 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 1639 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1696 if (WARN_ON_ONCE(ret)) { 1640 if (WARN_ON_ONCE(ret))
1697 pr_warning("error disabling system %s\n", 1641 pr_warning("error disabling system %s\n",
1698 system->name); 1642 system->name);
1699 continue;
1700 }
1701 1643
1702 pr_cont("OK\n"); 1644 pr_cont("OK\n");
1703 } 1645 }
@@ -1730,8 +1672,7 @@ static __init void event_trace_self_tests(void)
1730static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); 1672static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1731 1673
1732static void 1674static void
1733function_test_events_call(unsigned long ip, unsigned long parent_ip, 1675function_test_events_call(unsigned long ip, unsigned long parent_ip)
1734 struct ftrace_ops *op, struct pt_regs *pt_regs)
1735{ 1676{
1736 struct ring_buffer_event *event; 1677 struct ring_buffer_event *event;
1737 struct ring_buffer *buffer; 1678 struct ring_buffer *buffer;
@@ -1760,7 +1701,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1760 entry->ip = ip; 1701 entry->ip = ip;
1761 entry->parent_ip = parent_ip; 1702 entry->parent_ip = parent_ip;
1762 1703
1763 trace_buffer_unlock_commit(buffer, event, flags, pc); 1704 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1764 1705
1765 out: 1706 out:
1766 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1707 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
@@ -1770,7 +1711,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1770static struct ftrace_ops trace_ops __initdata = 1711static struct ftrace_ops trace_ops __initdata =
1771{ 1712{
1772 .func = function_test_events_call, 1713 .func = function_test_events_call,
1773 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
1774}; 1714};
1775 1715
1776static __init void event_trace_self_test_with_function(void) 1716static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4..bd3c6369f80 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,12 +27,6 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
36enum filter_op_ids 30enum filter_op_ids
37{ 31{
38 OP_OR, 32 OP_OR,
@@ -81,7 +75,6 @@ enum {
81 FILT_ERR_TOO_MANY_PREDS, 75 FILT_ERR_TOO_MANY_PREDS,
82 FILT_ERR_MISSING_FIELD, 76 FILT_ERR_MISSING_FIELD,
83 FILT_ERR_INVALID_FILTER, 77 FILT_ERR_INVALID_FILTER,
84 FILT_ERR_IP_FIELD_ONLY,
85}; 78};
86 79
87static char *err_text[] = { 80static char *err_text[] = {
@@ -97,7 +90,6 @@ static char *err_text[] = {
97 "Too many terms in predicate expression", 90 "Too many terms in predicate expression",
98 "Missing field name and/or value", 91 "Missing field name and/or value",
99 "Meaningless filter expression", 92 "Meaningless filter expression",
100 "Only 'ip' field is supported for function trace",
101}; 93};
102 94
103struct opstack_op { 95struct opstack_op {
@@ -389,63 +381,6 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
389 return pred; 381 return pred;
390} 382}
391 383
392enum walk_return {
393 WALK_PRED_ABORT,
394 WALK_PRED_PARENT,
395 WALK_PRED_DEFAULT,
396};
397
398typedef int (*filter_pred_walkcb_t) (enum move_type move,
399 struct filter_pred *pred,
400 int *err, void *data);
401
402static int walk_pred_tree(struct filter_pred *preds,
403 struct filter_pred *root,
404 filter_pred_walkcb_t cb, void *data)
405{
406 struct filter_pred *pred = root;
407 enum move_type move = MOVE_DOWN;
408 int done = 0;
409
410 if (!preds)
411 return -EINVAL;
412
413 do {
414 int err = 0, ret;
415
416 ret = cb(move, pred, &err, data);
417 if (ret == WALK_PRED_ABORT)
418 return err;
419 if (ret == WALK_PRED_PARENT)
420 goto get_parent;
421
422 switch (move) {
423 case MOVE_DOWN:
424 if (pred->left != FILTER_PRED_INVALID) {
425 pred = &preds[pred->left];
426 continue;
427 }
428 goto get_parent;
429 case MOVE_UP_FROM_LEFT:
430 pred = &preds[pred->right];
431 move = MOVE_DOWN;
432 continue;
433 case MOVE_UP_FROM_RIGHT:
434 get_parent:
435 if (pred == root)
436 break;
437 pred = get_pred_parent(pred, preds,
438 pred->parent,
439 &move);
440 continue;
441 }
442 done = 1;
443 } while (!done);
444
445 /* We are fine. */
446 return 0;
447}
448
449/* 384/*
450 * A series of AND or ORs where found together. Instead of 385 * A series of AND or ORs where found together. Instead of
451 * climbing up and down the tree branches, an array of the 386 * climbing up and down the tree branches, an array of the
@@ -475,91 +410,99 @@ static int process_ops(struct filter_pred *preds,
475 410
476 for (i = 0; i < op->val; i++) { 411 for (i = 0; i < op->val; i++) {
477 pred = &preds[op->ops[i]]; 412 pred = &preds[op->ops[i]];
478 if (!WARN_ON_ONCE(!pred->fn)) 413 match = pred->fn(pred, rec);
479 match = pred->fn(pred, rec);
480 if (!!match == type) 414 if (!!match == type)
481 return match; 415 return match;
482 } 416 }
483 return match; 417 return match;
484} 418}
485 419
486struct filter_match_preds_data {
487 struct filter_pred *preds;
488 int match;
489 void *rec;
490};
491
492static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
493 int *err, void *data)
494{
495 struct filter_match_preds_data *d = data;
496
497 *err = 0;
498 switch (move) {
499 case MOVE_DOWN:
500 /* only AND and OR have children */
501 if (pred->left != FILTER_PRED_INVALID) {
502 /* If ops is set, then it was folded. */
503 if (!pred->ops)
504 return WALK_PRED_DEFAULT;
505 /* We can treat folded ops as a leaf node */
506 d->match = process_ops(d->preds, pred, d->rec);
507 } else {
508 if (!WARN_ON_ONCE(!pred->fn))
509 d->match = pred->fn(pred, d->rec);
510 }
511
512 return WALK_PRED_PARENT;
513 case MOVE_UP_FROM_LEFT:
514 /*
515 * Check for short circuits.
516 *
517 * Optimization: !!match == (pred->op == OP_OR)
518 * is the same as:
519 * if ((match && pred->op == OP_OR) ||
520 * (!match && pred->op == OP_AND))
521 */
522 if (!!d->match == (pred->op == OP_OR))
523 return WALK_PRED_PARENT;
524 break;
525 case MOVE_UP_FROM_RIGHT:
526 break;
527 }
528
529 return WALK_PRED_DEFAULT;
530}
531
532/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
533int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
534{ 422{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
535 struct filter_pred *preds; 425 struct filter_pred *preds;
426 struct filter_pred *pred;
536 struct filter_pred *root; 427 struct filter_pred *root;
537 struct filter_match_preds_data data = { 428 int n_preds;
538 /* match is currently meaningless */ 429 int done = 0;
539 .match = -1,
540 .rec = rec,
541 };
542 int n_preds, ret;
543 430
544 /* no filter is considered a match */ 431 /* no filter is considered a match */
545 if (!filter) 432 if (!filter)
546 return 1; 433 return 1;
547 434
548 n_preds = filter->n_preds; 435 n_preds = filter->n_preds;
436
549 if (!n_preds) 437 if (!n_preds)
550 return 1; 438 return 1;
551 439
552 /* 440 /*
553 * n_preds, root and filter->preds are protect with preemption disabled. 441 * n_preds, root and filter->preds are protect with preemption disabled.
554 */ 442 */
443 preds = rcu_dereference_sched(filter->preds);
555 root = rcu_dereference_sched(filter->root); 444 root = rcu_dereference_sched(filter->root);
556 if (!root) 445 if (!root)
557 return 1; 446 return 1;
558 447
559 data.preds = preds = rcu_dereference_sched(filter->preds); 448 pred = root;
560 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); 449
561 WARN_ON(ret); 450 /* match is currently meaningless */
562 return data.match; 451 match = -1;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
563} 506}
564EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
565 508
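Both versions of filter_match_preds() above use the same non-recursive walk: predicates carry parent/left/right indices into the preds[] array, and a small move state records whether the walk is descending or returning from a child, so no stack is needed in the tracing hot path. The following stripped-down sketch (hypothetical node layout and names, not the kernel's) shows the shape of that state machine:

/* Sketch only: iterative walk over an index-linked binary tree. */
enum move { DOWN, UP_FROM_LEFT, UP_FROM_RIGHT };

struct node {
        int left, right;        /* child indices, -1 for a leaf */
        int parent;             /* parent index */
        int is_right;           /* true if this node is its parent's right child */
};

static void walk(struct node *nodes, int root, void (*visit_leaf)(struct node *))
{
        enum move move = DOWN;
        int cur = root;

        for (;;) {
                struct node *n = &nodes[cur];

                switch (move) {
                case DOWN:
                        if (n->left >= 0) {     /* interior node: descend left first */
                                cur = n->left;
                                continue;
                        }
                        visit_leaf(n);          /* leaf: evaluate it, then climb */
                        break;
                case UP_FROM_LEFT:              /* left subtree done: descend right */
                        cur = n->right;
                        move = DOWN;
                        continue;
                case UP_FROM_RIGHT:             /* both subtrees done: climb */
                        break;
                }
                if (cur == root)                /* the whole tree has been covered */
                        return;
                move = n->is_right ? UP_FROM_RIGHT : UP_FROM_LEFT;
                cur = n->parent;
        }
}

In the kernel code the parent lookup and the left/right bookkeeping are hidden inside get_pred_parent(), and the leaf visit is pred->fn() (or process_ops() for a folded subtree).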
@@ -654,7 +597,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
654 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
655 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
656 else 599 else
657 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); 600 trace_seq_printf(s, "none\n");
658 mutex_unlock(&event_mutex); 601 mutex_unlock(&event_mutex);
659} 602}
660 603
@@ -685,9 +628,25 @@ find_event_field(struct ftrace_event_call *call, char *name)
685 return __find_event_field(head, name); 628 return __find_event_field(head, name);
686} 629}
687 630
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 648{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
691 if (!stack->preds) 650 if (!stack->preds)
692 return -ENOMEM; 651 return -ENOMEM;
693 stack->index = n_preds; 652 stack->index = n_preds;
@@ -730,13 +689,20 @@ __pop_pred_stack(struct pred_stack *stack)
730static int filter_set_pred(struct event_filter *filter, 689static int filter_set_pred(struct event_filter *filter,
731 int idx, 690 int idx,
732 struct pred_stack *stack, 691 struct pred_stack *stack,
733 struct filter_pred *src) 692 struct filter_pred *src,
693 filter_pred_fn_t fn)
734{ 694{
735 struct filter_pred *dest = &filter->preds[idx]; 695 struct filter_pred *dest = &filter->preds[idx];
736 struct filter_pred *left; 696 struct filter_pred *left;
737 struct filter_pred *right; 697 struct filter_pred *right;
738 698
739 *dest = *src; 699 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
740 dest->index = idx; 706 dest->index = idx;
741 707
742 if (dest->op == OP_OR || dest->op == OP_AND) { 708 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -777,7 +743,11 @@ static int filter_set_pred(struct event_filter *filter,
777 743
778static void __free_preds(struct event_filter *filter) 744static void __free_preds(struct event_filter *filter)
779{ 745{
746 int i;
747
780 if (filter->preds) { 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
781 kfree(filter->preds); 751 kfree(filter->preds);
782 filter->preds = NULL; 752 filter->preds = NULL;
783 } 753 }
@@ -828,7 +798,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
828 if (filter->preds) 798 if (filter->preds)
829 __free_preds(filter); 799 __free_preds(filter);
830 800
831 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
832 803
833 if (!filter->preds) 804 if (!filter->preds)
834 return -ENOMEM; 805 return -ENOMEM;
@@ -869,19 +840,23 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
869 } 840 }
870} 841}
871 842
872static int filter_add_pred(struct filter_parse_state *ps, 843static int filter_add_pred_fn(struct filter_parse_state *ps,
873 struct event_filter *filter, 844 struct ftrace_event_call *call,
874 struct filter_pred *pred, 845 struct event_filter *filter,
875 struct pred_stack *stack) 846 struct filter_pred *pred,
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
876{ 849{
877 int err; 850 int idx, err;
878 851
879 if (WARN_ON(filter->n_preds == filter->a_preds)) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
880 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
881 return -ENOSPC; 854 return -ENOSPC;
882 } 855 }
883 856
884 err = filter_set_pred(filter, filter->n_preds, stack, pred); 857 idx = filter->n_preds;
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
885 if (err) 860 if (err)
886 return err; 861 return err;
887 862
@@ -901,11 +876,6 @@ int filter_assign_type(const char *type)
901 return FILTER_OTHER; 876 return FILTER_OTHER;
902} 877}
903 878
904static bool is_function_field(struct ftrace_event_field *field)
905{
906 return field->filter_type == FILTER_TRACE_FN;
907}
908
909static bool is_string_field(struct ftrace_event_field *field) 879static bool is_string_field(struct ftrace_event_field *field)
910{ 880{
911 return field->filter_type == FILTER_DYN_STRING || 881 return field->filter_type == FILTER_DYN_STRING ||
@@ -967,15 +937,31 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
967 return fn; 937 return fn;
968} 938}
969 939
970static int init_pred(struct filter_parse_state *ps, 940static int filter_add_pred(struct filter_parse_state *ps,
971 struct ftrace_event_field *field, 941 struct ftrace_event_call *call,
972 struct filter_pred *pred) 942 struct event_filter *filter,
973 943 struct filter_pred *pred,
944 struct pred_stack *stack,
945 bool dry_run)
974{ 946{
975 filter_pred_fn_t fn = filter_pred_none; 947 struct ftrace_event_field *field;
948 filter_pred_fn_t fn;
976 unsigned long long val; 949 unsigned long long val;
977 int ret; 950 int ret;
978 951
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
979 pred->offset = field->offset; 965 pred->offset = field->offset;
980 966
981 if (!is_legal_op(field, pred->op)) { 967 if (!is_legal_op(field, pred->op)) {
@@ -993,16 +979,11 @@ static int init_pred(struct filter_parse_state *ps,
993 fn = filter_pred_strloc; 979 fn = filter_pred_strloc;
994 else 980 else
995 fn = filter_pred_pchar; 981 fn = filter_pred_pchar;
996 } else if (is_function_field(field)) {
997 if (strcmp(field->name, "ip")) {
998 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
999 return -EINVAL;
1000 }
1001 } else { 982 } else {
1002 if (field->is_signed) 983 if (field->is_signed)
1003 ret = kstrtoll(pred->regex.pattern, 0, &val); 984 ret = strict_strtoll(pred->regex.pattern, 0, &val);
1004 else 985 else
1005 ret = kstrtoull(pred->regex.pattern, 0, &val); 986 ret = strict_strtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 987 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 988 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 989 return -EINVAL;
@@ -1020,7 +1001,9 @@ static int init_pred(struct filter_parse_state *ps,
1020 if (pred->op == OP_NE) 1001 if (pred->op == OP_NE)
1021 pred->not = 1; 1002 pred->not = 1;
1022 1003
1023 pred->fn = fn; 1004add_pred_fn:
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1024 return 0; 1007 return 0;
1025} 1008}
1026 1009
@@ -1319,34 +1302,39 @@ parse_operand:
1319 return 0; 1302 return 0;
1320} 1303}
1321 1304
1322static struct filter_pred *create_pred(struct filter_parse_state *ps, 1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1323 struct ftrace_event_call *call,
1324 int op, char *operand1, char *operand2)
1325{ 1306{
1326 struct ftrace_event_field *field; 1307 struct filter_pred *pred;
1327 static struct filter_pred pred;
1328
1329 memset(&pred, 0, sizeof(pred));
1330 pred.op = op;
1331 1308
1332 if (op == OP_AND || op == OP_OR) 1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1333 return &pred; 1310 if (!pred)
1311 return NULL;
1334 1312
1335 if (!operand1 || !operand2) { 1313 pred->field_name = kstrdup(operand1, GFP_KERNEL);
1336 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1314 if (!pred->field_name) {
1315 kfree(pred);
1337 return NULL; 1316 return NULL;
1338 } 1317 }
1339 1318
1340 field = find_event_field(call, operand1); 1319 strcpy(pred->regex.pattern, operand2);
1341 if (!field) { 1320 pred->regex.len = strlen(pred->regex.pattern);
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1321
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1343 return NULL; 1333 return NULL;
1344 }
1345 1334
1346 strcpy(pred.regex.pattern, operand2); 1335 pred->op = op;
1347 pred.regex.len = strlen(pred.regex.pattern); 1336
1348 pred.field = field; 1337 return pred;
1349 return init_pred(ps, field, &pred) ? NULL : &pred;
1350} 1338}
1351 1339
1352static int check_preds(struct filter_parse_state *ps) 1340static int check_preds(struct filter_parse_state *ps)
@@ -1387,23 +1375,6 @@ static int count_preds(struct filter_parse_state *ps)
1387 return n_preds; 1375 return n_preds;
1388} 1376}
1389 1377
1390struct check_pred_data {
1391 int count;
1392 int max;
1393};
1394
1395static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1396 int *err, void *data)
1397{
1398 struct check_pred_data *d = data;
1399
1400 if (WARN_ON(d->count++ > d->max)) {
1401 *err = -EINVAL;
1402 return WALK_PRED_ABORT;
1403 }
1404 return WALK_PRED_DEFAULT;
1405}
1406
1407/* 1378/*
1408 * The tree is walked at filtering of an event. If the tree is not correctly 1379 * The tree is walked at filtering of an event. If the tree is not correctly
1409 * built, it may cause an infinite loop. Check here that the tree does 1380 * built, it may cause an infinite loop. Check here that the tree does
@@ -1412,76 +1383,107 @@ static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1412static int check_pred_tree(struct event_filter *filter, 1383static int check_pred_tree(struct event_filter *filter,
1413 struct filter_pred *root) 1384 struct filter_pred *root)
1414{ 1385{
1415 struct check_pred_data data = { 1386 struct filter_pred *preds;
1416 /* 1387 struct filter_pred *pred;
1417 * The max that we can hit a node is three times. 1388 enum move_type move = MOVE_DOWN;
1418 * Once going down, once coming up from left, and 1389 int count = 0;
1419 * once coming up from right. This is more than enough 1390 int done = 0;
1420 * since leafs are only hit a single time. 1391 int max;
1421 */
1422 .max = 3 * filter->n_preds,
1423 .count = 0,
1424 };
1425
1426 return walk_pred_tree(filter->preds, root,
1427 check_pred_tree_cb, &data);
1428}
1429 1392
1430static int count_leafs_cb(enum move_type move, struct filter_pred *pred, 1393 /*
1431 int *err, void *data) 1394 * The max that we can hit a node is three times.
1432{ 1395 * Once going down, once coming up from left, and
1433 int *count = data; 1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1434 1400
1435 if ((move == MOVE_DOWN) && 1401 preds = filter->preds;
1436 (pred->left == FILTER_PRED_INVALID)) 1402 if (!preds)
1437 (*count)++; 1403 return -EINVAL;
1404 pred = root;
1438 1405
1439 return WALK_PRED_DEFAULT; 1406 do {
1440} 1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1441 1409
1442static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1410 switch (move) {
1443{ 1411 case MOVE_DOWN:
1444 int count = 0, ret; 1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1445 1435
1446 ret = walk_pred_tree(preds, root, count_leafs_cb, &count); 1436 /* We are fine. */
1447 WARN_ON(ret); 1437 return 0;
1448 return count;
1449} 1438}
1450 1439
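The bound that check_pred_tree() relies on above follows directly from that walk: an interior node is entered at most three times (once going down, once coming back from its left child, once from its right), while a leaf is entered exactly once, so a well-formed tree of n_preds nodes can be entered at most 3 * n_preds times. As a worked example, a filter with 7 predicates arranged as 3 operators over 4 leaves is entered at most 3 * 3 + 4 = 13 times, well under the 3 * 7 = 21 limit, so tripping the WARN_ON() indicates a malformed (cyclic) tree rather than merely a large filter.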
1451struct fold_pred_data { 1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1452 struct filter_pred *root;
1453 int count;
1454 int children;
1455};
1456
1457static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1458 int *err, void *data)
1459{ 1441{
1460 struct fold_pred_data *d = data; 1442 struct filter_pred *pred;
1461 struct filter_pred *root = d->root; 1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1462 1446
1463 if (move != MOVE_DOWN) 1447 pred = root;
1464 return WALK_PRED_DEFAULT;
1465 if (pred->left != FILTER_PRED_INVALID)
1466 return WALK_PRED_DEFAULT;
1467 1448
1468 if (WARN_ON(d->count == d->children)) { 1449 do {
1469 *err = -EINVAL; 1450 switch (move) {
1470 return WALK_PRED_ABORT; 1451 case MOVE_DOWN:
1471 } 1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1472 1476
1473 pred->index &= ~FILTER_PRED_FOLD; 1477 return count;
1474 root->ops[d->count++] = pred->index;
1475 return WALK_PRED_DEFAULT;
1476} 1478}
1477 1479
1478static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1479{ 1481{
1480 struct fold_pred_data data = { 1482 struct filter_pred *pred;
1481 .root = root, 1483 enum move_type move = MOVE_DOWN;
1482 .count = 0, 1484 int count = 0;
1483 };
1484 int children; 1485 int children;
1486 int done = 0;
1485 1487
1486 /* No need to keep the fold flag */ 1488 /* No need to keep the fold flag */
1487 root->index &= ~FILTER_PRED_FOLD; 1489 root->index &= ~FILTER_PRED_FOLD;
@@ -1494,31 +1496,42 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1494 children = count_leafs(preds, &preds[root->left]); 1496 children = count_leafs(preds, &preds[root->left]);
1495 children += count_leafs(preds, &preds[root->right]); 1497 children += count_leafs(preds, &preds[root->right]);
1496 1498
1497 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); 1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1498 if (!root->ops) 1500 if (!root->ops)
1499 return -ENOMEM; 1501 return -ENOMEM;
1500 1502
1501 root->val = children; 1503 root->val = children;
1502 data.children = children;
1503 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1504}
1505
1506static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1507 int *err, void *data)
1508{
1509 struct filter_pred *preds = data;
1510 1504
1511 if (move != MOVE_DOWN) 1505 pred = root;
1512 return WALK_PRED_DEFAULT; 1506 do {
1513 if (!(pred->index & FILTER_PRED_FOLD)) 1507 switch (move) {
1514 return WALK_PRED_DEFAULT; 1508 case MOVE_DOWN:
1515 1509 if (pred->left != FILTER_PRED_INVALID) {
1516 *err = fold_pred(preds, pred); 1510 pred = &preds[pred->left];
1517 if (*err) 1511 continue;
1518 return WALK_PRED_ABORT; 1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1519 1533
1520 /* everything below is folded, continue with parent */ 1534 return 0;
1521 return WALK_PRED_PARENT;
1522} 1535}
1523 1536
1524/* 1537/*
@@ -1529,8 +1542,51 @@ static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1529static int fold_pred_tree(struct event_filter *filter, 1542static int fold_pred_tree(struct event_filter *filter,
1530 struct filter_pred *root) 1543 struct filter_pred *root)
1531{ 1544{
1532 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, 1545 struct filter_pred *preds;
1533 filter->preds); 1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1534} 1590}
1535 1591
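fold_pred() and fold_pred_tree() above compress a subtree whose leaves all hang off the same logical operator into that operator's ops[] index array (with the leaf count kept in ->val), so the filtering hot path can scan the leaves linearly instead of re-walking the subtree. process_ops(), which consumes that array, is not part of this hunk; the sketch below reuses the struct filter_pred fields seen above but is a simplified illustration rather than the kernel's implementation:

/* Sketch only: evaluate a folded OR/AND node by scanning its collected leaves. */
static int eval_folded(struct filter_pred *preds, struct filter_pred *op, void *rec)
{
        int type = (op->op == OP_OR);   /* the value that lets us stop early */
        int match = 0;
        int i;

        for (i = 0; i < op->val; i++) { /* op->val holds the number of leaves */
                struct filter_pred *pred = &preds[op->ops[i]];

                match = pred->fn(pred, rec);
                if (!!match == type)    /* same short-circuit rule as the walk */
                        break;
        }
        return match;
}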
1536static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
@@ -1587,17 +1643,27 @@ static int replace_preds(struct ftrace_event_call *call,
1587 goto fail; 1643 goto fail;
1588 } 1644 }
1589 1645
1590 pred = create_pred(ps, call, elt->op, operand1, operand2); 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
1591 if (!pred) { 1647 pred = create_logical_pred(elt->op);
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1592 err = -EINVAL; 1653 err = -EINVAL;
1593 goto fail; 1654 goto fail;
1594 } 1655 }
1595 1656
1596 if (!dry_run) { 1657 pred = create_pred(elt->op, operand1, operand2);
1597 err = filter_add_pred(ps, filter, pred, &stack); 1658add_pred:
1598 if (err) 1659 if (!pred) {
1599 goto fail; 1660 err = -ENOMEM;
1661 goto fail;
1600 } 1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1601 1667
1602 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1603 } 1669 }
@@ -1663,9 +1729,7 @@ static int replace_system_preds(struct event_subsystem *system,
1663 */ 1729 */
1664 err = replace_preds(call, NULL, ps, filter_string, true); 1730 err = replace_preds(call, NULL, ps, filter_string, true);
1665 if (err) 1731 if (err)
1666 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; 1732 goto fail;
1667 else
1668 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1669 } 1733 }
1670 1734
1671 list_for_each_entry(call, &ftrace_events, list) { 1735 list_for_each_entry(call, &ftrace_events, list) {
@@ -1674,9 +1738,6 @@ static int replace_system_preds(struct event_subsystem *system,
1674 if (strcmp(call->class->system, system->name) != 0) 1738 if (strcmp(call->class->system, system->name) != 0)
1675 continue; 1739 continue;
1676 1740
1677 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
1678 continue;
1679
1680 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1681 if (!filter_item) 1742 if (!filter_item)
1682 goto fail_mem; 1743 goto fail_mem;
@@ -1746,121 +1807,11 @@ static int replace_system_preds(struct event_subsystem *system,
1746 return -ENOMEM; 1807 return -ENOMEM;
1747} 1808}
1748 1809
1749static int create_filter_start(char *filter_str, bool set_str,
1750 struct filter_parse_state **psp,
1751 struct event_filter **filterp)
1752{
1753 struct event_filter *filter;
1754 struct filter_parse_state *ps = NULL;
1755 int err = 0;
1756
1757 WARN_ON_ONCE(*psp || *filterp);
1758
1759 /* allocate everything, and if any fails, free all and fail */
1760 filter = __alloc_filter();
1761 if (filter && set_str)
1762 err = replace_filter_string(filter, filter_str);
1763
1764 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1765
1766 if (!filter || !ps || err) {
1767 kfree(ps);
1768 __free_filter(filter);
1769 return -ENOMEM;
1770 }
1771
1772 /* we're committed to creating a new filter */
1773 *filterp = filter;
1774 *psp = ps;
1775
1776 parse_init(ps, filter_ops, filter_str);
1777 err = filter_parse(ps);
1778 if (err && set_str)
1779 append_filter_err(ps, filter);
1780 return err;
1781}
1782
1783static void create_filter_finish(struct filter_parse_state *ps)
1784{
1785 if (ps) {
1786 filter_opstack_clear(ps);
1787 postfix_clear(ps);
1788 kfree(ps);
1789 }
1790}
1791
1792/**
1793 * create_filter - create a filter for a ftrace_event_call
1794 * @call: ftrace_event_call to create a filter for
1795 * @filter_str: filter string
1796 * @set_str: remember @filter_str and enable detailed error in filter
1797 * @filterp: out param for created filter (always updated on return)
1798 *
1799 * Creates a filter for @call with @filter_str. If @set_str is %true,
1800 * @filter_str is copied and recorded in the new filter.
1801 *
1802 * On success, returns 0 and *@filterp points to the new filter. On
1803 * failure, returns -errno and *@filterp may point to %NULL or to a new
1804 * filter. In the latter case, the returned filter contains error
1805 * information if @set_str is %true and the caller is responsible for
1806 * freeing it.
1807 */
1808static int create_filter(struct ftrace_event_call *call,
1809 char *filter_str, bool set_str,
1810 struct event_filter **filterp)
1811{
1812 struct event_filter *filter = NULL;
1813 struct filter_parse_state *ps = NULL;
1814 int err;
1815
1816 err = create_filter_start(filter_str, set_str, &ps, &filter);
1817 if (!err) {
1818 err = replace_preds(call, filter, ps, filter_str, false);
1819 if (err && set_str)
1820 append_filter_err(ps, filter);
1821 }
1822 create_filter_finish(ps);
1823
1824 *filterp = filter;
1825 return err;
1826}
1827
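The kernel-doc above spells out the create_filter() contract: *filterp is always written back, and on failure it may still point to a filter carrying the parse error text, which the caller must free. A minimal usage sketch (the caller name is invented; locking and the RCU hand-off shown elsewhere in this file are omitted):

/* Sketch only: the expected calling pattern for create_filter(). */
static int example_set_filter(struct ftrace_event_call *call, char *str)
{
        struct event_filter *filter = NULL;
        int err;

        err = create_filter(call, str, true, &filter);
        if (err) {
                /* on error, filter may still hold the error message */
                if (filter)
                        __free_filter(filter);
                return err;
        }

        rcu_assign_pointer(call->filter, filter);
        return 0;
}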
1828/**
1829 * create_system_filter - create a filter for an event_subsystem
1830 * @system: event_subsystem to create a filter for
1831 * @filter_str: filter string
1832 * @filterp: out param for created filter (always updated on return)
1833 *
1834 * Identical to create_filter() except that it creates a subsystem filter
1835 * and always remembers @filter_str.
1836 */
1837static int create_system_filter(struct event_subsystem *system,
1838 char *filter_str, struct event_filter **filterp)
1839{
1840 struct event_filter *filter = NULL;
1841 struct filter_parse_state *ps = NULL;
1842 int err;
1843
1844 err = create_filter_start(filter_str, true, &ps, &filter);
1845 if (!err) {
1846 err = replace_system_preds(system, ps, filter_str);
1847 if (!err) {
1848 /* System filters just show a default message */
1849 kfree(filter->filter_string);
1850 filter->filter_string = NULL;
1851 } else {
1852 append_filter_err(ps, filter);
1853 }
1854 }
1855 create_filter_finish(ps);
1856
1857 *filterp = filter;
1858 return err;
1859}
1860
1861int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1862{ 1811{
1812 struct filter_parse_state *ps;
1863 struct event_filter *filter; 1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1864 int err = 0; 1815 int err = 0;
1865 1816
1866 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
@@ -1877,30 +1828,49 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1877 goto out_unlock; 1828 goto out_unlock;
1878 } 1829 }
1879 1830
1880 err = create_filter(call, filter_string, true, &filter); 1831 err = -ENOMEM;
1832 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1833 if (!ps)
1834 goto out_unlock;
1835
1836 filter = __alloc_filter();
1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1881 1843
1844 parse_init(ps, filter_ops, filter_string);
1845 err = filter_parse(ps);
1846 if (err) {
1847 append_filter_err(ps, filter);
1848 goto out;
1849 }
1850
1851 err = replace_preds(call, filter, ps, filter_string, false);
1852 if (err) {
1853 filter_disable(call);
1854 append_filter_err(ps, filter);
1855 } else
1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1857out:
1882 /* 1858 /*
1883 * Always swap the call filter with the new filter 1859 * Always swap the call filter with the new filter
1884 * even if there was an error. If there was an error 1860 * even if there was an error. If there was an error
1885 * in the filter, we disable the filter and show the error 1861 * in the filter, we disable the filter and show the error
1886 * string 1862 * string
1887 */ 1863 */
1888 if (filter) { 1864 tmp = call->filter;
1889 struct event_filter *tmp = call->filter; 1865 rcu_assign_pointer(call->filter, filter);
1890 1866 if (tmp) {
1891 if (!err) 1867 /* Make sure the call is done with the filter */
1892 call->flags |= TRACE_EVENT_FL_FILTERED; 1868 synchronize_sched();
1893 else 1869 __free_filter(tmp);
1894 filter_disable(call);
1895
1896 rcu_assign_pointer(call->filter, filter);
1897
1898 if (tmp) {
1899 /* Make sure the call is done with the filter */
1900 synchronize_sched();
1901 __free_filter(tmp);
1902 }
1903 } 1870 }
1871 filter_opstack_clear(ps);
1872 postfix_clear(ps);
1873 kfree(ps);
1904out_unlock: 1874out_unlock:
1905 mutex_unlock(&event_mutex); 1875 mutex_unlock(&event_mutex);
1906 1876
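Both the old and the new bodies of apply_event_filter() above finish with the same publish-then-retire sequence: the replacement filter is installed with rcu_assign_pointer(), and the previous one is freed only after synchronize_sched() guarantees that no tracing path (which samples call->filter with preemption disabled) can still be using it. Stripped of the error handling, the pattern looks like this (the helper name is invented for illustration):

/* Sketch only: RCU publish/retire when swapping an event's filter. */
static void swap_filter(struct ftrace_event_call *call, struct event_filter *new_filter)
{
        struct event_filter *old = call->filter;

        rcu_assign_pointer(call->filter, new_filter);   /* publish the new filter */

        if (old) {
                synchronize_sched();    /* wait until no reader can see 'old' */
                __free_filter(old);     /* now it is safe to free */
        }
}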
@@ -1910,6 +1880,7 @@ out_unlock:
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1911 char *filter_string) 1881 char *filter_string)
1912{ 1882{
1883 struct filter_parse_state *ps;
1913 struct event_filter *filter; 1884 struct event_filter *filter;
1914 int err = 0; 1885 int err = 0;
1915 1886
@@ -1933,15 +1904,38 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1933 goto out_unlock; 1904 goto out_unlock;
1934 } 1905 }
1935 1906
1936 err = create_system_filter(system, filter_string, &filter); 1907 err = -ENOMEM;
1937 if (filter) { 1908 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1938 /* 1909 if (!ps)
1939 * No event actually uses the system filter 1910 goto out_unlock;
1940 * we can free it without synchronize_sched(). 1911
1941 */ 1912 filter = __alloc_filter();
1942 __free_filter(system->filter); 1913 if (!filter)
1943 system->filter = filter; 1914 goto out;
1915
1916 replace_filter_string(filter, filter_string);
1917 /*
1918 * No event actually uses the system filter
1919 * we can free it without synchronize_sched().
1920 */
1921 __free_filter(system->filter);
1922 system->filter = filter;
1923
1924 parse_init(ps, filter_ops, filter_string);
1925 err = filter_parse(ps);
1926 if (err) {
1927 append_filter_err(ps, system->filter);
1928 goto out;
1944 } 1929 }
1930
1931 err = replace_system_preds(system, ps, filter_string);
1932 if (err)
1933 append_filter_err(ps, system->filter);
1934
1935out:
1936 filter_opstack_clear(ps);
1937 postfix_clear(ps);
1938 kfree(ps);
1945out_unlock: 1939out_unlock:
1946 mutex_unlock(&event_mutex); 1940 mutex_unlock(&event_mutex);
1947 1941
@@ -1958,178 +1952,56 @@ void ftrace_profile_free_filter(struct perf_event *event)
1958 __free_filter(filter); 1952 __free_filter(filter);
1959} 1953}
1960 1954
1961struct function_filter_data {
1962 struct ftrace_ops *ops;
1963 int first_filter;
1964 int first_notrace;
1965};
1966
1967#ifdef CONFIG_FUNCTION_TRACER
1968static char **
1969ftrace_function_filter_re(char *buf, int len, int *count)
1970{
1971 char *str, *sep, **re;
1972
1973 str = kstrndup(buf, len, GFP_KERNEL);
1974 if (!str)
1975 return NULL;
1976
1977 /*
1978 * The argv_split function takes white space
1979 * as a separator, so convert ',' into spaces.
1980 */
1981 while ((sep = strchr(str, ',')))
1982 *sep = ' ';
1983
1984 re = argv_split(GFP_KERNEL, str, count);
1985 kfree(str);
1986 return re;
1987}
1988
1989static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
1990 int reset, char *re, int len)
1991{
1992 int ret;
1993
1994 if (filter)
1995 ret = ftrace_set_filter(ops, re, len, reset);
1996 else
1997 ret = ftrace_set_notrace(ops, re, len, reset);
1998
1999 return ret;
2000}
2001
2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data)
2004{
2005 int i, re_cnt, ret = -EINVAL;
2006 int *reset;
2007 char **re;
2008
2009 reset = filter ? &data->first_filter : &data->first_notrace;
2010
2011 /*
2012 * The 'ip' field could have multiple filters set, separated
2013 * either by space or comma. We first cut the filter and apply
2014 * all pieces separately.
2015 */
2016 re = ftrace_function_filter_re(buf, len, &re_cnt);
2017 if (!re)
2018 return -EINVAL;
2019
2020 for (i = 0; i < re_cnt; i++) {
2021 ret = ftrace_function_set_regexp(data->ops, filter, *reset,
2022 re[i], strlen(re[i]));
2023 if (ret)
2024 break;
2025
2026 if (*reset)
2027 *reset = 0;
2028 }
2029
2030 argv_free(re);
2031 return ret;
2032}
2033
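The removed helpers above lean on argv_split(): commas in the 'ip' filter value are first rewritten to spaces, the string is split into individual glob patterns, and each pattern is fed to ftrace_set_filter() or ftrace_set_notrace(), resetting the hash only for the first one. A condensed sketch of that flow (the helper name is invented; error handling is trimmed):

/* Sketch only: apply a comma/space-separated pattern list via argv_split(). */
static int apply_patterns(struct ftrace_ops *ops, char *buf)
{
        char *sep, **argv;
        int i, argc, ret = 0;

        while ((sep = strchr(buf, ',')))        /* argv_split() splits on whitespace */
                *sep = ' ';

        argv = argv_split(GFP_KERNEL, buf, &argc);
        if (!argv)
                return -ENOMEM;

        for (i = 0; i < argc && !ret; i++)
                ret = ftrace_set_filter(ops, argv[i], strlen(argv[i]), i == 0);

        argv_free(argv);
        return ret;
}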
2034static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
2035{
2036 struct ftrace_event_field *field = pred->field;
2037
2038 if (leaf) {
2039 /*
2040 * Check the leaf predicate for function trace, verify:
2041 * - only '==' and '!=' is used
2042 * - the 'ip' field is used
2043 */
2044 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2045 return -EINVAL;
2046
2047 if (strcmp(field->name, "ip"))
2048 return -EINVAL;
2049 } else {
2050 /*
2051 * Check the non leaf predicate for function trace, verify:
2052 * - only '||' is used
2053 */
2054 if (pred->op != OP_OR)
2055 return -EINVAL;
2056 }
2057
2058 return 0;
2059}
2060
2061static int ftrace_function_set_filter_cb(enum move_type move,
2062 struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 /* Checking the node is valid for function trace. */
2066 if ((move != MOVE_DOWN) ||
2067 (pred->left != FILTER_PRED_INVALID)) {
2068 *err = ftrace_function_check_pred(pred, 0);
2069 } else {
2070 *err = ftrace_function_check_pred(pred, 1);
2071 if (*err)
2072 return WALK_PRED_ABORT;
2073
2074 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2075 pred->regex.pattern,
2076 pred->regex.len,
2077 data);
2078 }
2079
2080 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
2081}
2082
2083static int ftrace_function_set_filter(struct perf_event *event,
2084 struct event_filter *filter)
2085{
2086 struct function_filter_data data = {
2087 .first_filter = 1,
2088 .first_notrace = 1,
2089 .ops = &event->ftrace_ops,
2090 };
2091
2092 return walk_pred_tree(filter->preds, filter->root,
2093 ftrace_function_set_filter_cb, &data);
2094}
2095#else
2096static int ftrace_function_set_filter(struct perf_event *event,
2097 struct event_filter *filter)
2098{
2099 return -ENODEV;
2100}
2101#endif /* CONFIG_FUNCTION_TRACER */
2102
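Taken together, ftrace_function_check_pred() and ftrace_function_set_filter_cb() above restrict perf 'ip' filters for function events to OR-combinations of equality tests on the ip field: an expression such as ip == sys_read || ip != sys_write passes (with '==' mapped to ftrace_set_filter() and '!=' to ftrace_set_notrace()), while any '&&', or a comparison on another field, is rejected with -EINVAL.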
2103int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1955int ftrace_profile_set_filter(struct perf_event *event, int event_id,
2104 char *filter_str) 1956 char *filter_str)
2105{ 1957{
2106 int err; 1958 int err;
2107 struct event_filter *filter; 1959 struct event_filter *filter;
2108 struct ftrace_event_call *call; 1960 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL;
2109 1962
2110 mutex_lock(&event_mutex); 1963 mutex_lock(&event_mutex);
2111 1964
2112 call = event->tp_event; 1965 list_for_each_entry(call, &ftrace_events, list) {
1966 if (call->event.type == event_id)
1967 break;
1968 }
2113 1969
2114 err = -EINVAL; 1970 err = -EINVAL;
2115 if (!call) 1971 if (&call->list == &ftrace_events)
2116 goto out_unlock; 1972 goto out_unlock;
2117 1973
2118 err = -EEXIST; 1974 err = -EEXIST;
2119 if (event->filter) 1975 if (event->filter)
2120 goto out_unlock; 1976 goto out_unlock;
2121 1977
2122 err = create_filter(call, filter_str, false, &filter); 1978 filter = __alloc_filter();
2123 if (err) 1979 if (!filter) {
 1980 err = -ENOMEM;
1981 goto out_unlock;
1982 }
1983
1984 err = -ENOMEM;
1985 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1986 if (!ps)
2124 goto free_filter; 1987 goto free_filter;
2125 1988
2126 if (ftrace_event_is_function(call)) 1989 parse_init(ps, filter_ops, filter_str);
2127 err = ftrace_function_set_filter(event, filter); 1990 err = filter_parse(ps);
2128 else 1991 if (err)
1992 goto free_ps;
1993
1994 err = replace_preds(call, filter, ps, filter_str, false);
1995 if (!err)
2129 event->filter = filter; 1996 event->filter = filter;
2130 1997
1998free_ps:
1999 filter_opstack_clear(ps);
2000 postfix_clear(ps);
2001 kfree(ps);
2002
2131free_filter: 2003free_filter:
2132 if (err || ftrace_event_is_function(call)) 2004 if (err)
2133 __free_filter(filter); 2005 __free_filter(filter);
2134 2006
2135out_unlock: 2007out_unlock:
@@ -2140,179 +2012,3 @@ out_unlock:
2140 2012
2141#endif /* CONFIG_PERF_EVENTS */ 2013#endif /* CONFIG_PERF_EVENTS */
2142 2014
2143#ifdef CONFIG_FTRACE_STARTUP_TEST
2144
2145#include <linux/types.h>
2146#include <linux/tracepoint.h>
2147
2148#define CREATE_TRACE_POINTS
2149#include "trace_events_filter_test.h"
2150
2151#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
2152{ \
2153 .filter = FILTER, \
2154 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
2155 .e = ve, .f = vf, .g = vg, .h = vh }, \
2156 .match = m, \
2157 .not_visited = nvisit, \
2158}
2159#define YES 1
2160#define NO 0
2161
2162static struct test_filter_data_t {
2163 char *filter;
2164 struct ftrace_raw_ftrace_test_filter rec;
2165 int match;
2166 char *not_visited;
2167} test_filter_data[] = {
2168#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
2169 "e == 1 && f == 1 && g == 1 && h == 1"
2170 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
2171 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
2172 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
2173#undef FILTER
2174#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2175 "e == 1 || f == 1 || g == 1 || h == 1"
2176 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2177 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2178 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2179#undef FILTER
2180#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2181 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2182 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2183 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2184 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2185 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2186#undef FILTER
2187#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2188 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2189 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2190 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2191 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2192#undef FILTER
2193#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2194 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2195 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2196 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2197 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2198#undef FILTER
2199#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2200 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2201 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2202 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2203 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2204#undef FILTER
2205#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2206 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2207 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2208 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2209 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2210#undef FILTER
2211#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2212 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2213 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2214 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2215 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2216};
2217
2218#undef DATA_REC
2219#undef FILTER
2220#undef YES
2221#undef NO
2222
2223#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2224
2225static int test_pred_visited;
2226
2227static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2228{
2229 struct ftrace_event_field *field = pred->field;
2230
2231 test_pred_visited = 1;
2232 printk(KERN_INFO "\npred visited %s\n", field->name);
2233 return 1;
2234}
2235
2236static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2237 int *err, void *data)
2238{
2239 char *fields = data;
2240
2241 if ((move == MOVE_DOWN) &&
2242 (pred->left == FILTER_PRED_INVALID)) {
2243 struct ftrace_event_field *field = pred->field;
2244
2245 if (!field) {
2246 WARN(1, "all leafs should have field defined");
2247 return WALK_PRED_DEFAULT;
2248 }
2249 if (!strchr(fields, *field->name))
2250 return WALK_PRED_DEFAULT;
2251
2252 WARN_ON(!pred->fn);
2253 pred->fn = test_pred_visited_fn;
2254 }
2255 return WALK_PRED_DEFAULT;
2256}
2257
2258static __init int ftrace_test_event_filter(void)
2259{
2260 int i;
2261
2262 printk(KERN_INFO "Testing ftrace filter: ");
2263
2264 for (i = 0; i < DATA_CNT; i++) {
2265 struct event_filter *filter = NULL;
2266 struct test_filter_data_t *d = &test_filter_data[i];
2267 int err;
2268
2269 err = create_filter(&event_ftrace_test_filter, d->filter,
2270 false, &filter);
2271 if (err) {
2272 printk(KERN_INFO
2273 "Failed to get filter for '%s', err %d\n",
2274 d->filter, err);
2275 __free_filter(filter);
2276 break;
2277 }
2278
2279 /*
2280 * The preemption disabling is not really needed for self
2281 * tests, but the rcu dereference will complain without it.
2282 */
2283 preempt_disable();
2284 if (*d->not_visited)
2285 walk_pred_tree(filter->preds, filter->root,
2286 test_walk_pred_cb,
2287 d->not_visited);
2288
2289 test_pred_visited = 0;
2290 err = filter_match_preds(filter, &d->rec);
2291 preempt_enable();
2292
2293 __free_filter(filter);
2294
2295 if (test_pred_visited) {
2296 printk(KERN_INFO
2297 "Failed, unwanted pred visited for filter %s\n",
2298 d->filter);
2299 break;
2300 }
2301
2302 if (err != d->match) {
2303 printk(KERN_INFO
2304 "Failed to match filter '%s', expected %d\n",
2305 d->filter, d->match);
2306 break;
2307 }
2308 }
2309
2310 if (i == DATA_CNT)
2311 printk(KERN_CONT "OK\n");
2312
2313 return 0;
2314}
2315
2316late_initcall(ftrace_test_event_filter);
2317
2318#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
deleted file mode 100644
index bfd4dba0d60..00000000000
--- a/kernel/trace/trace_events_filter_test.h
+++ /dev/null
@@ -1,50 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037..bbeec31e0ae 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,16 +18,6 @@
18#undef TRACE_SYSTEM 18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace 19#define TRACE_SYSTEM ftrace
20 20
21/*
22 * The FTRACE_ENTRY_REG macro allows ftrace entry to define register
23 * function and thus become accessible via perf.
24 */
25#undef FTRACE_ENTRY_REG
26#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
27 filter, regfn) \
28 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
29 filter)
30
31/* not needed for this file */ 21/* not needed for this file */
32#undef __field_struct 22#undef __field_struct
33#define __field_struct(type, item) 23#define __field_struct(type, item)
@@ -54,22 +44,21 @@
54#define F_printk(fmt, args...) fmt, args 44#define F_printk(fmt, args...) fmt, args
55 45
56#undef FTRACE_ENTRY 46#undef FTRACE_ENTRY
57#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
58struct ____ftrace_##name { \ 48struct ____ftrace_##name { \
59 tstruct \ 49 tstruct \
60}; \ 50}; \
61static void __always_unused ____ftrace_check_##name(void) \ 51static void __always_unused ____ftrace_check_##name(void) \
62{ \ 52{ \
63 struct ____ftrace_##name *__entry = NULL; \ 53 struct ____ftrace_##name *__entry = NULL; \
64 \ 54 \
65 /* force compile-time check on F_printk() */ \ 55 /* force compile-time check on F_printk() */ \
66 printk(print); \ 56 printk(print); \
67} 57}
68 58
69#undef FTRACE_ENTRY_DUP 59#undef FTRACE_ENTRY_DUP
70#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ 60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
71 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ 61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
72 filter)
73 62
74#include "trace_entries.h" 63#include "trace_entries.h"
75 64
@@ -78,7 +67,7 @@ static void __always_unused ____ftrace_check_##name(void) \
78 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
79 offsetof(typeof(field), item), \ 68 offsetof(typeof(field), item), \
80 sizeof(field.item), \ 69 sizeof(field.item), \
81 is_signed_type(type), filter_type); \ 70 is_signed_type(type), FILTER_OTHER); \
82 if (ret) \ 71 if (ret) \
83 return ret; 72 return ret;
84 73
@@ -88,7 +77,7 @@ static void __always_unused ____ftrace_check_##name(void) \
88 offsetof(typeof(field), \ 77 offsetof(typeof(field), \
89 container.item), \ 78 container.item), \
90 sizeof(field.container.item), \ 79 sizeof(field.container.item), \
91 is_signed_type(type), filter_type); \ 80 is_signed_type(type), FILTER_OTHER); \
92 if (ret) \ 81 if (ret) \
93 return ret; 82 return ret;
94 83
@@ -102,7 +91,7 @@ static void __always_unused ____ftrace_check_##name(void) \
102 ret = trace_define_field(event_call, event_storage, #item, \ 91 ret = trace_define_field(event_call, event_storage, #item, \
103 offsetof(typeof(field), item), \ 92 offsetof(typeof(field), item), \
104 sizeof(field.item), \ 93 sizeof(field.item), \
105 is_signed_type(type), filter_type); \ 94 is_signed_type(type), FILTER_OTHER); \
106 mutex_unlock(&event_storage_mutex); \ 95 mutex_unlock(&event_storage_mutex); \
107 if (ret) \ 96 if (ret) \
108 return ret; \ 97 return ret; \
@@ -115,7 +104,7 @@ static void __always_unused ____ftrace_check_##name(void) \
115 offsetof(typeof(field), \ 104 offsetof(typeof(field), \
116 container.item), \ 105 container.item), \
117 sizeof(field.container.item), \ 106 sizeof(field.container.item), \
118 is_signed_type(type), filter_type); \ 107 is_signed_type(type), FILTER_OTHER); \
119 if (ret) \ 108 if (ret) \
120 return ret; 109 return ret;
121 110
@@ -123,18 +112,17 @@ static void __always_unused ____ftrace_check_##name(void) \
123#define __dynamic_array(type, item) \ 112#define __dynamic_array(type, item) \
124 ret = trace_define_field(event_call, #type, #item, \ 113 ret = trace_define_field(event_call, #type, #item, \
125 offsetof(typeof(field), item), \ 114 offsetof(typeof(field), item), \
126 0, is_signed_type(type), filter_type);\ 115 0, is_signed_type(type), FILTER_OTHER);\
127 if (ret) \ 116 if (ret) \
128 return ret; 117 return ret;
129 118
130#undef FTRACE_ENTRY 119#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
132int \ 121int \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 122ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 123{ \
135 struct struct_name field; \ 124 struct struct_name field; \
136 int ret; \ 125 int ret; \
137 int filter_type = filter; \
138 \ 126 \
139 tstruct; \ 127 tstruct; \
140 \ 128 \
@@ -162,17 +150,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
162#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
163 151
164#undef F_printk 152#undef F_printk
165#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
166 154
167#undef FTRACE_ENTRY_REG 155#undef FTRACE_ENTRY
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
169 regfn) \
170 \ 157 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 158struct ftrace_event_class event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 159 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 160 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
175 .reg = regfn, \
176}; \ 162}; \
177 \ 163 \
178struct ftrace_event_call __used event_##call = { \ 164struct ftrace_event_call __used event_##call = { \
@@ -180,19 +166,8 @@ struct ftrace_event_call __used event_##call = { \
180 .event.type = etype, \ 166 .event.type = etype, \
181 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
182 .print_fmt = print, \ 168 .print_fmt = print, \
183 .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
184}; \ 169}; \
185struct ftrace_event_call __used \ 170struct ftrace_event_call __used \
186__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
187 172
188#undef FTRACE_ENTRY
189#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
190 FTRACE_ENTRY_REG(call, struct_name, etype, \
191 PARAMS(tstruct), PARAMS(print), filter, NULL)
192
193int ftrace_event_is_function(struct ftrace_event_call *call)
194{
195 return call == &event_function;
196}
197
198#include "trace_entries.h" 173#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab..c7b0c6a7db0 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -48,8 +48,7 @@ static void function_trace_start(struct trace_array *tr)
48} 48}
49 49
50static void 50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, 51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{ 52{
54 struct trace_array *tr = func_trace; 53 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
@@ -75,17 +74,8 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
75 preempt_enable_notrace(); 74 preempt_enable_notrace();
76} 75}
77 76
78/* Our option */
79enum {
80 TRACE_FUNC_OPT_STACK = 0x1,
81};
82
83static struct tracer_flags func_flags;
84
85static void 77static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 78function_trace_call(unsigned long ip, unsigned long parent_ip)
87 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 79{
90 struct trace_array *tr = func_trace; 80 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 81 struct trace_array_cpu *data;
@@ -116,8 +106,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
116} 106}
117 107
118static void 108static void
119function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 109function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 struct ftrace_ops *op, struct pt_regs *pt_regs)
121{ 110{
122 struct trace_array *tr = func_trace; 111 struct trace_array *tr = func_trace;
123 struct trace_array_cpu *data; 112 struct trace_array_cpu *data;
@@ -160,13 +149,18 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
160static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
161{ 150{
162 .func = function_trace_call, 151 .func = function_trace_call,
163 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 152 .flags = FTRACE_OPS_FL_GLOBAL,
164}; 153};
165 154
166static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
167{ 156{
168 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
169 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 158 .flags = FTRACE_OPS_FL_GLOBAL,
159};
160
161/* Our two options */
162enum {
163 TRACE_FUNC_OPT_STACK = 0x1,
170}; 164};
171 165
172static struct tracer_opt func_opts[] = { 166static struct tracer_opt func_opts[] = {
@@ -210,11 +204,10 @@ static void tracing_stop_function_trace(void)
210 204
211static int func_set_flag(u32 old_flags, u32 bit, int set) 205static int func_set_flag(u32 old_flags, u32 bit, int set)
212{ 206{
213 switch (bit) { 207 if (bit == TRACE_FUNC_OPT_STACK) {
214 case TRACE_FUNC_OPT_STACK:
215 /* do nothing if already set */ 208 /* do nothing if already set */
216 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 209 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
217 break; 210 return 0;
218 211
219 if (set) { 212 if (set) {
220 unregister_ftrace_function(&trace_ops); 213 unregister_ftrace_function(&trace_ops);
@@ -224,12 +217,10 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
224 register_ftrace_function(&trace_ops); 217 register_ftrace_function(&trace_ops);
225 } 218 }
226 219
227 break; 220 return 0;
228 default:
229 return -EINVAL;
230 } 221 }
231 222
232 return 0; 223 return -EINVAL;
233} 224}
234 225
235static struct tracer function_trace __read_mostly = 226static struct tracer function_trace __read_mostly =
@@ -366,7 +357,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
366 * We use the callback data field (which is a pointer) 357 * We use the callback data field (which is a pointer)
367 * as our counter. 358 * as our counter.
368 */ 359 */
369 ret = kstrtoul(number, 0, (unsigned long *)&count); 360 ret = strict_strtoul(number, 0, (unsigned long *)&count);
370 if (ret) 361 if (ret)
371 return ret; 362 return ret;
372 363
@@ -411,4 +402,5 @@ static __init int init_function_trace(void)
411 init_func_cmd_traceon(); 402 init_func_cmd_traceon();
412 return register_tracer(&function_trace); 403 return register_tracer(&function_trace);
413} 404}
414core_initcall(init_function_trace); 405device_initcall(init_function_trace);
406
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7..a7d2a4c653d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
143 return; 143 return;
144 } 144 }
145 145
146#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) 146#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
147 /* 147 /*
148 * The arch may choose to record the frame pointer used 148 * The arch may choose to record the frame pointer used
149 * and check it here to make sure that it is what we expect it 149 * and check it here to make sure that it is what we expect it
@@ -154,9 +154,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
154 * 154 *
155 * Currently, x86_32 with optimize for size (-Os) makes the latest 155 * Currently, x86_32 with optimize for size (-Os) makes the latest
156 * gcc do the above. 156 * gcc do the above.
157 *
158 * Note, -mfentry does not use frame pointers, and this test
159 * is not needed if CC_USING_FENTRY is set.
160 */ 157 */
161 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 158 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
162 ftrace_graph_stop(); 159 ftrace_graph_stop();
@@ -223,7 +220,7 @@ int __trace_graph_entry(struct trace_array *tr,
223 entry = ring_buffer_event_data(event); 220 entry = ring_buffer_event_data(event);
224 entry->graph_ent = *trace; 221 entry->graph_ent = *trace;
225 if (!filter_current_check_discard(buffer, call, entry, event)) 222 if (!filter_current_check_discard(buffer, call, entry, event))
226 __buffer_unlock_commit(buffer, event); 223 ring_buffer_unlock_commit(buffer, event);
227 224
228 return 1; 225 return 1;
229} 226}
@@ -327,7 +324,7 @@ void __trace_graph_return(struct trace_array *tr,
327 entry = ring_buffer_event_data(event); 324 entry = ring_buffer_event_data(event);
328 entry->ret = *trace; 325 entry->ret = *trace;
329 if (!filter_current_check_discard(buffer, call, entry, event)) 326 if (!filter_current_check_discard(buffer, call, entry, event))
330 __buffer_unlock_commit(buffer, event); 327 ring_buffer_unlock_commit(buffer, event);
331} 328}
332 329
333void trace_graph_return(struct ftrace_graph_ret *trace) 330void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -541,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
541 next = &data->ret; 538 next = &data->ret;
542 } else { 539 } else {
543 540
544 ring_iter = trace_buffer_iter(iter, iter->cpu); 541 ring_iter = iter->buffer_iter[iter->cpu];
545 542
546 /* First peek to compare current entry and the next one */ 543 /* First peek to compare current entry and the next one */
547 if (ring_iter) 544 if (ring_iter)
@@ -1474,4 +1471,4 @@ static __init int init_graph_trace(void)
1474 return register_tracer(&graph_trace); 1471 return register_tracer(&graph_trace);
1475} 1472}
1476 1473
1477core_initcall(init_graph_trace); 1474device_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac488..667aa8cc0cf 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_RAW_SPINLOCK(max_trace_lock); 26static DEFINE_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -136,8 +136,7 @@ static int func_prolog_dec(struct trace_array *tr,
136 * irqsoff uses its own tracer function to keep the overhead down: 136 * irqsoff uses its own tracer function to keep the overhead down:
137 */ 137 */
138static void 138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, 139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140 struct ftrace_ops *op, struct pt_regs *pt_regs)
141{ 140{
142 struct trace_array *tr = irqsoff_trace; 141 struct trace_array *tr = irqsoff_trace;
143 struct trace_array_cpu *data; 142 struct trace_array_cpu *data;
@@ -154,7 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
154static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
155{ 154{
156 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
157 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 156 .flags = FTRACE_OPS_FL_GLOBAL,
158}; 157};
159#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
160 159
@@ -281,20 +280,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
281} 280}
282 281
283static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 284static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 285static void irqsoff_trace_close(struct trace_iterator *iter) { }
286
287#ifdef CONFIG_FUNCTION_TRACER
288static void irqsoff_print_header(struct seq_file *s)
289{
290 trace_default_header(s);
291}
292#else
293static void irqsoff_print_header(struct seq_file *s)
294{
295 trace_latency_header(s);
296}
297#endif /* CONFIG_FUNCTION_TRACER */
298#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
299 287
300/* 288/*
@@ -333,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
333 if (!report_latency(delta)) 321 if (!report_latency(delta))
334 goto out; 322 goto out;
335 323
336 raw_spin_lock_irqsave(&max_trace_lock, flags); 324 spin_lock_irqsave(&max_trace_lock, flags);
337 325
338 /* check if we are still the max latency */ 326 /* check if we are still the max latency */
339 if (!report_latency(delta)) 327 if (!report_latency(delta))
@@ -356,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
356 max_sequence++; 344 max_sequence++;
357 345
358out_unlock: 346out_unlock:
359 raw_spin_unlock_irqrestore(&max_trace_lock, flags); 347 spin_unlock_irqrestore(&max_trace_lock, flags);
360 348
361out: 349out:
362 data->critical_sequence = max_sequence; 350 data->critical_sequence = max_sequence;
@@ -517,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
517#ifdef CONFIG_PREEMPT_TRACER 505#ifdef CONFIG_PREEMPT_TRACER
518void trace_preempt_on(unsigned long a0, unsigned long a1) 506void trace_preempt_on(unsigned long a0, unsigned long a1)
519{ 507{
520 if (preempt_trace() && !irq_trace()) 508 if (preempt_trace())
521 stop_critical_timing(a0, a1); 509 stop_critical_timing(a0, a1);
522} 510}
523 511
524void trace_preempt_off(unsigned long a0, unsigned long a1) 512void trace_preempt_off(unsigned long a0, unsigned long a1)
525{ 513{
526 if (preempt_trace() && !irq_trace()) 514 if (preempt_trace())
527 start_critical_timing(a0, a1); 515 start_critical_timing(a0, a1);
528} 516}
529#endif /* CONFIG_PREEMPT_TRACER */ 517#endif /* CONFIG_PREEMPT_TRACER */
@@ -604,7 +592,7 @@ static struct tracer irqsoff_tracer __read_mostly =
604 .reset = irqsoff_tracer_reset, 592 .reset = irqsoff_tracer_reset,
605 .start = irqsoff_tracer_start, 593 .start = irqsoff_tracer_start,
606 .stop = irqsoff_tracer_stop, 594 .stop = irqsoff_tracer_stop,
607 .print_max = true, 595 .print_max = 1,
608 .print_header = irqsoff_print_header, 596 .print_header = irqsoff_print_header,
609 .print_line = irqsoff_print_line, 597 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 598 .flags = &tracer_flags,
@@ -614,7 +602,7 @@ static struct tracer irqsoff_tracer __read_mostly =
614#endif 602#endif
615 .open = irqsoff_trace_open, 603 .open = irqsoff_trace_open,
616 .close = irqsoff_trace_close, 604 .close = irqsoff_trace_close,
617 .use_max_tr = true, 605 .use_max_tr = 1,
618}; 606};
619# define register_irqsoff(trace) register_tracer(&trace) 607# define register_irqsoff(trace) register_tracer(&trace)
620#else 608#else
@@ -637,7 +625,7 @@ static struct tracer preemptoff_tracer __read_mostly =
637 .reset = irqsoff_tracer_reset, 625 .reset = irqsoff_tracer_reset,
638 .start = irqsoff_tracer_start, 626 .start = irqsoff_tracer_start,
639 .stop = irqsoff_tracer_stop, 627 .stop = irqsoff_tracer_stop,
640 .print_max = true, 628 .print_max = 1,
641 .print_header = irqsoff_print_header, 629 .print_header = irqsoff_print_header,
642 .print_line = irqsoff_print_line, 630 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 631 .flags = &tracer_flags,
@@ -647,7 +635,7 @@ static struct tracer preemptoff_tracer __read_mostly =
647#endif 635#endif
648 .open = irqsoff_trace_open, 636 .open = irqsoff_trace_open,
649 .close = irqsoff_trace_close, 637 .close = irqsoff_trace_close,
650 .use_max_tr = true, 638 .use_max_tr = 1,
651}; 639};
652# define register_preemptoff(trace) register_tracer(&trace) 640# define register_preemptoff(trace) register_tracer(&trace)
653#else 641#else
@@ -672,7 +660,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
672 .reset = irqsoff_tracer_reset, 660 .reset = irqsoff_tracer_reset,
673 .start = irqsoff_tracer_start, 661 .start = irqsoff_tracer_start,
674 .stop = irqsoff_tracer_stop, 662 .stop = irqsoff_tracer_stop,
675 .print_max = true, 663 .print_max = 1,
676 .print_header = irqsoff_print_header, 664 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line, 665 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 666 .flags = &tracer_flags,
@@ -682,7 +670,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
682#endif 670#endif
683 .open = irqsoff_trace_open, 671 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close, 672 .close = irqsoff_trace_close,
685 .use_max_tr = true, 673 .use_max_tr = 1,
686}; 674};
687 675
688# define register_preemptirqsoff(trace) register_tracer(&trace) 676# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -698,4 +686,4 @@ __init static int init_irqsoff_tracer(void)
698 686
699 return 0; 687 return 0;
700} 688}
701core_initcall(init_irqsoff_tracer); 689device_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1865d5f7653..00d527c945a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -19,15 +19,547 @@
19 19
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <linux/limits.h>
34#include <asm/bitsperlong.h>
35
36#include "trace.h"
37#include "trace_output.h"
38
39#define MAX_TRACE_ARGS 128
40#define MAX_ARGSTR_LEN 63
41#define MAX_EVENT_NAME_LEN 64
42#define MAX_STRING_SIZE PATH_MAX
43#define KPROBE_EVENT_SYSTEM "kprobes"
22 44
23#include "trace_probe.h" 45/* Reserved field names */
46#define FIELD_STRING_IP "__probe_ip"
47#define FIELD_STRING_RETIP "__probe_ret_ip"
48#define FIELD_STRING_FUNC "__probe_func"
49
50const char *reserved_field_names[] = {
51 "common_type",
52 "common_flags",
53 "common_preempt_count",
54 "common_pid",
55 "common_tgid",
56 FIELD_STRING_IP,
57 FIELD_STRING_RETIP,
58 FIELD_STRING_FUNC,
59};
24 60
25#define KPROBE_EVENT_SYSTEM "kprobes" 61/* Printing function type */
62typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
63 void *);
64#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
65#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
66
67/* Printing in basic type function template */
68#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
69static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
70 const char *name, \
71 void *data, void *ent)\
72{ \
73 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
74} \
75static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
76
77DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
78DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
80DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
82DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
83DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
84DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
85
86/* data_rloc: data relative location, compatible with u32 */
87#define make_data_rloc(len, roffs) \
88 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
89#define get_rloc_len(dl) ((u32)(dl) >> 16)
90#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
91
92static inline void *get_rloc_data(u32 *dl)
93{
94 return (u8 *)dl + get_rloc_offs(*dl);
95}
96
97/* For data_loc conversion */
98static inline void *get_loc_data(u32 *dl, void *ent)
99{
100 return (u8 *)ent + get_rloc_offs(*dl);
101}
102
103/*
104 * Convert data_rloc to data_loc:
105 * data_rloc stores the offset from data_rloc itself, but data_loc
106 * stores the offset from event entry.
107 */
108#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
109
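For orientation while reading the macros above: a data_rloc is a single u32 holding the dynamic-data length in its upper 16 bits and the offset in its lower 16 bits. A minimal userspace sketch (the stdint types and the main() harness are mine; the packing itself is copied from the macros above) that round-trips the encoding:

    #include <stdint.h>
    #include <assert.h>

    #define make_data_rloc(len, roffs) \
            (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
    #define get_rloc_len(dl)        ((uint32_t)(dl) >> 16)
    #define get_rloc_offs(dl)       ((uint32_t)(dl) & 0xffff)

    int main(void)
    {
            /* a 5-byte string stored 0x20 bytes past the rloc word */
            uint32_t dl = make_data_rloc(5, 0x20);

            assert(get_rloc_len(dl) == 5);
            assert(get_rloc_offs(dl) == 0x20);
            return 0;
    }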
110/* For defining macros, define string/string_size types */
111typedef u32 string;
112typedef u32 string_size;
113
114/* Print type function for string type */
115static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
116 const char *name,
117 void *data, void *ent)
118{
119 int len = *(u32 *)data >> 16;
120
121 if (!len)
122 return trace_seq_printf(s, " %s=(fault)", name);
123 else
124 return trace_seq_printf(s, " %s=\"%s\"", name,
125 (const char *)get_loc_data(data, ent));
126}
127static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
128
129/* Data fetch function type */
130typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
131
132struct fetch_param {
133 fetch_func_t fn;
134 void *data;
135};
136
137static __kprobes void call_fetch(struct fetch_param *fprm,
138 struct pt_regs *regs, void *dest)
139{
140 return fprm->fn(regs, fprm->data, dest);
141}
142
143#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
144/*
145 * Define macro for basic types - we don't need to define s* types, because
146 * we have to care only about bitwidth at recording time.
147 */
148#define DEFINE_BASIC_FETCH_FUNCS(method) \
149DEFINE_FETCH_##method(u8) \
150DEFINE_FETCH_##method(u16) \
151DEFINE_FETCH_##method(u32) \
152DEFINE_FETCH_##method(u64)
153
154#define CHECK_FETCH_FUNCS(method, fn) \
155 (((FETCH_FUNC_NAME(method, u8) == fn) || \
156 (FETCH_FUNC_NAME(method, u16) == fn) || \
157 (FETCH_FUNC_NAME(method, u32) == fn) || \
158 (FETCH_FUNC_NAME(method, u64) == fn) || \
159 (FETCH_FUNC_NAME(method, string) == fn) || \
160 (FETCH_FUNC_NAME(method, string_size) == fn)) \
161 && (fn != NULL))
162
163/* Data fetch function templates */
164#define DEFINE_FETCH_reg(type) \
165static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
166 void *offset, void *dest) \
167{ \
168 *(type *)dest = (type)regs_get_register(regs, \
169 (unsigned int)((unsigned long)offset)); \
170}
171DEFINE_BASIC_FETCH_FUNCS(reg)
172/* No string on the register */
173#define fetch_reg_string NULL
174#define fetch_reg_string_size NULL
175
176#define DEFINE_FETCH_stack(type) \
177static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
178 void *offset, void *dest) \
179{ \
180 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
181 (unsigned int)((unsigned long)offset)); \
182}
183DEFINE_BASIC_FETCH_FUNCS(stack)
184/* No string on the stack entry */
185#define fetch_stack_string NULL
186#define fetch_stack_string_size NULL
187
188#define DEFINE_FETCH_retval(type) \
189static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
190 void *dummy, void *dest) \
191{ \
192 *(type *)dest = (type)regs_return_value(regs); \
193}
194DEFINE_BASIC_FETCH_FUNCS(retval)
195/* No string on the retval */
196#define fetch_retval_string NULL
197#define fetch_retval_string_size NULL
198
199#define DEFINE_FETCH_memory(type) \
200static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
201 void *addr, void *dest) \
202{ \
203 type retval; \
204 if (probe_kernel_address(addr, retval)) \
205 *(type *)dest = 0; \
206 else \
207 *(type *)dest = retval; \
208}
209DEFINE_BASIC_FETCH_FUNCS(memory)
210/*
211 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
212 * length and relative data location.
213 */
214static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
215 void *addr, void *dest)
216{
217 long ret;
218 int maxlen = get_rloc_len(*(u32 *)dest);
219 u8 *dst = get_rloc_data(dest);
220 u8 *src = addr;
221 mm_segment_t old_fs = get_fs();
222 if (!maxlen)
223 return;
224 /*
225 * Try to get string again, since the string can be changed while
226 * probing.
227 */
228 set_fs(KERNEL_DS);
229 pagefault_disable();
230 do
231 ret = __copy_from_user_inatomic(dst++, src++, 1);
232 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
233 dst[-1] = '\0';
234 pagefault_enable();
235 set_fs(old_fs);
236
237 if (ret < 0) { /* Failed to fetch string */
238 ((u8 *)get_rloc_data(dest))[0] = '\0';
239 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
240 } else
241 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
242 get_rloc_offs(*(u32 *)dest));
243}
244/* Return the length of string -- including null terminal byte */
245static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
246 void *addr, void *dest)
247{
248 int ret, len = 0;
249 u8 c;
250 mm_segment_t old_fs = get_fs();
251
252 set_fs(KERNEL_DS);
253 pagefault_disable();
254 do {
255 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
256 len++;
257 } while (c && ret == 0 && len < MAX_STRING_SIZE);
258 pagefault_enable();
259 set_fs(old_fs);
260
261 if (ret < 0) /* Failed to check the length */
262 *(u32 *)dest = 0;
263 else
264 *(u32 *)dest = len;
265}
266
267/* Memory fetching by symbol */
268struct symbol_cache {
269 char *symbol;
270 long offset;
271 unsigned long addr;
272};
273
274static unsigned long update_symbol_cache(struct symbol_cache *sc)
275{
276 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
277 if (sc->addr)
278 sc->addr += sc->offset;
279 return sc->addr;
280}
281
282static void free_symbol_cache(struct symbol_cache *sc)
283{
284 kfree(sc->symbol);
285 kfree(sc);
286}
287
288static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
289{
290 struct symbol_cache *sc;
291
292 if (!sym || strlen(sym) == 0)
293 return NULL;
294 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
295 if (!sc)
296 return NULL;
297
298 sc->symbol = kstrdup(sym, GFP_KERNEL);
299 if (!sc->symbol) {
300 kfree(sc);
301 return NULL;
302 }
303 sc->offset = offset;
304
305 update_symbol_cache(sc);
306 return sc;
307}
308
309#define DEFINE_FETCH_symbol(type) \
310static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
311 void *data, void *dest) \
312{ \
313 struct symbol_cache *sc = data; \
314 if (sc->addr) \
315 fetch_memory_##type(regs, (void *)sc->addr, dest); \
316 else \
317 *(type *)dest = 0; \
318}
319DEFINE_BASIC_FETCH_FUNCS(symbol)
320DEFINE_FETCH_symbol(string)
321DEFINE_FETCH_symbol(string_size)
322
323/* Dereference memory access function */
324struct deref_fetch_param {
325 struct fetch_param orig;
326 long offset;
327};
328
329#define DEFINE_FETCH_deref(type) \
330static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
331 void *data, void *dest) \
332{ \
333 struct deref_fetch_param *dprm = data; \
334 unsigned long addr; \
335 call_fetch(&dprm->orig, regs, &addr); \
336 if (addr) { \
337 addr += dprm->offset; \
338 fetch_memory_##type(regs, (void *)addr, dest); \
339 } else \
340 *(type *)dest = 0; \
341}
342DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size)
345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
355{
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 free_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 free_symbol_cache(data->orig.data);
360 kfree(data);
361}
362
363/* Bitfield fetch function */
364struct bitfield_fetch_param {
365 struct fetch_param orig;
366 unsigned char hi_shift;
367 unsigned char low_shift;
368};
369
370#define DEFINE_FETCH_bitfield(type) \
371static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
372 void *data, void *dest) \
373{ \
374 struct bitfield_fetch_param *bprm = data; \
375 type buf = 0; \
376 call_fetch(&bprm->orig, regs, &buf); \
377 if (buf) { \
378 buf <<= bprm->hi_shift; \
379 buf >>= bprm->low_shift; \
380 } \
381 *(type *)dest = buf; \
382}
383DEFINE_BASIC_FETCH_FUNCS(bitfield)
384#define fetch_bitfield_string NULL
385#define fetch_bitfield_string_size NULL
386
387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
402{
403 /*
404 * Don't check the bitfield itself, because this must be the
405 * last fetch function.
406 */
407 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
408 free_deref_fetch_param(data->orig.data);
409 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
410 free_symbol_cache(data->orig.data);
411 kfree(data);
412}
413
414/* Default (unsigned long) fetch type */
415#define __DEFAULT_FETCH_TYPE(t) u##t
416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
417#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
418#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
419
420/* Fetch types */
421enum {
422 FETCH_MTD_reg = 0,
423 FETCH_MTD_stack,
424 FETCH_MTD_retval,
425 FETCH_MTD_memory,
426 FETCH_MTD_symbol,
427 FETCH_MTD_deref,
428 FETCH_MTD_bitfield,
429 FETCH_MTD_END,
430};
431
432#define ASSIGN_FETCH_FUNC(method, type) \
433 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
434
435#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
436 {.name = _name, \
437 .size = _size, \
438 .is_signed = sign, \
439 .print = PRINT_TYPE_FUNC_NAME(ptype), \
440 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
441 .fmttype = _fmttype, \
442 .fetch = { \
443ASSIGN_FETCH_FUNC(reg, ftype), \
444ASSIGN_FETCH_FUNC(stack, ftype), \
445ASSIGN_FETCH_FUNC(retval, ftype), \
446ASSIGN_FETCH_FUNC(memory, ftype), \
447ASSIGN_FETCH_FUNC(symbol, ftype), \
448ASSIGN_FETCH_FUNC(deref, ftype), \
449ASSIGN_FETCH_FUNC(bitfield, ftype), \
450 } \
451 }
452
453#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
454 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
455
456#define FETCH_TYPE_STRING 0
457#define FETCH_TYPE_STRSIZE 1
458
459/* Fetch type information table */
460static const struct fetch_type {
461 const char *name; /* Name of type */
462 size_t size; /* Byte size of type */
463 int is_signed; /* Signed flag */
464 print_type_func_t print; /* Print functions */
 465	const char		*fmt;		/* Format string */
466 const char *fmttype; /* Name in format file */
467 /* Fetch functions */
468 fetch_func_t fetch[FETCH_MTD_END];
469} fetch_type_table[] = {
470 /* Special types */
471 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
472 sizeof(u32), 1, "__data_loc char[]"),
473 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
474 string_size, sizeof(u32), 0, "u32"),
475 /* Basic types */
476 ASSIGN_FETCH_TYPE(u8, u8, 0),
477 ASSIGN_FETCH_TYPE(u16, u16, 0),
478 ASSIGN_FETCH_TYPE(u32, u32, 0),
479 ASSIGN_FETCH_TYPE(u64, u64, 0),
480 ASSIGN_FETCH_TYPE(s8, u8, 1),
481 ASSIGN_FETCH_TYPE(s16, u16, 1),
482 ASSIGN_FETCH_TYPE(s32, u32, 1),
483 ASSIGN_FETCH_TYPE(s64, u64, 1),
484};
485
486static const struct fetch_type *find_fetch_type(const char *type)
487{
488 int i;
489
490 if (!type)
491 type = DEFAULT_FETCH_TYPE_STR;
492
493 /* Special case: bitfield */
494 if (*type == 'b') {
495 unsigned long bs;
496 type = strchr(type, '/');
497 if (!type)
498 goto fail;
499 type++;
500 if (strict_strtoul(type, 0, &bs))
501 goto fail;
502 switch (bs) {
503 case 8:
504 return find_fetch_type("u8");
505 case 16:
506 return find_fetch_type("u16");
507 case 32:
508 return find_fetch_type("u32");
509 case 64:
510 return find_fetch_type("u64");
511 default:
512 goto fail;
513 }
514 }
515
516 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
517 if (strcmp(type, fetch_type_table[i].name) == 0)
518 return &fetch_type_table[i];
519fail:
520 return NULL;
521}
522
523/* Special function : only accept unsigned long */
524static __kprobes void fetch_stack_address(struct pt_regs *regs,
525 void *dummy, void *dest)
526{
527 *(unsigned long *)dest = kernel_stack_pointer(regs);
528}
529
530static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
531 fetch_func_t orig_fn)
532{
533 int i;
534
535 if (type != &fetch_type_table[FETCH_TYPE_STRING])
536 return NULL; /* Only string type needs size function */
537 for (i = 0; i < FETCH_MTD_END; i++)
538 if (type->fetch[i] == orig_fn)
539 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
540
541 WARN_ON(1); /* This should not happen */
542 return NULL;
543}
26 544
27/** 545/**
28 * Kprobe event core functions 546 * Kprobe event core functions
29 */ 547 */
30 548
549struct probe_arg {
550 struct fetch_param fetch;
551 struct fetch_param fetch_size;
552 unsigned int offset; /* Offset from argument entry */
553 const char *name; /* Name of this argument */
554 const char *comm; /* Command of this argument */
555 const struct fetch_type *type; /* Type of this argument */
556};
557
558/* Flags for trace_probe */
559#define TP_FLAG_TRACE 1
560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
562
31struct trace_probe { 563struct trace_probe {
32 struct list_head list; 564 struct list_head list;
33 struct kretprobe rp; /* Use rp.kp for kprobe use */ 565 struct kretprobe rp; /* Use rp.kp for kprobe use */
@@ -99,6 +631,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
99static int kretprobe_dispatcher(struct kretprobe_instance *ri, 631static int kretprobe_dispatcher(struct kretprobe_instance *ri,
100 struct pt_regs *regs); 632 struct pt_regs *regs);
101 633
634/* Check the name is good for event/group/fields */
635static int is_good_name(const char *name)
636{
637 if (!isalpha(*name) && *name != '_')
638 return 0;
639 while (*++name != '\0') {
640 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
641 return 0;
642 }
643 return 1;
644}
645
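As a quick illustration of the naming rule above: event, group and field names must look like C identifiers, i.e. a letter or '_' followed by letters, digits or '_'. A small userspace sketch (a standalone copy of the same logic, with example names that are purely illustrative):

    #include <ctype.h>
    #include <assert.h>

    /* userspace copy of the rule implemented by is_good_name() above */
    static int is_good_name(const char *name)
    {
            if (!isalpha((unsigned char)*name) && *name != '_')
                    return 0;
            while (*++name != '\0') {
                    if (!isalpha((unsigned char)*name) &&
                        !isdigit((unsigned char)*name) && *name != '_')
                            return 0;
            }
            return 1;
    }

    int main(void)
    {
            assert(is_good_name("myprobe1"));       /* accepted */
            assert(is_good_name("_open"));          /* accepted */
            assert(!is_good_name("1probe"));        /* rejected: starts with a digit */
            assert(!is_good_name("my-probe"));      /* rejected: '-' is not allowed */
            return 0;
    }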
102/* 646/*
103 * Allocate new trace_probe and initialize it (including kprobes). 647 * Allocate new trace_probe and initialize it (including kprobes).
104 */ 648 */
@@ -107,7 +651,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
107 void *addr, 651 void *addr,
108 const char *symbol, 652 const char *symbol,
109 unsigned long offs, 653 unsigned long offs,
110 int nargs, bool is_return) 654 int nargs, int is_return)
111{ 655{
112 struct trace_probe *tp; 656 struct trace_probe *tp;
113 int ret = -ENOMEM; 657 int ret = -ENOMEM;
@@ -158,12 +702,34 @@ error:
158 return ERR_PTR(ret); 702 return ERR_PTR(ret);
159} 703}
160 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
715static void free_probe_arg(struct probe_arg *arg)
716{
717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
718 free_bitfield_fetch_param(arg->fetch.data);
719 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
720 free_deref_fetch_param(arg->fetch.data);
721 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
722 free_symbol_cache(arg->fetch.data);
723 kfree(arg->name);
724 kfree(arg->comm);
725}
726
161static void free_trace_probe(struct trace_probe *tp) 727static void free_trace_probe(struct trace_probe *tp)
162{ 728{
163 int i; 729 int i;
164 730
165 for (i = 0; i < tp->nr_args; i++) 731 for (i = 0; i < tp->nr_args; i++)
166 traceprobe_free_probe_arg(&tp->args[i]); 732 free_probe_arg(&tp->args[i]);
167 733
168 kfree(tp->call.class->system); 734 kfree(tp->call.class->system);
169 kfree(tp->call.name); 735 kfree(tp->call.name);
@@ -221,7 +787,7 @@ static int __register_trace_probe(struct trace_probe *tp)
221 return -EINVAL; 787 return -EINVAL;
222 788
223 for (i = 0; i < tp->nr_args; i++) 789 for (i = 0; i < tp->nr_args; i++)
224 traceprobe_update_arg(&tp->args[i]); 790 update_probe_arg(&tp->args[i]);
225 791
226 /* Set/clear disabled flag according to tp->flag */ 792 /* Set/clear disabled flag according to tp->flag */
227 if (trace_probe_is_enabled(tp)) 793 if (trace_probe_is_enabled(tp))
@@ -353,6 +919,227 @@ static struct notifier_block trace_probe_module_nb = {
353 .priority = 1 /* Invoked after kprobe module callback */ 919 .priority = 1 /* Invoked after kprobe module callback */
354}; 920};
355 921
922/* Split symbol and offset. */
923static int split_symbol_offset(char *symbol, unsigned long *offset)
924{
925 char *tmp;
926 int ret;
927
928 if (!offset)
929 return -EINVAL;
930
931 tmp = strchr(symbol, '+');
932 if (tmp) {
933 /* skip sign because strict_strtol doesn't accept '+' */
934 ret = strict_strtoul(tmp + 1, 0, offset);
935 if (ret)
936 return ret;
937 *tmp = '\0';
938 } else
939 *offset = 0;
940 return 0;
941}
942
943#define PARAM_MAX_ARGS 16
944#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
945
946static int parse_probe_vars(char *arg, const struct fetch_type *t,
947 struct fetch_param *f, int is_return)
948{
949 int ret = 0;
950 unsigned long param;
951
952 if (strcmp(arg, "retval") == 0) {
953 if (is_return)
954 f->fn = t->fetch[FETCH_MTD_retval];
955 else
956 ret = -EINVAL;
957 } else if (strncmp(arg, "stack", 5) == 0) {
958 if (arg[5] == '\0') {
959 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
960 f->fn = fetch_stack_address;
961 else
962 ret = -EINVAL;
963 } else if (isdigit(arg[5])) {
964 ret = strict_strtoul(arg + 5, 10, &param);
965 if (ret || param > PARAM_MAX_STACK)
966 ret = -EINVAL;
967 else {
968 f->fn = t->fetch[FETCH_MTD_stack];
969 f->data = (void *)param;
970 }
971 } else
972 ret = -EINVAL;
973 } else
974 ret = -EINVAL;
975 return ret;
976}
977
978/* Recursive argument parser */
979static int __parse_probe_arg(char *arg, const struct fetch_type *t,
980 struct fetch_param *f, int is_return)
981{
982 int ret = 0;
983 unsigned long param;
984 long offset;
985 char *tmp;
986
987 switch (arg[0]) {
988 case '$':
989 ret = parse_probe_vars(arg + 1, t, f, is_return);
990 break;
991 case '%': /* named register */
992 ret = regs_query_register_offset(arg + 1);
993 if (ret >= 0) {
994 f->fn = t->fetch[FETCH_MTD_reg];
995 f->data = (void *)(unsigned long)ret;
996 ret = 0;
997 }
998 break;
999 case '@': /* memory or symbol */
1000 if (isdigit(arg[1])) {
1001 ret = strict_strtoul(arg + 1, 0, &param);
1002 if (ret)
1003 break;
1004 f->fn = t->fetch[FETCH_MTD_memory];
1005 f->data = (void *)param;
1006 } else {
1007 ret = split_symbol_offset(arg + 1, &offset);
1008 if (ret)
1009 break;
1010 f->data = alloc_symbol_cache(arg + 1, offset);
1011 if (f->data)
1012 f->fn = t->fetch[FETCH_MTD_symbol];
1013 }
1014 break;
1015 case '+': /* deref memory */
1016 arg++; /* Skip '+', because strict_strtol() rejects it. */
1017 case '-':
1018 tmp = strchr(arg, '(');
1019 if (!tmp)
1020 break;
1021 *tmp = '\0';
1022 ret = strict_strtol(arg, 0, &offset);
1023 if (ret)
1024 break;
1025 arg = tmp + 1;
1026 tmp = strrchr(arg, ')');
1027 if (tmp) {
1028 struct deref_fetch_param *dprm;
1029 const struct fetch_type *t2 = find_fetch_type(NULL);
1030 *tmp = '\0';
1031 dprm = kzalloc(sizeof(struct deref_fetch_param),
1032 GFP_KERNEL);
1033 if (!dprm)
1034 return -ENOMEM;
1035 dprm->offset = offset;
1036 ret = __parse_probe_arg(arg, t2, &dprm->orig,
1037 is_return);
1038 if (ret)
1039 kfree(dprm);
1040 else {
1041 f->fn = t->fetch[FETCH_MTD_deref];
1042 f->data = (void *)dprm;
1043 }
1044 }
1045 break;
1046 }
1047 if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
1048 pr_info("%s type has no corresponding fetch method.\n",
1049 t->name);
1050 ret = -EINVAL;
1051 }
1052 return ret;
1053}
1054
1055#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
1056
1057/* Bitfield type needs to be parsed into a fetch function */
1058static int __parse_bitfield_probe_arg(const char *bf,
1059 const struct fetch_type *t,
1060 struct fetch_param *f)
1061{
1062 struct bitfield_fetch_param *bprm;
1063 unsigned long bw, bo;
1064 char *tail;
1065
1066 if (*bf != 'b')
1067 return 0;
1068
1069 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1070 if (!bprm)
1071 return -ENOMEM;
1072 bprm->orig = *f;
1073 f->fn = t->fetch[FETCH_MTD_bitfield];
1074 f->data = (void *)bprm;
1075
1076 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
1077 if (bw == 0 || *tail != '@')
1078 return -EINVAL;
1079
1080 bf = tail + 1;
1081 bo = simple_strtoul(bf, &tail, 0);
1082 if (tail == bf || *tail != '/')
1083 return -EINVAL;
1084
1085 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
1086 bprm->low_shift = bprm->hi_shift + bo;
1087 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
1088}
1089
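A worked example of the shift arithmetic above, assuming the documented bitfield type syntax b<bit-width>@<bit-offset>/<container-size>: for "b4@8/32" the container is 32 bits, so hi_shift = 32 - (4 + 8) = 20 and low_shift = 20 + 8 = 28, which isolates bits 8..11 of the fetched word. A standalone sketch of just that arithmetic:

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
            /* "b4@8/32": 4-bit field at bit offset 8 in a 32-bit container */
            uint32_t buf = 0x00000a00;              /* the field holds 0xa */
            unsigned bw = 4, bo = 8, bits = 32;
            unsigned hi_shift  = bits - (bw + bo);  /* 20 */
            unsigned low_shift = hi_shift + bo;     /* 28 */

            buf <<= hi_shift;                       /* drop bits above the field */
            buf >>= low_shift;                      /* drop bits below the field */
            assert(buf == 0xa);
            return 0;
    }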
1090/* String length checking wrapper */
1091static int parse_probe_arg(char *arg, struct trace_probe *tp,
1092 struct probe_arg *parg, int is_return)
1093{
1094 const char *t;
1095 int ret;
1096
1097 if (strlen(arg) > MAX_ARGSTR_LEN) {
1098 pr_info("Argument is too long.: %s\n", arg);
1099 return -ENOSPC;
1100 }
1101 parg->comm = kstrdup(arg, GFP_KERNEL);
1102 if (!parg->comm) {
1103 pr_info("Failed to allocate memory for command '%s'.\n", arg);
1104 return -ENOMEM;
1105 }
1106 t = strchr(parg->comm, ':');
1107 if (t) {
1108 arg[t - parg->comm] = '\0';
1109 t++;
1110 }
1111 parg->type = find_fetch_type(t);
1112 if (!parg->type) {
1113 pr_info("Unsupported type: %s\n", t);
1114 return -EINVAL;
1115 }
1116 parg->offset = tp->size;
1117 tp->size += parg->type->size;
1118 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
1119 if (ret >= 0 && t != NULL)
1120 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
1121 if (ret >= 0) {
1122 parg->fetch_size.fn = get_fetch_size_function(parg->type,
1123 parg->fetch.fn);
1124 parg->fetch_size.data = parg->fetch.data;
1125 }
1126 return ret;
1127}
1128
1129/* Return 1 if name is reserved or already used by another argument */
1130static int conflict_field_name(const char *name,
1131 struct probe_arg *args, int narg)
1132{
1133 int i;
1134 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
1135 if (strcmp(reserved_field_names[i], name) == 0)
1136 return 1;
1137 for (i = 0; i < narg; i++)
1138 if (strcmp(args[i].name, name) == 0)
1139 return 1;
1140 return 0;
1141}
1142
356static int create_trace_probe(int argc, char **argv) 1143static int create_trace_probe(int argc, char **argv)
357{ 1144{
358 /* 1145 /*
@@ -375,7 +1162,7 @@ static int create_trace_probe(int argc, char **argv)
375 */ 1162 */
376 struct trace_probe *tp; 1163 struct trace_probe *tp;
377 int i, ret = 0; 1164 int i, ret = 0;
378 bool is_return = false, is_delete = false; 1165 int is_return = 0, is_delete = 0;
379 char *symbol = NULL, *event = NULL, *group = NULL; 1166 char *symbol = NULL, *event = NULL, *group = NULL;
380 char *arg; 1167 char *arg;
381 unsigned long offset = 0; 1168 unsigned long offset = 0;
@@ -384,11 +1171,11 @@ static int create_trace_probe(int argc, char **argv)
384 1171
385 /* argc must be >= 1 */ 1172 /* argc must be >= 1 */
386 if (argv[0][0] == 'p') 1173 if (argv[0][0] == 'p')
387 is_return = false; 1174 is_return = 0;
388 else if (argv[0][0] == 'r') 1175 else if (argv[0][0] == 'r')
389 is_return = true; 1176 is_return = 1;
390 else if (argv[0][0] == '-') 1177 else if (argv[0][0] == '-')
391 is_delete = true; 1178 is_delete = 1;
392 else { 1179 else {
393 pr_info("Probe definition must be started with 'p', 'r' or" 1180 pr_info("Probe definition must be started with 'p', 'r' or"
394 " '-'.\n"); 1181 " '-'.\n");
@@ -444,7 +1231,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 1231 return -EINVAL;
445 } 1232 }
446 /* an address specified */ 1233 /* an address specified */
447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); 1234 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 1235 if (ret) {
449 pr_info("Failed to parse address.\n"); 1236 pr_info("Failed to parse address.\n");
450 return ret; 1237 return ret;
@@ -453,7 +1240,7 @@ static int create_trace_probe(int argc, char **argv)
453 /* a symbol specified */ 1240 /* a symbol specified */
454 symbol = argv[1]; 1241 symbol = argv[1];
455 /* TODO: support .init module functions */ 1242 /* TODO: support .init module functions */
456 ret = traceprobe_split_symbol_offset(symbol, &offset); 1243 ret = split_symbol_offset(symbol, &offset);
457 if (ret) { 1244 if (ret) {
458 pr_info("Failed to parse symbol.\n"); 1245 pr_info("Failed to parse symbol.\n");
459 return ret; 1246 return ret;
@@ -515,8 +1302,7 @@ static int create_trace_probe(int argc, char **argv)
515 goto error; 1302 goto error;
516 } 1303 }
517 1304
518 if (traceprobe_conflict_field_name(tp->args[i].name, 1305 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
519 tp->args, i)) {
520 pr_info("Argument[%d] name '%s' conflicts with " 1306 pr_info("Argument[%d] name '%s' conflicts with "
521 "another field.\n", i, argv[i]); 1307 "another field.\n", i, argv[i]);
522 ret = -EINVAL; 1308 ret = -EINVAL;
@@ -524,8 +1310,7 @@ static int create_trace_probe(int argc, char **argv)
524 } 1310 }
525 1311
526 /* Parse fetch argument */ 1312 /* Parse fetch argument */
527 ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], 1313 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
528 is_return, true);
529 if (ret) { 1314 if (ret) {
530 pr_info("Parse error at argument[%d]. (%d)\n", i, ret); 1315 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
531 goto error; 1316 goto error;
@@ -627,11 +1412,70 @@ static int probes_open(struct inode *inode, struct file *file)
627 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
628} 1413}
629 1414
1415static int command_trace_probe(const char *buf)
1416{
1417 char **argv;
1418 int argc = 0, ret = 0;
1419
1420 argv = argv_split(GFP_KERNEL, buf, &argc);
1421 if (!argv)
1422 return -ENOMEM;
1423
1424 if (argc)
1425 ret = create_trace_probe(argc, argv);
1426
1427 argv_free(argv);
1428 return ret;
1429}
1430
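command_trace_probe() is what ultimately services writes to the kprobe_events control file, and the strings it accepts are the same ones the self-test further down feeds it: %REG for registers, @ADDR or @SYM[+offs] for memory, $stack/$stackN/$retval for the special variables handled earlier, and +|-OFFS(ARG) for a dereference. Typical usage from userspace would look roughly like this (probe names are illustrative, the path assumes debugfs is mounted at /sys/kernel/debug, syntax as in Documentation/trace/kprobetrace.txt):

    echo 'p:myprobe do_sys_open $stack $stack0 +0($stack)' > /sys/kernel/debug/tracing/kprobe_events
    echo 'r:myretprobe do_sys_open $retval' >> /sys/kernel/debug/tracing/kprobe_events
    echo '-:myprobe' >> /sys/kernel/debug/tracing/kprobe_events    # delete the probe again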
1431#define WRITE_BUFSIZE 4096
1432
630static ssize_t probes_write(struct file *file, const char __user *buffer, 1433static ssize_t probes_write(struct file *file, const char __user *buffer,
631 size_t count, loff_t *ppos) 1434 size_t count, loff_t *ppos)
632{ 1435{
633 return traceprobe_probes_write(file, buffer, count, ppos, 1436 char *kbuf, *tmp;
634 create_trace_probe); 1437 int ret;
1438 size_t done;
1439 size_t size;
1440
1441 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
1442 if (!kbuf)
1443 return -ENOMEM;
1444
1445 ret = done = 0;
1446 while (done < count) {
1447 size = count - done;
1448 if (size >= WRITE_BUFSIZE)
1449 size = WRITE_BUFSIZE - 1;
1450 if (copy_from_user(kbuf, buffer + done, size)) {
1451 ret = -EFAULT;
1452 goto out;
1453 }
1454 kbuf[size] = '\0';
1455 tmp = strchr(kbuf, '\n');
1456 if (tmp) {
1457 *tmp = '\0';
1458 size = tmp - kbuf + 1;
1459 } else if (done + size < count) {
1460 pr_warning("Line length is too long: "
1461 "Should be less than %d.", WRITE_BUFSIZE);
1462 ret = -EINVAL;
1463 goto out;
1464 }
1465 done += size;
1466 /* Remove comments */
1467 tmp = strchr(kbuf, '#');
1468 if (tmp)
1469 *tmp = '\0';
1470
1471 ret = command_trace_probe(kbuf);
1472 if (ret)
1473 goto out;
1474 }
1475 ret = done;
1476out:
1477 kfree(kbuf);
1478 return ret;
635} 1479}
636 1480
637static const struct file_operations kprobe_events_ops = { 1481static const struct file_operations kprobe_events_ops = {
@@ -751,8 +1595,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1595 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 1596
753 if (!filter_current_check_discard(buffer, call, entry, event)) 1597 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_buffer_unlock_commit_regs(buffer, event, 1598 trace_nowake_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 1599 irq_flags, pc, regs);
756} 1600}
757 1601
758/* Kretprobe handler */ 1602/* Kretprobe handler */
@@ -784,8 +1628,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1628 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 1629
786 if (!filter_current_check_discard(buffer, call, entry, event)) 1630 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_buffer_unlock_commit_regs(buffer, event, 1631 trace_nowake_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 1632 irq_flags, pc, regs);
789} 1633}
790 1634
791/* Event entry printers */ 1635/* Event entry printers */
@@ -867,6 +1711,16 @@ partial:
867 return TRACE_TYPE_PARTIAL_LINE; 1711 return TRACE_TYPE_PARTIAL_LINE;
868} 1712}
869 1713
1714#undef DEFINE_FIELD
1715#define DEFINE_FIELD(type, item, name, is_signed) \
1716 do { \
1717 ret = trace_define_field(event_call, #type, name, \
1718 offsetof(typeof(field), item), \
1719 sizeof(field.item), is_signed, \
1720 FILTER_OTHER); \
1721 if (ret) \
1722 return ret; \
1723 } while (0)
870 1724
871static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1725static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
872{ 1726{
@@ -1002,8 +1856,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1856 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1857
1004 head = this_cpu_ptr(call->perf_events); 1858 head = this_cpu_ptr(call->perf_events);
1005 perf_trace_buf_submit(entry, size, rctx, 1859 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1006 entry->ip, 1, regs, head, NULL);
1007} 1860}
1008 1861
1009/* Kretprobe profile handler */ 1862/* Kretprobe profile handler */
@@ -1034,14 +1887,12 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1034 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1887 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1035 1888
1036 head = this_cpu_ptr(call->perf_events); 1889 head = this_cpu_ptr(call->perf_events);
1037 perf_trace_buf_submit(entry, size, rctx, 1890 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1038 entry->ret_ip, 1, regs, head, NULL);
1039} 1891}
1040#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1041 1893
1042static __kprobes 1894static __kprobes
1043int kprobe_register(struct ftrace_event_call *event, 1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1044 enum trace_reg type, void *data)
1045{ 1896{
1046 struct trace_probe *tp = (struct trace_probe *)event->data; 1897 struct trace_probe *tp = (struct trace_probe *)event->data;
1047 1898
@@ -1058,11 +1909,6 @@ int kprobe_register(struct ftrace_event_call *event,
1058 case TRACE_REG_PERF_UNREGISTER: 1909 case TRACE_REG_PERF_UNREGISTER:
1059 disable_trace_probe(tp, TP_FLAG_PROFILE); 1910 disable_trace_probe(tp, TP_FLAG_PROFILE);
1060 return 0; 1911 return 0;
1061 case TRACE_REG_PERF_OPEN:
1062 case TRACE_REG_PERF_CLOSE:
1063 case TRACE_REG_PERF_ADD:
1064 case TRACE_REG_PERF_DEL:
1065 return 0;
1066#endif 1912#endif
1067 } 1913 }
1068 return 0; 1914 return 0;
@@ -1199,9 +2045,8 @@ static __init int kprobe_trace_self_tests_init(void)
1199 2045
1200 pr_info("Testing kprobe tracing: "); 2046 pr_info("Testing kprobe tracing: ");
1201 2047
1202 ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " 2048 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1203 "$stack $stack0 +0($stack)", 2049 "$stack $stack0 +0($stack)");
1204 create_trace_probe);
1205 if (WARN_ON_ONCE(ret)) { 2050 if (WARN_ON_ONCE(ret)) {
1206 pr_warning("error on probing function entry.\n"); 2051 pr_warning("error on probing function entry.\n");
1207 warn++; 2052 warn++;
@@ -1215,8 +2060,8 @@ static __init int kprobe_trace_self_tests_init(void)
1215 enable_trace_probe(tp, TP_FLAG_TRACE); 2060 enable_trace_probe(tp, TP_FLAG_TRACE);
1216 } 2061 }
1217 2062
1218 ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " 2063 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1219 "$retval", create_trace_probe); 2064 "$retval");
1220 if (WARN_ON_ONCE(ret)) { 2065 if (WARN_ON_ONCE(ret)) {
1221 pr_warning("error on probing function return.\n"); 2066 pr_warning("error on probing function return.\n");
1222 warn++; 2067 warn++;
@@ -1250,13 +2095,13 @@ static __init int kprobe_trace_self_tests_init(void)
1250 } else 2095 } else
1251 disable_trace_probe(tp, TP_FLAG_TRACE); 2096 disable_trace_probe(tp, TP_FLAG_TRACE);
1252 2097
1253 ret = traceprobe_command("-:testprobe", create_trace_probe); 2098 ret = command_trace_probe("-:testprobe");
1254 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
1255 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
1256 warn++; 2101 warn++;
1257 } 2102 }
1258 2103
1259 ret = traceprobe_command("-:testprobe2", create_trace_probe); 2104 ret = command_trace_probe("-:testprobe2");
1260 if (WARN_ON_ONCE(ret)) { 2105 if (WARN_ON_ONCE(ret)) {
1261 pr_warning("error on deleting a probe.\n"); 2106 pr_warning("error on deleting a probe.\n");
1262 warn++; 2107 warn++;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc..51999309a6c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, const struct path *path) 267int trace_seq_path(struct trace_seq *s, struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
300 unsigned long mask; 300 unsigned long mask;
301 const char *str; 301 const char *str;
302 const char *ret = p->buffer + p->len; 302 const char *ret = p->buffer + p->len;
303 int i, first = 1; 303 int i;
304 304
305 for (i = 0; flag_array[i].name && flags; i++) { 305 for (i = 0; flag_array[i].name && flags; i++) {
306 306
@@ -310,16 +310,14 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
310 310
311 str = flag_array[i].name; 311 str = flag_array[i].name;
312 flags &= ~mask; 312 flags &= ~mask;
313 if (!first && delim) 313 if (p->len && delim)
314 trace_seq_puts(p, delim); 314 trace_seq_puts(p, delim);
315 else
316 first = 0;
317 trace_seq_puts(p, str); 315 trace_seq_puts(p, str);
318 } 316 }
319 317
320 /* check for left over flags */ 318 /* check for left over flags */
321 if (flags) { 319 if (flags) {
322 if (!first && delim) 320 if (p->len && delim)
323 trace_seq_puts(p, delim); 321 trace_seq_puts(p, delim);
324 trace_seq_printf(p, "0x%lx", flags); 322 trace_seq_printf(p, "0x%lx", flags);
325 } 323 }
@@ -346,7 +344,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
346 break; 344 break;
347 } 345 }
348 346
349 if (ret == (const char *)(p->buffer + p->len)) 347 if (!p->len)
350 trace_seq_printf(p, "0x%lx", val); 348 trace_seq_printf(p, "0x%lx", val);
351 349
352 trace_seq_putc(p, 0); 350 trace_seq_putc(p, 0);
@@ -372,7 +370,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
372 break; 370 break;
373 } 371 }
374 372
375 if (ret == (const char *)(p->buffer + p->len)) 373 if (!p->len)
376 trace_seq_printf(p, "0x%llx", val); 374 trace_seq_printf(p, "0x%llx", val);
377 375
378 trace_seq_putc(p, 0); 376 trace_seq_putc(p, 0);
@@ -610,113 +608,68 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 608 return trace_print_lat_fmt(s, entry);
611} 609}
612 610
613static unsigned long preempt_mark_thresh_us = 100; 611static unsigned long preempt_mark_thresh = 100;
614 612
615static int 613static int
616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) 614lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
615 unsigned long rel_usecs)
617{ 616{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 617 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 618 rel_usecs > preempt_mark_thresh ? '!' :
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 619 rel_usecs > 1 ? '+' : ' ');
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
653} 620}
654 621
655int trace_print_context(struct trace_iterator *iter) 622int trace_print_context(struct trace_iterator *iter)
656{ 623{
657 struct trace_seq *s = &iter->seq; 624 struct trace_seq *s = &iter->seq;
658 struct trace_entry *entry = iter->ent; 625 struct trace_entry *entry = iter->ent;
659 unsigned long long t; 626 unsigned long long t = ns2usecs(iter->ts);
660 unsigned long secs, usec_rem; 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t;
661 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
662 int ret;
663 630
664 trace_find_cmdline(entry->pid, comm); 631 trace_find_cmdline(entry->pid, comm);
665 632
666 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", 633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
667 comm, entry->pid, iter->cpu); 634 comm, entry->pid, iter->cpu, secs, usec_rem);
668 if (!ret)
669 return 0;
670
671 if (trace_flags & TRACE_ITER_IRQ_INFO) {
672 ret = trace_print_lat_fmt(s, entry);
673 if (!ret)
674 return 0;
675 }
676
677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
684} 635}
685 636
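The post-patch trace_print_context() (the second column in this view) formats the timestamp by converting nanoseconds to microseconds and splitting off whole seconds with do_div(). A userspace sketch of the same arithmetic (the sample timestamp is made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t ts_ns = 1234567890123ULL;       /* made-up trace timestamp */
            uint64_t t = ts_ns / 1000;               /* ns2usecs() */
            unsigned long usec_rem = t % 1000000;    /* do_div(t, USEC_PER_SEC) remainder */
            unsigned long secs = (unsigned long)(t / 1000000);

            printf("%5lu.%06lu\n", secs, usec_rem);  /* prints " 1234.567890" */
            return 0;
    }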
686int trace_print_lat_context(struct trace_iterator *iter) 637int trace_print_lat_context(struct trace_iterator *iter)
687{ 638{
688 u64 next_ts; 639 u64 next_ts;
689 int ret; 640 int ret;
690 /* trace_find_next_entry will reset ent_size */
691 int ent_size = iter->ent_size;
692 struct trace_seq *s = &iter->seq; 641 struct trace_seq *s = &iter->seq;
693 struct trace_entry *entry = iter->ent, 642 struct trace_entry *entry = iter->ent,
694 *next_entry = trace_find_next_entry(iter, NULL, 643 *next_entry = trace_find_next_entry(iter, NULL,
695 &next_ts); 644 &next_ts);
696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 645 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
697 646 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
698 /* Restore the original ent_size */ 647 unsigned long rel_usecs;
699 iter->ent_size = ent_size;
700 648
701 if (!next_entry) 649 if (!next_entry)
702 next_ts = iter->ts; 650 next_ts = iter->ts;
651 rel_usecs = ns2usecs(next_ts - iter->ts);
703 652
704 if (verbose) { 653 if (verbose) {
705 char comm[TASK_COMM_LEN]; 654 char comm[TASK_COMM_LEN];
706 655
707 trace_find_cmdline(entry->pid, comm); 656 trace_find_cmdline(entry->pid, comm);
708 657
709 ret = trace_seq_printf( 658 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
710 s, "%16s %5d %3d %d %08x %08lx ", 659 " %ld.%03ldms (+%ld.%03ldms): ", comm,
711 comm, entry->pid, iter->cpu, entry->flags, 660 entry->pid, iter->cpu, entry->flags,
712 entry->preempt_count, iter->idx); 661 entry->preempt_count, iter->idx,
662 ns2usecs(iter->ts),
663 abs_usecs / USEC_PER_MSEC,
664 abs_usecs % USEC_PER_MSEC,
665 rel_usecs / USEC_PER_MSEC,
666 rel_usecs % USEC_PER_MSEC);
713 } else { 667 } else {
714 ret = lat_print_generic(s, entry, iter->cpu); 668 ret = lat_print_generic(s, entry, iter->cpu);
669 if (ret)
670 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
715 } 671 }
716 672
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
720 return ret; 673 return ret;
721} 674}
722 675
@@ -1353,4 +1306,4 @@ __init static int init_events(void)
1353 1306
1354 return 0; 1307 return 0;
1355} 1308}
1356early_initcall(init_events); 1309device_initcall(init_events);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad..1f06468a10d 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -51,10 +51,6 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
51 const char **iter; 51 const char **iter;
52 char *fmt; 52 char *fmt;
53 53
54 /* allocate the trace_printk per cpu buffers */
55 if (start != end)
56 trace_printk_init_buffers();
57
58 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
59 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
60 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); 56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
@@ -63,19 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
63 continue; 59 continue;
64 } 60 }
65 61
66 fmt = NULL;
67 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
68 if (tb_fmt) { 63 if (tb_fmt)
69 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
70 if (fmt) { 65 if (tb_fmt && fmt) {
71 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
72 strcpy(fmt, *iter); 67 strcpy(fmt, *iter);
73 tb_fmt->fmt = fmt; 68 tb_fmt->fmt = fmt;
74 } else 69 *iter = tb_fmt->fmt;
75 kfree(tb_fmt); 70 } else {
71 kfree(tb_fmt);
72 *iter = NULL;
76 } 73 }
77 *iter = fmt;
78
79 } 74 }
80 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
81} 76}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
deleted file mode 100644
index 412e959709b..00000000000
--- a/kernel/trace/trace_probe.c
+++ /dev/null
@@ -1,839 +0,0 @@
1/*
2 * Common code for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.c written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include "trace_probe.h"
26
27const char *reserved_field_names[] = {
28 "common_type",
29 "common_flags",
30 "common_preempt_count",
31 "common_pid",
32 "common_tgid",
33 FIELD_STRING_IP,
34 FIELD_STRING_RETIP,
35 FIELD_STRING_FUNC,
36};
37
38/* Printing function type */
39#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
40#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
41
42/* Printing in basic type function template */
43#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
44static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
45 const char *name, \
46 void *data, void *ent)\
47{ \
48 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
49} \
50static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
51
52DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
53DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
54DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
55DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
56DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
57DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
58DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
59DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
60
61static inline void *get_rloc_data(u32 *dl)
62{
63 return (u8 *)dl + get_rloc_offs(*dl);
64}
65
66/* For data_loc conversion */
67static inline void *get_loc_data(u32 *dl, void *ent)
68{
69 return (u8 *)ent + get_rloc_offs(*dl);
70}
71
72/* For defining macros, define string/string_size types */
73typedef u32 string;
74typedef u32 string_size;
75
76/* Print type function for string type */
77static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
78 const char *name,
79 void *data, void *ent)
80{
81 int len = *(u32 *)data >> 16;
82
83 if (!len)
84 return trace_seq_printf(s, " %s=(fault)", name);
85 else
86 return trace_seq_printf(s, " %s=\"%s\"", name,
87 (const char *)get_loc_data(data, ent));
88}
89
90static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
91
92#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
93/*
94 * Define macro for basic types - we don't need to define s* types, because
95 * we have to care only about bitwidth at recording time.
96 */
97#define DEFINE_BASIC_FETCH_FUNCS(method) \
98DEFINE_FETCH_##method(u8) \
99DEFINE_FETCH_##method(u16) \
100DEFINE_FETCH_##method(u32) \
101DEFINE_FETCH_##method(u64)
102
103#define CHECK_FETCH_FUNCS(method, fn) \
104 (((FETCH_FUNC_NAME(method, u8) == fn) || \
105 (FETCH_FUNC_NAME(method, u16) == fn) || \
106 (FETCH_FUNC_NAME(method, u32) == fn) || \
107 (FETCH_FUNC_NAME(method, u64) == fn) || \
108 (FETCH_FUNC_NAME(method, string) == fn) || \
109 (FETCH_FUNC_NAME(method, string_size) == fn)) \
110 && (fn != NULL))
111
112/* Data fetch function templates */
113#define DEFINE_FETCH_reg(type) \
114static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
115 void *offset, void *dest) \
116{ \
117 *(type *)dest = (type)regs_get_register(regs, \
118 (unsigned int)((unsigned long)offset)); \
119}
120DEFINE_BASIC_FETCH_FUNCS(reg)
121/* No string on the register */
122#define fetch_reg_string NULL
123#define fetch_reg_string_size NULL
124
125#define DEFINE_FETCH_stack(type) \
126static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
127 void *offset, void *dest) \
128{ \
129 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
130 (unsigned int)((unsigned long)offset)); \
131}
132DEFINE_BASIC_FETCH_FUNCS(stack)
133/* No string on the stack entry */
134#define fetch_stack_string NULL
135#define fetch_stack_string_size NULL
136
137#define DEFINE_FETCH_retval(type) \
138static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
139 void *dummy, void *dest) \
140{ \
141 *(type *)dest = (type)regs_return_value(regs); \
142}
143DEFINE_BASIC_FETCH_FUNCS(retval)
144/* No string on the retval */
145#define fetch_retval_string NULL
146#define fetch_retval_string_size NULL
147
148#define DEFINE_FETCH_memory(type) \
149static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
150 void *addr, void *dest) \
151{ \
152 type retval; \
153 if (probe_kernel_address(addr, retval)) \
154 *(type *)dest = 0; \
155 else \
156 *(type *)dest = retval; \
157}
158DEFINE_BASIC_FETCH_FUNCS(memory)
159/*
160 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
161 * length and relative data location.
162 */
163static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
164 void *addr, void *dest)
165{
166 long ret;
167 int maxlen = get_rloc_len(*(u32 *)dest);
168 u8 *dst = get_rloc_data(dest);
169 u8 *src = addr;
170 mm_segment_t old_fs = get_fs();
171
172 if (!maxlen)
173 return;
174
175 /*
176 * Try to fetch the string again (it was already read once to get its
177 * size), since it can change while we are probing.
178 */
179 set_fs(KERNEL_DS);
180 pagefault_disable();
181
182 do
183 ret = __copy_from_user_inatomic(dst++, src++, 1);
184 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
185
186 dst[-1] = '\0';
187 pagefault_enable();
188 set_fs(old_fs);
189
190 if (ret < 0) { /* Failed to fetch string */
191 ((u8 *)get_rloc_data(dest))[0] = '\0';
192 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
193 } else {
194 *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
195 get_rloc_offs(*(u32 *)dest));
196 }
197}
198
199/* Return the length of the string, including the terminating null byte */
200static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
201 void *addr, void *dest)
202{
203 mm_segment_t old_fs;
204 int ret, len = 0;
205 u8 c;
206
207 old_fs = get_fs();
208 set_fs(KERNEL_DS);
209 pagefault_disable();
210
211 do {
212 ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
213 len++;
214 } while (c && ret == 0 && len < MAX_STRING_SIZE);
215
216 pagefault_enable();
217 set_fs(old_fs);
218
219 if (ret < 0) /* Failed to check the length */
220 *(u32 *)dest = 0;
221 else
222 *(u32 *)dest = len;
223}
224
225/* Memory fetching by symbol */
226struct symbol_cache {
227 char *symbol;
228 long offset;
229 unsigned long addr;
230};
231
232static unsigned long update_symbol_cache(struct symbol_cache *sc)
233{
234 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
235
236 if (sc->addr)
237 sc->addr += sc->offset;
238
239 return sc->addr;
240}
241
242static void free_symbol_cache(struct symbol_cache *sc)
243{
244 kfree(sc->symbol);
245 kfree(sc);
246}
247
248static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
249{
250 struct symbol_cache *sc;
251
252 if (!sym || strlen(sym) == 0)
253 return NULL;
254
255 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
256 if (!sc)
257 return NULL;
258
259 sc->symbol = kstrdup(sym, GFP_KERNEL);
260 if (!sc->symbol) {
261 kfree(sc);
262 return NULL;
263 }
264 sc->offset = offset;
265 update_symbol_cache(sc);
266
267 return sc;
268}
269
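/*
 * Illustrative usage sketch (hypothetical symbol, error handling trimmed,
 * not part of the original file): the address is resolved once at parse
 * time and can be refreshed later with update_symbol_cache(), e.g. when
 * new symbols appear after a module load.
 */
static __maybe_unused int example_symbol_fetch_setup(struct fetch_param *f,
						     const struct fetch_type *t)
{
	struct symbol_cache *sc = alloc_symbol_cache("jiffies", 0); /* "@jiffies+0" */

	if (!sc)
		return -ENOMEM;

	f->fn = t->fetch[FETCH_MTD_symbol];	/* fetch_symbol_<width> */
	f->data = sc;				/* released via free_symbol_cache() */
	return 0;
}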
270#define DEFINE_FETCH_symbol(type) \
271static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
272 void *data, void *dest) \
273{ \
274 struct symbol_cache *sc = data; \
275 if (sc->addr) \
276 fetch_memory_##type(regs, (void *)sc->addr, dest); \
277 else \
278 *(type *)dest = 0; \
279}
280DEFINE_BASIC_FETCH_FUNCS(symbol)
281DEFINE_FETCH_symbol(string)
282DEFINE_FETCH_symbol(string_size)
283
284/* Dereference memory access function */
285struct deref_fetch_param {
286 struct fetch_param orig;
287 long offset;
288};
289
290#define DEFINE_FETCH_deref(type) \
291static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
292 void *data, void *dest) \
293{ \
294 struct deref_fetch_param *dprm = data; \
295 unsigned long addr; \
296 call_fetch(&dprm->orig, regs, &addr); \
297 if (addr) { \
298 addr += dprm->offset; \
299 fetch_memory_##type(regs, (void *)addr, dest); \
300 } else \
301 *(type *)dest = 0; \
302}
303DEFINE_BASIC_FETCH_FUNCS(deref)
304DEFINE_FETCH_deref(string)
305DEFINE_FETCH_deref(string_size)
306
307static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
308{
309 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
310 update_deref_fetch_param(data->orig.data);
311 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
312 update_symbol_cache(data->orig.data);
313}
314
315static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
316{
317 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
318 free_deref_fetch_param(data->orig.data);
319 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
320 free_symbol_cache(data->orig.data);
321 kfree(data);
322}
323
324/* Bitfield fetch function */
325struct bitfield_fetch_param {
326 struct fetch_param orig;
327 unsigned char hi_shift;
328 unsigned char low_shift;
329};
330
331#define DEFINE_FETCH_bitfield(type) \
332static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
333 void *data, void *dest) \
334{ \
335 struct bitfield_fetch_param *bprm = data; \
336 type buf = 0; \
337 call_fetch(&bprm->orig, regs, &buf); \
338 if (buf) { \
339 buf <<= bprm->hi_shift; \
340 buf >>= bprm->low_shift; \
341 } \
342 *(type *)dest = buf; \
343}
344
345DEFINE_BASIC_FETCH_FUNCS(bitfield)
346#define fetch_bitfield_string NULL
347#define fetch_bitfield_string_size NULL
348
349static __kprobes void
350update_bitfield_fetch_param(struct bitfield_fetch_param *data)
351{
352 /*
353 * Don't check the bitfield itself, because this must be the
354 * last fetch function.
355 */
356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
357 update_deref_fetch_param(data->orig.data);
358 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
359 update_symbol_cache(data->orig.data);
360}
361
362static __kprobes void
363free_bitfield_fetch_param(struct bitfield_fetch_param *data)
364{
365 /*
366 * Don't check the bitfield itself, because this must be the
367 * last fetch function.
368 */
369 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
370 free_deref_fetch_param(data->orig.data);
371 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
372 free_symbol_cache(data->orig.data);
373
374 kfree(data);
375}
376
377/* Default (unsigned long) fetch type */
378#define __DEFAULT_FETCH_TYPE(t) u##t
379#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
380#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
381#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
382
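/*
 * Illustrative expansion on a 64-bit kernel (BITS_PER_LONG == 64):
 *
 *	DEFAULT_FETCH_TYPE     -> _DEFAULT_FETCH_TYPE(64) -> u64
 *	DEFAULT_FETCH_TYPE_STR -> "u64"
 *
 * so an argument written without an explicit ":type" suffix is fetched and
 * printed as a register-sized unsigned value (see find_fetch_type() below).
 */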
383#define ASSIGN_FETCH_FUNC(method, type) \
384 [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
385
386#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
387 {.name = _name, \
388 .size = _size, \
389 .is_signed = sign, \
390 .print = PRINT_TYPE_FUNC_NAME(ptype), \
391 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
392 .fmttype = _fmttype, \
393 .fetch = { \
394ASSIGN_FETCH_FUNC(reg, ftype), \
395ASSIGN_FETCH_FUNC(stack, ftype), \
396ASSIGN_FETCH_FUNC(retval, ftype), \
397ASSIGN_FETCH_FUNC(memory, ftype), \
398ASSIGN_FETCH_FUNC(symbol, ftype), \
399ASSIGN_FETCH_FUNC(deref, ftype), \
400ASSIGN_FETCH_FUNC(bitfield, ftype), \
401 } \
402 }
403
404#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
405 __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
406
407#define FETCH_TYPE_STRING 0
408#define FETCH_TYPE_STRSIZE 1
409
410/* Fetch type information table */
411static const struct fetch_type fetch_type_table[] = {
412 /* Special types */
413 [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
414 sizeof(u32), 1, "__data_loc char[]"),
415 [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
416 string_size, sizeof(u32), 0, "u32"),
417 /* Basic types */
418 ASSIGN_FETCH_TYPE(u8, u8, 0),
419 ASSIGN_FETCH_TYPE(u16, u16, 0),
420 ASSIGN_FETCH_TYPE(u32, u32, 0),
421 ASSIGN_FETCH_TYPE(u64, u64, 0),
422 ASSIGN_FETCH_TYPE(s8, u8, 1),
423 ASSIGN_FETCH_TYPE(s16, u16, 1),
424 ASSIGN_FETCH_TYPE(s32, u32, 1),
425 ASSIGN_FETCH_TYPE(s64, u64, 1),
426};
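/*
 * Illustrative expansion of one entry above, ASSIGN_FETCH_TYPE(s16, u16, 1):
 *
 *	{ .name = "s16", .size = sizeof(u16), .is_signed = 1,
 *	  .print = PRINT_TYPE_FUNC_NAME(s16), .fmt = PRINT_TYPE_FMT_NAME(s16),
 *	  .fmttype = "s16",
 *	  .fetch = {
 *		[FETCH_MTD_reg]      = fetch_reg_u16,
 *		[FETCH_MTD_stack]    = fetch_stack_u16,
 *		[FETCH_MTD_retval]   = fetch_retval_u16,
 *		[FETCH_MTD_memory]   = fetch_memory_u16,
 *		[FETCH_MTD_symbol]   = fetch_symbol_u16,
 *		[FETCH_MTD_deref]    = fetch_deref_u16,
 *		[FETCH_MTD_bitfield] = fetch_bitfield_u16,
 *	  } }
 *
 * i.e. the value is recorded with u16 width but printed through the s16
 * print function with its "%d" format.
 */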
427
428static const struct fetch_type *find_fetch_type(const char *type)
429{
430 int i;
431
432 if (!type)
433 type = DEFAULT_FETCH_TYPE_STR;
434
435 /* Special case: bitfield */
436 if (*type == 'b') {
437 unsigned long bs;
438
439 type = strchr(type, '/');
440 if (!type)
441 goto fail;
442
443 type++;
444 if (kstrtoul(type, 0, &bs))
445 goto fail;
446
447 switch (bs) {
448 case 8:
449 return find_fetch_type("u8");
450 case 16:
451 return find_fetch_type("u16");
452 case 32:
453 return find_fetch_type("u32");
454 case 64:
455 return find_fetch_type("u64");
456 default:
457 goto fail;
458 }
459 }
460
461 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
462 if (strcmp(type, fetch_type_table[i].name) == 0)
463 return &fetch_type_table[i];
464
465fail:
466 return NULL;
467}
468
469/* Special function: only accepts unsigned long */
470static __kprobes void fetch_stack_address(struct pt_regs *regs,
471 void *dummy, void *dest)
472{
473 *(unsigned long *)dest = kernel_stack_pointer(regs);
474}
475
476static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
477 fetch_func_t orig_fn)
478{
479 int i;
480
481 if (type != &fetch_type_table[FETCH_TYPE_STRING])
482 return NULL; /* Only string type needs size function */
483
484 for (i = 0; i < FETCH_MTD_END; i++)
485 if (type->fetch[i] == orig_fn)
486 return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
487
488 WARN_ON(1); /* This should not happen */
489
490 return NULL;
491}
492
493/* Split symbol and offset. */
494int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
495{
496 char *tmp;
497 int ret;
498
499 if (!offset)
500 return -EINVAL;
501
502 tmp = strchr(symbol, '+');
503 if (tmp) {
504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret)
507 return ret;
508
509 *tmp = '\0';
510 } else
511 *offset = 0;
512
513 return 0;
514}
515
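/*
 * Illustrative behaviour (hypothetical inputs):
 *
 *	"vfs_read+0x10" -> symbol "vfs_read", *offset = 0x10, returns 0
 *	"do_fork"       -> symbol "do_fork",  *offset = 0,    returns 0
 *	"foo+bar"       -> kstrtoul() fails and its error is returned
 *
 * Note that the '+' is overwritten with '\0' in place, so the caller must
 * pass a writable buffer.
 */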
516#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
517
518static int parse_probe_vars(char *arg, const struct fetch_type *t,
519 struct fetch_param *f, bool is_return)
520{
521 int ret = 0;
522 unsigned long param;
523
524 if (strcmp(arg, "retval") == 0) {
525 if (is_return)
526 f->fn = t->fetch[FETCH_MTD_retval];
527 else
528 ret = -EINVAL;
529 } else if (strncmp(arg, "stack", 5) == 0) {
530 if (arg[5] == '\0') {
531 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
532 f->fn = fetch_stack_address;
533 else
534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) {
536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL;
539 else {
540 f->fn = t->fetch[FETCH_MTD_stack];
541 f->data = (void *)param;
542 }
543 } else
544 ret = -EINVAL;
545 } else
546 ret = -EINVAL;
547
548 return ret;
549}
550
551/* Recursive argument parser */
552static int parse_probe_arg(char *arg, const struct fetch_type *t,
553 struct fetch_param *f, bool is_return, bool is_kprobe)
554{
555 unsigned long param;
556 long offset;
557 char *tmp;
558 int ret;
559
560 ret = 0;
561
562	/* For now, uprobe_events supports only register arguments */
563 if (!is_kprobe && arg[0] != '%')
564 return -EINVAL;
565
566 switch (arg[0]) {
567 case '$':
568 ret = parse_probe_vars(arg + 1, t, f, is_return);
569 break;
570
571 case '%': /* named register */
572 ret = regs_query_register_offset(arg + 1);
573 if (ret >= 0) {
574 f->fn = t->fetch[FETCH_MTD_reg];
575 f->data = (void *)(unsigned long)ret;
576 ret = 0;
577 }
578 break;
579
580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) {
582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret)
584 break;
585
586 f->fn = t->fetch[FETCH_MTD_memory];
587 f->data = (void *)param;
588 } else {
589 ret = traceprobe_split_symbol_offset(arg + 1, &offset);
590 if (ret)
591 break;
592
593 f->data = alloc_symbol_cache(arg + 1, offset);
594 if (f->data)
595 f->fn = t->fetch[FETCH_MTD_symbol];
596 }
597 break;
598
599 case '+': /* deref memory */
600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-':
602 tmp = strchr(arg, '(');
603 if (!tmp)
604 break;
605
606 *tmp = '\0';
607 ret = kstrtol(arg, 0, &offset);
608
609 if (ret)
610 break;
611
612 arg = tmp + 1;
613 tmp = strrchr(arg, ')');
614
615 if (tmp) {
616 struct deref_fetch_param *dprm;
617 const struct fetch_type *t2;
618
619 t2 = find_fetch_type(NULL);
620 *tmp = '\0';
621 dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
622
623 if (!dprm)
624 return -ENOMEM;
625
626 dprm->offset = offset;
627 ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
628 is_kprobe);
629 if (ret)
630 kfree(dprm);
631 else {
632 f->fn = t->fetch[FETCH_MTD_deref];
633 f->data = (void *)dprm;
634 }
635 }
636 break;
637 }
638	if (!ret && !f->fn) {	/* Parsed, but no fetch method was found */
639 pr_info("%s type has no corresponding fetch method.\n", t->name);
640 ret = -EINVAL;
641 }
642
643 return ret;
644}
645
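/*
 * Illustrative sketch only (not part of the original file): how a caller
 * could drive the parser above for one kprobe-style argument string such
 * as "+8(%sp):u32" or "@jiffies:s64".  The leading character selects the
 * fetch method: '$' probe variables, '%' register, '@' memory address or
 * symbol, '+'/'-' dereference with offset.
 */
static __maybe_unused int example_parse_one_arg(char *arg, bool is_return,
						bool is_kprobe)
{
	struct fetch_param f = {};
	const struct fetch_type *t;
	char *type_str = strchr(arg, ':');

	if (type_str)
		*type_str++ = '\0';		/* split "value:type" */

	t = find_fetch_type(type_str);		/* NULL -> DEFAULT_FETCH_TYPE */
	if (!t)
		return -EINVAL;

	return parse_probe_arg(arg, t, &f, is_return, is_kprobe);
}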
646#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
647
648/* Bitfield type needs to be parsed into a fetch function */
649static int __parse_bitfield_probe_arg(const char *bf,
650 const struct fetch_type *t,
651 struct fetch_param *f)
652{
653 struct bitfield_fetch_param *bprm;
654 unsigned long bw, bo;
655 char *tail;
656
657 if (*bf != 'b')
658 return 0;
659
660 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
661 if (!bprm)
662 return -ENOMEM;
663
664 bprm->orig = *f;
665 f->fn = t->fetch[FETCH_MTD_bitfield];
666 f->data = (void *)bprm;
667 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
668
669 if (bw == 0 || *tail != '@')
670 return -EINVAL;
671
672 bf = tail + 1;
673 bo = simple_strtoul(bf, &tail, 0);
674
675 if (tail == bf || *tail != '/')
676 return -EINVAL;
677
678 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
679 bprm->low_shift = bprm->hi_shift + bo;
680
681 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
682}
683
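/*
 * Worked example of the shift arithmetic above for "b4@2/32" - a 4-bit
 * field at bit offset 2 (from the least significant bit) inside a 32-bit
 * container, so t->size == 4 and BYTES_TO_BITS(t->size) == 32:
 *
 *	bw = 4, bo = 2
 *	hi_shift  = 32 - (4 + 2) = 26
 *	low_shift = 26 + 2       = 28
 *
 * fetch_bitfield_u32() then computes (val << 26) >> 28: the left shift
 * discards the 26 bits above the field and the right shift discards the
 * 2 bits below it, leaving the 4-bit field right-aligned in the result.
 */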
684/* String length checking wrapper */
685int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
686 struct probe_arg *parg, bool is_return, bool is_kprobe)
687{
688 const char *t;
689 int ret;
690
691 if (strlen(arg) > MAX_ARGSTR_LEN) {
692		pr_info("Argument is too long: %s\n", arg);
693 return -ENOSPC;
694 }
695 parg->comm = kstrdup(arg, GFP_KERNEL);
696 if (!parg->comm) {
697 pr_info("Failed to allocate memory for command '%s'.\n", arg);
698 return -ENOMEM;
699 }
700 t = strchr(parg->comm, ':');
701 if (t) {
702 arg[t - parg->comm] = '\0';
703 t++;
704 }
705 parg->type = find_fetch_type(t);
706 if (!parg->type) {
707 pr_info("Unsupported type: %s\n", t);
708 return -EINVAL;
709 }
710 parg->offset = *size;
711 *size += parg->type->size;
712 ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe);
713
714 if (ret >= 0 && t != NULL)
715 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
716
717 if (ret >= 0) {
718 parg->fetch_size.fn = get_fetch_size_function(parg->type,
719 parg->fetch.fn);
720 parg->fetch_size.data = parg->fetch.data;
721 }
722
723 return ret;
724}
725
726/* Return 1 if name is reserved or already used by another argument */
727int traceprobe_conflict_field_name(const char *name,
728 struct probe_arg *args, int narg)
729{
730 int i;
731
732 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
733 if (strcmp(reserved_field_names[i], name) == 0)
734 return 1;
735
736 for (i = 0; i < narg; i++)
737 if (strcmp(args[i].name, name) == 0)
738 return 1;
739
740 return 0;
741}
742
743void traceprobe_update_arg(struct probe_arg *arg)
744{
745 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
746 update_bitfield_fetch_param(arg->fetch.data);
747 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
748 update_deref_fetch_param(arg->fetch.data);
749 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
750 update_symbol_cache(arg->fetch.data);
751}
752
753void traceprobe_free_probe_arg(struct probe_arg *arg)
754{
755 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
756 free_bitfield_fetch_param(arg->fetch.data);
757 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
758 free_deref_fetch_param(arg->fetch.data);
759 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
760 free_symbol_cache(arg->fetch.data);
761
762 kfree(arg->name);
763 kfree(arg->comm);
764}
765
766int traceprobe_command(const char *buf, int (*createfn)(int, char **))
767{
768 char **argv;
769 int argc, ret;
770
771 argc = 0;
772 ret = 0;
773 argv = argv_split(GFP_KERNEL, buf, &argc);
774 if (!argv)
775 return -ENOMEM;
776
777 if (argc)
778 ret = createfn(argc, argv);
779
780 argv_free(argv);
781
782 return ret;
783}
784
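/*
 * Illustrative usage sketch (hypothetical callback and command string, not
 * part of the original file): traceprobe_command() splits one definition
 * line into an argv[] and hands it to the creation callback, e.g. a line a
 * user would write to the kprobe_events file.
 */
static int example_createfn(int argc, char **argv)
{
	int i;

	for (i = 0; i < argc; i++)
		pr_info("arg[%d] = %s\n", i, argv[i]);

	return 0;
}

static __maybe_unused int example_run_command(void)
{
	/* "p:myevent do_sys_open dfd=%ax" -> argc == 3 */
	return traceprobe_command("p:myevent do_sys_open dfd=%ax",
				  example_createfn);
}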
785#define WRITE_BUFSIZE 4096
786
787ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
788 size_t count, loff_t *ppos,
789 int (*createfn)(int, char **))
790{
791 char *kbuf, *tmp;
792 int ret = 0;
793 size_t done = 0;
794 size_t size;
795
796 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
797 if (!kbuf)
798 return -ENOMEM;
799
800 while (done < count) {
801 size = count - done;
802
803 if (size >= WRITE_BUFSIZE)
804 size = WRITE_BUFSIZE - 1;
805
806 if (copy_from_user(kbuf, buffer + done, size)) {
807 ret = -EFAULT;
808 goto out;
809 }
810 kbuf[size] = '\0';
811 tmp = strchr(kbuf, '\n');
812
813 if (tmp) {
814 *tmp = '\0';
815 size = tmp - kbuf + 1;
816 } else if (done + size < count) {
817 pr_warning("Line length is too long: "
818 "Should be less than %d.", WRITE_BUFSIZE);
819 ret = -EINVAL;
820 goto out;
821 }
822 done += size;
823 /* Remove comments */
824 tmp = strchr(kbuf, '#');
825
826 if (tmp)
827 *tmp = '\0';
828
829 ret = traceprobe_command(kbuf, createfn);
830 if (ret)
831 goto out;
832 }
833 ret = done;
834
835out:
836 kfree(kbuf);
837
838 return ret;
839}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
deleted file mode 100644
index 93370867781..00000000000
--- a/kernel/trace/trace_probe.h
+++ /dev/null
@@ -1,161 +0,0 @@
1/*
2 * Common header file for probe-based Dynamic events.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * This code was copied from kernel/trace/trace_kprobe.h written by
18 * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
19 *
20 * Updates to make this generic:
21 * Copyright (C) IBM Corporation, 2010-2011
22 * Author: Srikar Dronamraju
23 */
24
25#include <linux/seq_file.h>
26#include <linux/slab.h>
27#include <linux/smp.h>
28#include <linux/debugfs.h>
29#include <linux/types.h>
30#include <linux/string.h>
31#include <linux/ctype.h>
32#include <linux/ptrace.h>
33#include <linux/perf_event.h>
34#include <linux/kprobes.h>
35#include <linux/stringify.h>
36#include <linux/limits.h>
37#include <linux/uaccess.h>
38#include <asm/bitsperlong.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define MAX_TRACE_ARGS 128
44#define MAX_ARGSTR_LEN 63
45#define MAX_EVENT_NAME_LEN 64
46#define MAX_STRING_SIZE PATH_MAX
47
48/* Reserved field names */
49#define FIELD_STRING_IP "__probe_ip"
50#define FIELD_STRING_RETIP "__probe_ret_ip"
51#define FIELD_STRING_FUNC "__probe_func"
52
53#undef DEFINE_FIELD
54#define DEFINE_FIELD(type, item, name, is_signed) \
55 do { \
56 ret = trace_define_field(event_call, #type, name, \
57 offsetof(typeof(field), item), \
58 sizeof(field.item), is_signed, \
59 FILTER_OTHER); \
60 if (ret) \
61 return ret; \
62 } while (0)
63
64
65/* Flags for trace_probe */
66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70
71
72/* data_rloc: data relative location, compatible with u32 */
73#define make_data_rloc(len, roffs) \
74 (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
75#define get_rloc_len(dl) ((u32)(dl) >> 16)
76#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
77
78/*
79 * Convert data_rloc to data_loc:
80 * data_rloc stores the offset from data_rloc itself, but data_loc
81 * stores the offset from event entry.
82 */
83#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
84
85/* Data fetch function type */
86typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
87/* Printing function type */
88typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
89
90/* Fetch types */
91enum {
92 FETCH_MTD_reg = 0,
93 FETCH_MTD_stack,
94 FETCH_MTD_retval,
95 FETCH_MTD_memory,
96 FETCH_MTD_symbol,
97 FETCH_MTD_deref,
98 FETCH_MTD_bitfield,
99 FETCH_MTD_END,
100};
101
102/* Fetch type information table */
103struct fetch_type {
104 const char *name; /* Name of type */
105 size_t size; /* Byte size of type */
106 int is_signed; /* Signed flag */
107 print_type_func_t print; /* Print function */
108 const char *fmt; /* Format string */
109 const char *fmttype; /* Name in format file */
110 /* Fetch functions */
111 fetch_func_t fetch[FETCH_MTD_END];
112};
113
114struct fetch_param {
115 fetch_func_t fn;
116 void *data;
117};
118
119struct probe_arg {
120 struct fetch_param fetch;
121 struct fetch_param fetch_size;
122 unsigned int offset; /* Offset from argument entry */
123 const char *name; /* Name of this argument */
124 const char *comm; /* Command of this argument */
125 const struct fetch_type *type; /* Type of this argument */
126};
127
128static inline __kprobes void call_fetch(struct fetch_param *fprm,
129 struct pt_regs *regs, void *dest)
130{
131 return fprm->fn(regs, fprm->data, dest);
132}
133
134/* Check the name is good for event/group/fields */
135static inline int is_good_name(const char *name)
136{
137 if (!isalpha(*name) && *name != '_')
138 return 0;
139 while (*++name != '\0') {
140 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
141 return 0;
142 }
143 return 1;
144}
145
146extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
147 struct probe_arg *parg, bool is_return, bool is_kprobe);
148
149extern int traceprobe_conflict_field_name(const char *name,
150 struct probe_arg *args, int narg);
151
152extern void traceprobe_update_arg(struct probe_arg *arg);
153extern void traceprobe_free_probe_arg(struct probe_arg *arg);
154
155extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
156
157extern ssize_t traceprobe_probes_write(struct file *file,
158 const char __user *buffer, size_t count, loff_t *ppos,
159 int (*createfn)(int, char**));
160
161extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd..7e62c0a1845 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,7 +102,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 trace_buffer_unlock_commit(buffer, event, flags, pc); 105 ring_buffer_unlock_commit(buffer, event);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
106} 108}
107 109
108static void 110static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca..e4a70c0c71b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 Nadia Yvette Chambers 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -108,8 +108,7 @@ out_enable:
108 * wakeup uses its own tracer function to keep the overhead down: 108 * wakeup uses its own tracer function to keep the overhead down:
109 */ 109 */
110static void 110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, 111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112 struct ftrace_ops *op, struct pt_regs *pt_regs)
113{ 112{
114 struct trace_array *tr = wakeup_trace; 113 struct trace_array *tr = wakeup_trace;
115 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
@@ -130,7 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
130static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
131{ 130{
132 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
133 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 132 .flags = FTRACE_OPS_FL_GLOBAL,
134}; 133};
135#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
136 135
@@ -281,20 +280,9 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
281} 280}
282 281
283static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 284static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 285static void wakeup_trace_close(struct trace_iterator *iter) { }
286
287#ifdef CONFIG_FUNCTION_TRACER
288static void wakeup_print_header(struct seq_file *s)
289{
290 trace_default_header(s);
291}
292#else
293static void wakeup_print_header(struct seq_file *s)
294{
295 trace_latency_header(s);
296}
297#endif /* CONFIG_FUNCTION_TRACER */
298#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
299 287
300/* 288/*
@@ -589,7 +577,7 @@ static struct tracer wakeup_tracer __read_mostly =
589 .reset = wakeup_tracer_reset, 577 .reset = wakeup_tracer_reset,
590 .start = wakeup_tracer_start, 578 .start = wakeup_tracer_start,
591 .stop = wakeup_tracer_stop, 579 .stop = wakeup_tracer_stop,
592 .print_max = true, 580 .print_max = 1,
593 .print_header = wakeup_print_header, 581 .print_header = wakeup_print_header,
594 .print_line = wakeup_print_line, 582 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 583 .flags = &tracer_flags,
@@ -599,7 +587,7 @@ static struct tracer wakeup_tracer __read_mostly =
599#endif 587#endif
600 .open = wakeup_trace_open, 588 .open = wakeup_trace_open,
601 .close = wakeup_trace_close, 589 .close = wakeup_trace_close,
602 .use_max_tr = true, 590 .use_max_tr = 1,
603}; 591};
604 592
605static struct tracer wakeup_rt_tracer __read_mostly = 593static struct tracer wakeup_rt_tracer __read_mostly =
@@ -610,7 +598,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
610 .start = wakeup_tracer_start, 598 .start = wakeup_tracer_start,
611 .stop = wakeup_tracer_stop, 599 .stop = wakeup_tracer_stop,
612 .wait_pipe = poll_wait_pipe, 600 .wait_pipe = poll_wait_pipe,
613 .print_max = true, 601 .print_max = 1,
614 .print_header = wakeup_print_header, 602 .print_header = wakeup_print_header,
615 .print_line = wakeup_print_line, 603 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 604 .flags = &tracer_flags,
@@ -620,7 +608,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
620#endif 608#endif
621 .open = wakeup_trace_open, 609 .open = wakeup_trace_open,
622 .close = wakeup_trace_close, 610 .close = wakeup_trace_close,
623 .use_max_tr = true, 611 .use_max_tr = 1,
624}; 612};
625 613
626__init static int init_wakeup_tracer(void) 614__init static int init_wakeup_tracer(void)
@@ -637,4 +625,4 @@ __init static int init_wakeup_tracer(void)
637 625
638 return 0; 626 return 0;
639} 627}
640core_initcall(init_wakeup_tracer); 628device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a81..288541f977f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -103,67 +103,54 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
103 103
104static int trace_selftest_test_probe1_cnt; 104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip, 105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip, 106 unsigned long pip)
107 struct ftrace_ops *op,
108 struct pt_regs *pt_regs)
109{ 107{
110 trace_selftest_test_probe1_cnt++; 108 trace_selftest_test_probe1_cnt++;
111} 109}
112 110
113static int trace_selftest_test_probe2_cnt; 111static int trace_selftest_test_probe2_cnt;
114static void trace_selftest_test_probe2_func(unsigned long ip, 112static void trace_selftest_test_probe2_func(unsigned long ip,
115 unsigned long pip, 113 unsigned long pip)
116 struct ftrace_ops *op,
117 struct pt_regs *pt_regs)
118{ 114{
119 trace_selftest_test_probe2_cnt++; 115 trace_selftest_test_probe2_cnt++;
120} 116}
121 117
122static int trace_selftest_test_probe3_cnt; 118static int trace_selftest_test_probe3_cnt;
123static void trace_selftest_test_probe3_func(unsigned long ip, 119static void trace_selftest_test_probe3_func(unsigned long ip,
124 unsigned long pip, 120 unsigned long pip)
125 struct ftrace_ops *op,
126 struct pt_regs *pt_regs)
127{ 121{
128 trace_selftest_test_probe3_cnt++; 122 trace_selftest_test_probe3_cnt++;
129} 123}
130 124
131static int trace_selftest_test_global_cnt; 125static int trace_selftest_test_global_cnt;
132static void trace_selftest_test_global_func(unsigned long ip, 126static void trace_selftest_test_global_func(unsigned long ip,
133 unsigned long pip, 127 unsigned long pip)
134 struct ftrace_ops *op,
135 struct pt_regs *pt_regs)
136{ 128{
137 trace_selftest_test_global_cnt++; 129 trace_selftest_test_global_cnt++;
138} 130}
139 131
140static int trace_selftest_test_dyn_cnt; 132static int trace_selftest_test_dyn_cnt;
141static void trace_selftest_test_dyn_func(unsigned long ip, 133static void trace_selftest_test_dyn_func(unsigned long ip,
142 unsigned long pip, 134 unsigned long pip)
143 struct ftrace_ops *op,
144 struct pt_regs *pt_regs)
145{ 135{
146 trace_selftest_test_dyn_cnt++; 136 trace_selftest_test_dyn_cnt++;
147} 137}
148 138
149static struct ftrace_ops test_probe1 = { 139static struct ftrace_ops test_probe1 = {
150 .func = trace_selftest_test_probe1_func, 140 .func = trace_selftest_test_probe1_func,
151 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
152}; 141};
153 142
154static struct ftrace_ops test_probe2 = { 143static struct ftrace_ops test_probe2 = {
155 .func = trace_selftest_test_probe2_func, 144 .func = trace_selftest_test_probe2_func,
156 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
157}; 145};
158 146
159static struct ftrace_ops test_probe3 = { 147static struct ftrace_ops test_probe3 = {
160 .func = trace_selftest_test_probe3_func, 148 .func = trace_selftest_test_probe3_func,
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
162}; 149};
163 150
164static struct ftrace_ops test_global = { 151static struct ftrace_ops test_global = {
165 .func = trace_selftest_test_global_func, 152 .func = trace_selftest_test_global_func,
166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, 153 .flags = FTRACE_OPS_FL_GLOBAL,
167}; 154};
168 155
169static void print_counts(void) 156static void print_counts(void)
@@ -320,6 +307,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
320 int (*func)(void)) 307 int (*func)(void))
321{ 308{
322 int save_ftrace_enabled = ftrace_enabled; 309 int save_ftrace_enabled = ftrace_enabled;
310 int save_tracer_enabled = tracer_enabled;
323 unsigned long count; 311 unsigned long count;
324 char *func_name; 312 char *func_name;
325 int ret; 313 int ret;
@@ -330,6 +318,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
330 318
331 /* enable tracing, and record the filter function */ 319 /* enable tracing, and record the filter function */
332 ftrace_enabled = 1; 320 ftrace_enabled = 1;
321 tracer_enabled = 1;
333 322
334 /* passed in by parameter to fool gcc from optimizing */ 323 /* passed in by parameter to fool gcc from optimizing */
335 func(); 324 func();
@@ -393,6 +382,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
393 382
394 out: 383 out:
395 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
385 tracer_enabled = save_tracer_enabled;
396 386
397 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
398 ftrace_set_global_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
@@ -403,247 +393,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
403 393
404 return ret; 394 return ret;
405} 395}
406
407static int trace_selftest_recursion_cnt;
408static void trace_selftest_test_recursion_func(unsigned long ip,
409 unsigned long pip,
410 struct ftrace_ops *op,
411 struct pt_regs *pt_regs)
412{
413 /*
414 * This function is registered without the recursion safe flag.
415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel!
417 */
418 trace_selftest_recursion_cnt++;
419 DYN_FTRACE_TEST_NAME();
420}
421
422static void trace_selftest_test_recursion_safe_func(unsigned long ip,
423 unsigned long pip,
424 struct ftrace_ops *op,
425 struct pt_regs *pt_regs)
426{
427 /*
428 * We said we would provide our own recursion protection. By calling
429 * this function again, we should recurse back into this function
430 * and count again. But this only happens if the arch supports
431 * all of the ftrace features and nothing else is using the function
432 * tracing utility.
433 */
434 if (trace_selftest_recursion_cnt++)
435 return;
436 DYN_FTRACE_TEST_NAME();
437}
438
439static struct ftrace_ops test_rec_probe = {
440 .func = trace_selftest_test_recursion_func,
441};
442
443static struct ftrace_ops test_recsafe_probe = {
444 .func = trace_selftest_test_recursion_safe_func,
445 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
446};
447
448static int
449trace_selftest_function_recursion(void)
450{
451 int save_ftrace_enabled = ftrace_enabled;
452 char *func_name;
453 int len;
454 int ret;
455 int cnt;
456
457 /* The previous test PASSED */
458 pr_cont("PASSED\n");
459 pr_info("Testing ftrace recursion: ");
460
461
462 /* enable tracing, and record the filter function */
463 ftrace_enabled = 1;
464
465 /* Handle PPC64 '.' name */
466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
467 len = strlen(func_name);
468
469 ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
470 if (ret) {
471 pr_cont("*Could not set filter* ");
472 goto out;
473 }
474
475 ret = register_ftrace_function(&test_rec_probe);
476 if (ret) {
477 pr_cont("*could not register callback* ");
478 goto out;
479 }
480
481 DYN_FTRACE_TEST_NAME();
482
483 unregister_ftrace_function(&test_rec_probe);
484
485 ret = -1;
486 if (trace_selftest_recursion_cnt != 1) {
487 pr_cont("*callback not called once (%d)* ",
488 trace_selftest_recursion_cnt);
489 goto out;
490 }
491
492 trace_selftest_recursion_cnt = 1;
493
494 pr_cont("PASSED\n");
495 pr_info("Testing ftrace recursion safe: ");
496
497 ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
498 if (ret) {
499 pr_cont("*Could not set filter* ");
500 goto out;
501 }
502
503 ret = register_ftrace_function(&test_recsafe_probe);
504 if (ret) {
505 pr_cont("*could not register callback* ");
506 goto out;
507 }
508
509 DYN_FTRACE_TEST_NAME();
510
511 unregister_ftrace_function(&test_recsafe_probe);
512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) {
524 pr_cont("*callback not called expected %d times (%d)* ",
525 cnt, trace_selftest_recursion_cnt);
526 goto out;
527 }
528
529 ret = 0;
530out:
531 ftrace_enabled = save_ftrace_enabled;
532
533 return ret;
534}
535#else 396#else
536# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 397# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
537# define trace_selftest_function_recursion() ({ 0; })
538#endif /* CONFIG_DYNAMIC_FTRACE */ 398#endif /* CONFIG_DYNAMIC_FTRACE */
539 399
540static enum {
541 TRACE_SELFTEST_REGS_START,
542 TRACE_SELFTEST_REGS_FOUND,
543 TRACE_SELFTEST_REGS_NOT_FOUND,
544} trace_selftest_regs_stat;
545
546static void trace_selftest_test_regs_func(unsigned long ip,
547 unsigned long pip,
548 struct ftrace_ops *op,
549 struct pt_regs *pt_regs)
550{
551 if (pt_regs)
552 trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
553 else
554 trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
555}
556
557static struct ftrace_ops test_regs_probe = {
558 .func = trace_selftest_test_regs_func,
559 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
560};
561
562static int
563trace_selftest_function_regs(void)
564{
565 int save_ftrace_enabled = ftrace_enabled;
566 char *func_name;
567 int len;
568 int ret;
569 int supported = 0;
570
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
572 supported = 1;
573#endif
574
575 /* The previous test PASSED */
576 pr_cont("PASSED\n");
577 pr_info("Testing ftrace regs%s: ",
578 !supported ? "(no arch support)" : "");
579
580 /* enable tracing, and record the filter function */
581 ftrace_enabled = 1;
582
583 /* Handle PPC64 '.' name */
584 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
585 len = strlen(func_name);
586
587 ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
588 /*
589 * If DYNAMIC_FTRACE is not set, then we just trace all functions.
590 * This test really doesn't care.
591 */
592 if (ret && ret != -ENODEV) {
593 pr_cont("*Could not set filter* ");
594 goto out;
595 }
596
597 ret = register_ftrace_function(&test_regs_probe);
598 /*
599 * Now if the arch does not support passing regs, then this should
600 * have failed.
601 */
602 if (!supported) {
603 if (!ret) {
604 pr_cont("*registered save-regs without arch support* ");
605 goto out;
606 }
607 test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
608 ret = register_ftrace_function(&test_regs_probe);
609 }
610 if (ret) {
611 pr_cont("*could not register callback* ");
612 goto out;
613 }
614
615
616 DYN_FTRACE_TEST_NAME();
617
618 unregister_ftrace_function(&test_regs_probe);
619
620 ret = -1;
621
622 switch (trace_selftest_regs_stat) {
623 case TRACE_SELFTEST_REGS_START:
624 pr_cont("*callback never called* ");
625 goto out;
626
627 case TRACE_SELFTEST_REGS_FOUND:
628 if (supported)
629 break;
630 pr_cont("*callback received regs without arch support* ");
631 goto out;
632
633 case TRACE_SELFTEST_REGS_NOT_FOUND:
634 if (!supported)
635 break;
636 pr_cont("*callback received NULL regs* ");
637 goto out;
638 }
639
640 ret = 0;
641out:
642 ftrace_enabled = save_ftrace_enabled;
643
644 return ret;
645}
646
647/* 400/*
648 * Simple verification test of ftrace function tracer. 401 * Simple verification test of ftrace function tracer.
649 * Enable ftrace, sleep 1/10 second, and then read the trace 402 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -653,6 +406,7 @@ int
653trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 406trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
654{ 407{
655 int save_ftrace_enabled = ftrace_enabled; 408 int save_ftrace_enabled = ftrace_enabled;
409 int save_tracer_enabled = tracer_enabled;
656 unsigned long count; 410 unsigned long count;
657 int ret; 411 int ret;
658 412
@@ -661,6 +415,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
661 415
662 /* start the tracing */ 416 /* start the tracing */
663 ftrace_enabled = 1; 417 ftrace_enabled = 1;
418 tracer_enabled = 1;
664 419
665 ret = tracer_init(trace, tr); 420 ret = tracer_init(trace, tr);
666 if (ret) { 421 if (ret) {
@@ -687,16 +442,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
687 442
688 ret = trace_selftest_startup_dynamic_tracing(trace, tr, 443 ret = trace_selftest_startup_dynamic_tracing(trace, tr,
689 DYN_FTRACE_TEST_NAME); 444 DYN_FTRACE_TEST_NAME);
690 if (ret)
691 goto out;
692
693 ret = trace_selftest_function_recursion();
694 if (ret)
695 goto out;
696 445
697 ret = trace_selftest_function_regs();
698 out: 446 out:
699 ftrace_enabled = save_ftrace_enabled; 447 ftrace_enabled = save_ftrace_enabled;
448 tracer_enabled = save_tracer_enabled;
700 449
701 /* kill ftrace totally if we failed */ 450 /* kill ftrace totally if we failed */
702 if (ret) 451 if (ret)
@@ -1029,8 +778,6 @@ static int trace_wakeup_test_thread(void *data)
1029 set_current_state(TASK_INTERRUPTIBLE); 778 set_current_state(TASK_INTERRUPTIBLE);
1030 schedule(); 779 schedule();
1031 780
1032 complete(x);
1033
1034 /* we are awake, now wait to disappear */ 781 /* we are awake, now wait to disappear */
1035 while (!kthread_should_stop()) { 782 while (!kthread_should_stop()) {
1036 /* 783 /*
@@ -1074,27 +821,29 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1074 /* reset the max latency */ 821 /* reset the max latency */
1075 tracing_max_latency = 0; 822 tracing_max_latency = 0;
1076 823
1077 while (p->on_rq) { 824 /* sleep to let the RT thread sleep too */
1078 /* 825 msleep(100);
1079 * Sleep to make sure the RT thread is asleep too.
1080 * On virtual machines we can't rely on timings,
1081 * but we want to make sure this test still works.
1082 */
1083 msleep(100);
1084 }
1085 826
1086 init_completion(&isrt); 827 /*
828 * Yes, this is slightly racy. It is possible that, for some
829 * strange reason, the RT thread we created did not
830 * call schedule for 100ms after doing the completion,
831 * and we do a wakeup on a task that already is awake.
832 * But that is extremely unlikely, and the worst thing that
833 * happens in such a case, is that we disable tracing.
834 * Honestly, if this race does happen, something is horribly
835 * wrong with the system.
836 */
1087 837
1088 wake_up_process(p); 838 wake_up_process(p);
1089 839
1090 /* Wait for the task to wake up */ 840 /* give a little time to let the thread wake up */
1091 wait_for_completion(&isrt); 841 msleep(100);
1092 842
1093 /* stop the tracing. */ 843 /* stop the tracing. */
1094 tracing_stop(); 844 tracing_stop();
1095 /* check both trace buffers */ 845 /* check both trace buffers */
1096 ret = trace_test_buffer(tr, NULL); 846 ret = trace_test_buffer(tr, NULL);
1097 printk("ret = %d\n", ret);
1098 if (!ret) 847 if (!ret)
1099 ret = trace_test_buffer(&max_tr, &count); 848 ret = trace_test_buffer(&max_tr, &count);
1100 849
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc70..77575b386d9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,9 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16
17#include <asm/setup.h>
18
19#include "trace.h" 16#include "trace.h"
20 17
21#define STACK_TRACE_ENTRIES 500 18#define STACK_TRACE_ENTRIES 500
@@ -33,6 +30,7 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 32
33static int stack_trace_disabled __read_mostly;
36static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
37static DEFINE_MUTEX(stack_sysctl_mutex); 35static DEFINE_MUTEX(stack_sysctl_mutex);
38 36
@@ -110,11 +108,13 @@ static inline void check_stack(void)
110} 108}
111 109
112static void 110static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 111stack_trace_call(unsigned long ip, unsigned long parent_ip)
114 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 112{
116 int cpu; 113 int cpu;
117 114
115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
116 return;
117
118 preempt_disable_notrace(); 118 preempt_disable_notrace();
119 119
120 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
@@ -133,7 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 136 .flags = FTRACE_OPS_FL_GLOBAL,
137}; 137};
138 138
139static ssize_t 139static ssize_t
@@ -311,21 +311,6 @@ static const struct file_operations stack_trace_fops = {
311 .release = seq_release, 311 .release = seq_release,
312}; 312};
313 313
314static int
315stack_trace_filter_open(struct inode *inode, struct file *file)
316{
317 return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
318 inode, file);
319}
320
321static const struct file_operations stack_trace_filter_fops = {
322 .open = stack_trace_filter_open,
323 .read = seq_read,
324 .write = ftrace_filter_write,
325 .llseek = ftrace_regex_lseek,
326 .release = ftrace_regex_release,
327};
328
329int 314int
330stack_trace_sysctl(struct ctl_table *table, int write, 315stack_trace_sysctl(struct ctl_table *table, int write,
331 void __user *buffer, size_t *lenp, 316 void __user *buffer, size_t *lenp,
@@ -353,13 +338,8 @@ stack_trace_sysctl(struct ctl_table *table, int write,
353 return ret; 338 return ret;
354} 339}
355 340
356static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
357
358static __init int enable_stacktrace(char *str) 341static __init int enable_stacktrace(char *str)
359{ 342{
360 if (strncmp(str, "_filter=", 8) == 0)
361 strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
362
363 stack_tracer_enabled = 1; 343 stack_tracer_enabled = 1;
364 last_stack_tracer_enabled = 1; 344 last_stack_tracer_enabled = 1;
365 return 1; 345 return 1;
@@ -378,12 +358,6 @@ static __init int stack_trace_init(void)
378 trace_create_file("stack_trace", 0444, d_tracer, 358 trace_create_file("stack_trace", 0444, d_tracer,
379 NULL, &stack_trace_fops); 359 NULL, &stack_trace_fops);
380 360
381 trace_create_file("stack_trace_filter", 0444, d_tracer,
382 NULL, &stack_trace_filter_fops);
383
384 if (stack_trace_filter_buf[0])
385 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
386
387 if (stack_tracer_enabled) 361 if (stack_tracer_enabled)
388 register_ftrace_function(&trace_ops); 362 register_ftrace_function(&trace_ops);
389 363
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c..ee7b5a0bb9f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,6 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
6#include <linux/ftrace.h> 5#include <linux/ftrace.h>
7#include <linux/perf_event.h> 6#include <linux/perf_event.h>
8#include <asm/syscall.h> 7#include <asm/syscall.h>
@@ -17,9 +16,12 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 17
19static int syscall_enter_register(struct ftrace_event_call *event, 18static int syscall_enter_register(struct ftrace_event_call *event,
20 enum trace_reg type, void *data); 19 enum trace_reg type);
21static int syscall_exit_register(struct ftrace_event_call *event, 20static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
23 25
24static struct list_head * 26static struct list_head *
25syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
@@ -29,6 +31,30 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
29 return &entry->enter_fields; 31 return &entry->enter_fields;
30} 32}
31 33
34struct trace_event_functions enter_syscall_print_funcs = {
35 .trace = print_syscall_enter,
36};
37
38struct trace_event_functions exit_syscall_print_funcs = {
39 .trace = print_syscall_exit,
40};
41
42struct ftrace_event_class event_class_syscall_enter = {
43 .system = "syscalls",
44 .reg = syscall_enter_register,
45 .define_fields = syscall_enter_define_fields,
46 .get_fields = syscall_get_enter_fields,
47 .raw_init = init_syscall_trace,
48};
49
50struct ftrace_event_class event_class_syscall_exit = {
51 .system = "syscalls",
52 .reg = syscall_exit_register,
53 .define_fields = syscall_exit_define_fields,
54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
55 .raw_init = init_syscall_trace,
56};
57
32extern struct syscall_metadata *__start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
33extern struct syscall_metadata *__stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
34 60
@@ -405,7 +431,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
405 mutex_unlock(&syscall_trace_lock); 431 mutex_unlock(&syscall_trace_lock);
406} 432}
407 433
408static int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
409{ 435{
410 int id; 436 int id;
411 int num; 437 int num;
@@ -430,30 +456,6 @@ static int init_syscall_trace(struct ftrace_event_call *call)
430 return id; 456 return id;
431} 457}
432 458
433struct trace_event_functions enter_syscall_print_funcs = {
434 .trace = print_syscall_enter,
435};
436
437struct trace_event_functions exit_syscall_print_funcs = {
438 .trace = print_syscall_exit,
439};
440
441struct ftrace_event_class event_class_syscall_enter = {
442 .system = "syscalls",
443 .reg = syscall_enter_register,
444 .define_fields = syscall_enter_define_fields,
445 .get_fields = syscall_get_enter_fields,
446 .raw_init = init_syscall_trace,
447};
448
449struct ftrace_event_class event_class_syscall_exit = {
450 .system = "syscalls",
451 .reg = syscall_exit_register,
452 .define_fields = syscall_exit_define_fields,
453 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
454 .raw_init = init_syscall_trace,
455};
456
457unsigned long __init __weak arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
458{ 460{
459 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
@@ -465,8 +467,8 @@ int __init init_ftrace_syscalls(void)
465 unsigned long addr; 467 unsigned long addr;
466 int i; 468 int i;
467 469
468 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), 470 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
469 GFP_KERNEL); 471 NR_syscalls, GFP_KERNEL);
470 if (!syscalls_metadata) { 472 if (!syscalls_metadata) {
471 WARN_ON(1); 473 WARN_ON(1);
472 return -ENOMEM; 474 return -ENOMEM;
@@ -484,7 +486,7 @@ int __init init_ftrace_syscalls(void)
484 486
485 return 0; 487 return 0;
486} 488}
487early_initcall(init_ftrace_syscalls); 489core_initcall(init_ftrace_syscalls);
488 490
489#ifdef CONFIG_PERF_EVENTS 491#ifdef CONFIG_PERF_EVENTS
490 492
@@ -503,8 +505,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
503 int size; 505 int size;
504 506
505 syscall_nr = syscall_get_nr(current, regs); 507 syscall_nr = syscall_get_nr(current, regs);
506 if (syscall_nr < 0)
507 return;
508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
509 return; 509 return;
510 510
@@ -531,10 +531,10 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
531 (unsigned long *)&rec->args); 531 (unsigned long *)&rec->args);
532 532
533 head = this_cpu_ptr(sys_data->enter_event->perf_events); 533 head = this_cpu_ptr(sys_data->enter_event->perf_events);
534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
535} 535}
536 536
537static int perf_sysenter_enable(struct ftrace_event_call *call) 537int perf_sysenter_enable(struct ftrace_event_call *call)
538{ 538{
539 int ret = 0; 539 int ret = 0;
540 int num; 540 int num;
@@ -555,7 +555,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call)
555 return ret; 555 return ret;
556} 556}
557 557
558static void perf_sysenter_disable(struct ftrace_event_call *call) 558void perf_sysenter_disable(struct ftrace_event_call *call)
559{ 559{
560 int num; 560 int num;
561 561
@@ -579,8 +579,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
579 int size; 579 int size;
580 580
581 syscall_nr = syscall_get_nr(current, regs); 581 syscall_nr = syscall_get_nr(current, regs);
582 if (syscall_nr < 0)
583 return;
584 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 582 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
585 return; 583 return;
586 584
@@ -609,10 +607,10 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
609 rec->ret = syscall_get_return_value(current, regs); 607 rec->ret = syscall_get_return_value(current, regs);
610 608
611 head = this_cpu_ptr(sys_data->exit_event->perf_events); 609 head = this_cpu_ptr(sys_data->exit_event->perf_events);
612 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 610 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
613} 611}
614 612
615static int perf_sysexit_enable(struct ftrace_event_call *call) 613int perf_sysexit_enable(struct ftrace_event_call *call)
616{ 614{
617 int ret = 0; 615 int ret = 0;
618 int num; 616 int num;
@@ -633,7 +631,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call)
633 return ret; 631 return ret;
634} 632}
635 633
636static void perf_sysexit_disable(struct ftrace_event_call *call) 634void perf_sysexit_disable(struct ftrace_event_call *call)
637{ 635{
638 int num; 636 int num;
639 637
@@ -650,7 +648,7 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
650#endif /* CONFIG_PERF_EVENTS */ 648#endif /* CONFIG_PERF_EVENTS */
651 649
652static int syscall_enter_register(struct ftrace_event_call *event, 650static int syscall_enter_register(struct ftrace_event_call *event,
653 enum trace_reg type, void *data) 651 enum trace_reg type)
654{ 652{
655 switch (type) { 653 switch (type) {
656 case TRACE_REG_REGISTER: 654 case TRACE_REG_REGISTER:
@@ -665,18 +663,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
665 case TRACE_REG_PERF_UNREGISTER: 663 case TRACE_REG_PERF_UNREGISTER:
666 perf_sysenter_disable(event); 664 perf_sysenter_disable(event);
667 return 0; 665 return 0;
668 case TRACE_REG_PERF_OPEN:
669 case TRACE_REG_PERF_CLOSE:
670 case TRACE_REG_PERF_ADD:
671 case TRACE_REG_PERF_DEL:
672 return 0;
673#endif 666#endif
674 } 667 }
675 return 0; 668 return 0;
676} 669}
677 670
678static int syscall_exit_register(struct ftrace_event_call *event, 671static int syscall_exit_register(struct ftrace_event_call *event,
679 enum trace_reg type, void *data) 672 enum trace_reg type)
680{ 673{
681 switch (type) { 674 switch (type) {
682 case TRACE_REG_REGISTER: 675 case TRACE_REG_REGISTER:
@@ -691,11 +684,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
691 case TRACE_REG_PERF_UNREGISTER: 684 case TRACE_REG_PERF_UNREGISTER:
692 perf_sysexit_disable(event); 685 perf_sysexit_disable(event);
693 return 0; 686 return 0;
694 case TRACE_REG_PERF_OPEN:
695 case TRACE_REG_PERF_CLOSE:
696 case TRACE_REG_PERF_ADD:
697 case TRACE_REG_PERF_DEL:
698 return 0;
699#endif 687#endif
700 } 688 }
701 return 0; 689 return 0;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
deleted file mode 100644
index c86e6d4f67f..00000000000
--- a/kernel/trace/trace_uprobe.c
+++ /dev/null
@@ -1,788 +0,0 @@
1/*
2 * uprobes-based tracing events
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 *
17 * Copyright (C) IBM Corporation, 2010-2012
18 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
19 */
20
21#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/uprobes.h>
24#include <linux/namei.h>
25#include <linux/string.h>
26
27#include "trace_probe.h"
28
29#define UPROBE_EVENT_SYSTEM "uprobes"
30
31/*
32 * uprobe event core functions
33 */
34struct trace_uprobe;
35struct uprobe_trace_consumer {
36 struct uprobe_consumer cons;
37 struct trace_uprobe *tu;
38};
39
40struct trace_uprobe {
41 struct list_head list;
42 struct ftrace_event_class class;
43 struct ftrace_event_call call;
44 struct uprobe_trace_consumer *consumer;
45 struct inode *inode;
46 char *filename;
47 unsigned long offset;
48 unsigned long nhit;
49 unsigned int flags; /* For TP_FLAG_* */
50 ssize_t size; /* trace entry size */
51 unsigned int nr_args;
52 struct probe_arg args[];
53};
54
55#define SIZEOF_TRACE_UPROBE(n) \
56 (offsetof(struct trace_uprobe, args) + \
57 (sizeof(struct probe_arg) * (n)))
58
59static int register_uprobe_event(struct trace_uprobe *tu);
60static void unregister_uprobe_event(struct trace_uprobe *tu);
61
62static DEFINE_MUTEX(uprobe_lock);
63static LIST_HEAD(uprobe_list);
64
65static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
66
67/*
68 * Allocate new trace_uprobe and initialize it (including uprobes).
69 */
70static struct trace_uprobe *
71alloc_trace_uprobe(const char *group, const char *event, int nargs)
72{
73 struct trace_uprobe *tu;
74
75 if (!event || !is_good_name(event))
76 return ERR_PTR(-EINVAL);
77
78 if (!group || !is_good_name(group))
79 return ERR_PTR(-EINVAL);
80
81 tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
82 if (!tu)
83 return ERR_PTR(-ENOMEM);
84
85 tu->call.class = &tu->class;
86 tu->call.name = kstrdup(event, GFP_KERNEL);
87 if (!tu->call.name)
88 goto error;
89
90 tu->class.system = kstrdup(group, GFP_KERNEL);
91 if (!tu->class.system)
92 goto error;
93
94 INIT_LIST_HEAD(&tu->list);
95 return tu;
96
97error:
98 kfree(tu->call.name);
99 kfree(tu);
100
101 return ERR_PTR(-ENOMEM);
102}
103
104static void free_trace_uprobe(struct trace_uprobe *tu)
105{
106 int i;
107
108 for (i = 0; i < tu->nr_args; i++)
109 traceprobe_free_probe_arg(&tu->args[i]);
110
111 iput(tu->inode);
112 kfree(tu->call.class->system);
113 kfree(tu->call.name);
114 kfree(tu->filename);
115 kfree(tu);
116}
117
118static struct trace_uprobe *find_probe_event(const char *event, const char *group)
119{
120 struct trace_uprobe *tu;
121
122 list_for_each_entry(tu, &uprobe_list, list)
123 if (strcmp(tu->call.name, event) == 0 &&
124 strcmp(tu->call.class->system, group) == 0)
125 return tu;
126
127 return NULL;
128}
129
130/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
131static void unregister_trace_uprobe(struct trace_uprobe *tu)
132{
133 list_del(&tu->list);
134 unregister_uprobe_event(tu);
135 free_trace_uprobe(tu);
136}
137
138/* Register a trace_uprobe and probe_event */
139static int register_trace_uprobe(struct trace_uprobe *tu)
140{
141 struct trace_uprobe *old_tp;
142 int ret;
143
144 mutex_lock(&uprobe_lock);
145
146 /* register as an event */
147 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
148 if (old_tp)
149 /* delete old event */
150 unregister_trace_uprobe(old_tp);
151
152 ret = register_uprobe_event(tu);
153 if (ret) {
154 pr_warning("Failed to register probe event(%d)\n", ret);
155 goto end;
156 }
157
158 list_add_tail(&tu->list, &uprobe_list);
159
160end:
161 mutex_unlock(&uprobe_lock);
162
163 return ret;
164}
165
166/*
167 * Argument syntax:
168 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
169 *
170 * - Remove uprobe: -:[GRP/]EVENT
171 */
172static int create_trace_uprobe(int argc, char **argv)
173{
174 struct trace_uprobe *tu;
175 struct inode *inode;
176 char *arg, *event, *group, *filename;
177 char buf[MAX_EVENT_NAME_LEN];
178 struct path path;
179 unsigned long offset;
180 bool is_delete;
181 int i, ret;
182
183 inode = NULL;
184 ret = 0;
185 is_delete = false;
186 event = NULL;
187 group = NULL;
188
189 /* argc must be >= 1 */
190 if (argv[0][0] == '-')
191 is_delete = true;
192 else if (argv[0][0] != 'p') {
193 pr_info("Probe definition must be started with 'p' or '-'.\n");
194 return -EINVAL;
195 }
196
197 if (argv[0][1] == ':') {
198 event = &argv[0][2];
199 arg = strchr(event, '/');
200
201 if (arg) {
202 group = event;
203 event = arg + 1;
204 event[-1] = '\0';
205
206 if (strlen(group) == 0) {
207 pr_info("Group name is not specified\n");
208 return -EINVAL;
209 }
210 }
211 if (strlen(event) == 0) {
212 pr_info("Event name is not specified\n");
213 return -EINVAL;
214 }
215 }
216 if (!group)
217 group = UPROBE_EVENT_SYSTEM;
218
219 if (is_delete) {
220 if (!event) {
221 pr_info("Delete command needs an event name.\n");
222 return -EINVAL;
223 }
224 mutex_lock(&uprobe_lock);
225 tu = find_probe_event(event, group);
226
227 if (!tu) {
228 mutex_unlock(&uprobe_lock);
229 pr_info("Event %s/%s doesn't exist.\n", group, event);
230 return -ENOENT;
231 }
232 /* delete an event */
233 unregister_trace_uprobe(tu);
234 mutex_unlock(&uprobe_lock);
235 return 0;
236 }
237
238 if (argc < 2) {
239 pr_info("Probe point is not specified.\n");
240 return -EINVAL;
241 }
242 if (isdigit(argv[1][0])) {
243 pr_info("probe point must be have a filename.\n");
244 return -EINVAL;
245 }
246 arg = strchr(argv[1], ':');
247 if (!arg)
248 goto fail_address_parse;
249
250 *arg++ = '\0';
251 filename = argv[1];
252 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
253 if (ret)
254 goto fail_address_parse;
255
256 ret = kstrtoul(arg, 0, &offset);
257 if (ret)
258 goto fail_address_parse;
259
260 inode = igrab(path.dentry->d_inode);
261
262 argc -= 2;
263 argv += 2;
264
265 /* setup a probe */
266 if (!event) {
267 char *tail;
268 char *ptr;
269
270 tail = kstrdup(kbasename(filename), GFP_KERNEL);
271 if (!tail) {
272 ret = -ENOMEM;
273 goto fail_address_parse;
274 }
275
276 ptr = strpbrk(tail, ".-_");
277 if (ptr)
278 *ptr = '\0';
279
280 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
281 event = buf;
282 kfree(tail);
283 }
284
285 tu = alloc_trace_uprobe(group, event, argc);
286 if (IS_ERR(tu)) {
287 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
288 ret = PTR_ERR(tu);
289 goto fail_address_parse;
290 }
291 tu->offset = offset;
292 tu->inode = inode;
293 tu->filename = kstrdup(filename, GFP_KERNEL);
294
295 if (!tu->filename) {
296 pr_info("Failed to allocate filename.\n");
297 ret = -ENOMEM;
298 goto error;
299 }
300
301 /* parse arguments */
302 ret = 0;
303 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
304 /* Increment count for freeing args in error case */
305 tu->nr_args++;
306
307 /* Parse argument name */
308 arg = strchr(argv[i], '=');
309 if (arg) {
310 *arg++ = '\0';
311 tu->args[i].name = kstrdup(argv[i], GFP_KERNEL);
312 } else {
313 arg = argv[i];
314 /* If argument name is omitted, set "argN" */
315 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
316 tu->args[i].name = kstrdup(buf, GFP_KERNEL);
317 }
318
319 if (!tu->args[i].name) {
320 pr_info("Failed to allocate argument[%d] name.\n", i);
321 ret = -ENOMEM;
322 goto error;
323 }
324
325 if (!is_good_name(tu->args[i].name)) {
326 pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name);
327 ret = -EINVAL;
328 goto error;
329 }
330
331 if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) {
332 pr_info("Argument[%d] name '%s' conflicts with "
333 "another field.\n", i, argv[i]);
334 ret = -EINVAL;
335 goto error;
336 }
337
338 /* Parse fetch argument */
339 ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false);
340 if (ret) {
341 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
342 goto error;
343 }
344 }
345
346 ret = register_trace_uprobe(tu);
347 if (ret)
348 goto error;
349 return 0;
350
351error:
352 free_trace_uprobe(tu);
353 return ret;
354
355fail_address_parse:
356 if (inode)
357 iput(inode);
358
359 pr_info("Failed to parse address.\n");
360
361 return ret;
362}
363
364static void cleanup_all_probes(void)
365{
366 struct trace_uprobe *tu;
367
368 mutex_lock(&uprobe_lock);
369 while (!list_empty(&uprobe_list)) {
370 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
371 unregister_trace_uprobe(tu);
372 }
373 mutex_unlock(&uprobe_lock);
374}
375
376/* Probes listing interfaces */
377static void *probes_seq_start(struct seq_file *m, loff_t *pos)
378{
379 mutex_lock(&uprobe_lock);
380 return seq_list_start(&uprobe_list, *pos);
381}
382
383static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
384{
385 return seq_list_next(v, &uprobe_list, pos);
386}
387
388static void probes_seq_stop(struct seq_file *m, void *v)
389{
390 mutex_unlock(&uprobe_lock);
391}
392
393static int probes_seq_show(struct seq_file *m, void *v)
394{
395 struct trace_uprobe *tu = v;
396 int i;
397
398 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
399 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
400
401 for (i = 0; i < tu->nr_args; i++)
402 seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm);
403
404 seq_printf(m, "\n");
405 return 0;
406}
407
408static const struct seq_operations probes_seq_op = {
409 .start = probes_seq_start,
410 .next = probes_seq_next,
411 .stop = probes_seq_stop,
412 .show = probes_seq_show
413};
414
415static int probes_open(struct inode *inode, struct file *file)
416{
417 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
418 cleanup_all_probes();
419
420 return seq_open(file, &probes_seq_op);
421}
422
423static ssize_t probes_write(struct file *file, const char __user *buffer,
424 size_t count, loff_t *ppos)
425{
426 return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
427}
428
429static const struct file_operations uprobe_events_ops = {
430 .owner = THIS_MODULE,
431 .open = probes_open,
432 .read = seq_read,
433 .llseek = seq_lseek,
434 .release = seq_release,
435 .write = probes_write,
436};
437
438/* Probes profiling interfaces */
439static int probes_profile_seq_show(struct seq_file *m, void *v)
440{
441 struct trace_uprobe *tu = v;
442
443 seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit);
444 return 0;
445}
446
447static const struct seq_operations profile_seq_op = {
448 .start = probes_seq_start,
449 .next = probes_seq_next,
450 .stop = probes_seq_stop,
451 .show = probes_profile_seq_show
452};
453
454static int profile_open(struct inode *inode, struct file *file)
455{
456 return seq_open(file, &profile_seq_op);
457}
458
459static const struct file_operations uprobe_profile_ops = {
460 .owner = THIS_MODULE,
461 .open = profile_open,
462 .read = seq_read,
463 .llseek = seq_lseek,
464 .release = seq_release,
465};
466
467/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{
470 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event;
472 struct ring_buffer *buffer;
473 u8 *data;
474 int size, i, pc;
475 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call;
477
478 tu->nhit++;
479
480 local_save_flags(irq_flags);
481 pc = preempt_count();
482
483 size = sizeof(*entry) + tu->size;
484
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc);
487 if (!event)
488 return;
489
490 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
492 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495
496 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
498}
499
500/* Event entry printers */
501static enum print_line_t
502print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
503{
504 struct uprobe_trace_entry_head *field;
505 struct trace_seq *s = &iter->seq;
506 struct trace_uprobe *tu;
507 u8 *data;
508 int i;
509
510 field = (struct uprobe_trace_entry_head *)iter->ent;
511 tu = container_of(event, struct trace_uprobe, call.event);
512
513 if (!trace_seq_printf(s, "%s: (", tu->call.name))
514 goto partial;
515
516 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
517 goto partial;
518
519 if (!trace_seq_puts(s, ")"))
520 goto partial;
521
522 data = (u8 *)&field[1];
523 for (i = 0; i < tu->nr_args; i++) {
524 if (!tu->args[i].type->print(s, tu->args[i].name,
525 data + tu->args[i].offset, field))
526 goto partial;
527 }
528
529 if (trace_seq_puts(s, "\n"))
530 return TRACE_TYPE_HANDLED;
531
532partial:
533 return TRACE_TYPE_PARTIAL_LINE;
534}
535
536static int probe_event_enable(struct trace_uprobe *tu, int flag)
537{
538 struct uprobe_trace_consumer *utc;
539 int ret = 0;
540
541 if (!tu->inode || tu->consumer)
542 return -EINTR;
543
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
545 if (!utc)
546 return -EINTR;
547
548 utc->cons.handler = uprobe_dispatcher;
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555
556 tu->flags |= flag;
557 utc->tu = tu;
558 tu->consumer = utc;
559
560 return 0;
561}
562
563static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{
565 if (!tu->inode || !tu->consumer)
566 return;
567
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
569 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572}
573
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
575{
576 int ret, i;
577 struct uprobe_trace_entry_head field;
578 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
579
580 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
581 /* Set argument names as fields */
582 for (i = 0; i < tu->nr_args; i++) {
583 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
584 tu->args[i].name,
585 sizeof(field) + tu->args[i].offset,
586 tu->args[i].type->size,
587 tu->args[i].type->is_signed,
588 FILTER_OTHER);
589
590 if (ret)
591 return ret;
592 }
593 return 0;
594}
595
596#define LEN_OR_ZERO (len ? len - pos : 0)
597static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
598{
599 const char *fmt, *arg;
600 int i;
601 int pos = 0;
602
603 fmt = "(%lx)";
604 arg = "REC->" FIELD_STRING_IP;
605
606 /* When len=0, we just calculate the needed length */
607
608 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
609
610 for (i = 0; i < tu->nr_args; i++) {
611 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
612 tu->args[i].name, tu->args[i].type->fmt);
613 }
614
615 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
616
617 for (i = 0; i < tu->nr_args; i++) {
618 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
619 tu->args[i].name);
620 }
621
622 return pos; /* return the length of print_fmt */
623}
624#undef LEN_OR_ZERO
625
626static int set_print_fmt(struct trace_uprobe *tu)
627{
628 char *print_fmt;
629 int len;
630
631 /* First: called with 0 length to calculate the needed length */
632 len = __set_print_fmt(tu, NULL, 0);
633 print_fmt = kmalloc(len + 1, GFP_KERNEL);
634 if (!print_fmt)
635 return -ENOMEM;
636
637 /* Second: actually write the @print_fmt */
638 __set_print_fmt(tu, print_fmt, len + 1);
639 tu->call.print_fmt = print_fmt;
640
641 return 0;
642}
643
644#ifdef CONFIG_PERF_EVENTS
645/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{
648 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry;
650 struct hlist_head *head;
651 u8 *data;
652 int size, __size, i;
653 int rctx;
654
655 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return;
660
661 preempt_disable();
662
663 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
664 if (!entry)
665 goto out;
666
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
668 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671
672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
674
675 out:
676 preempt_enable();
677}
678#endif /* CONFIG_PERF_EVENTS */
679
680static
681int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
682{
683 struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
684
685 switch (type) {
686 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE);
688
689 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE);
691 return 0;
692
693#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE);
696
697 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0;
700#endif
701 default:
702 return 0;
703 }
704 return 0;
705}
706
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu;
711
712 utc = container_of(con, struct uprobe_trace_consumer, cons);
713 tu = utc->tu;
714 if (!tu || tu->consumer != utc)
715 return 0;
716
717 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs);
719
720#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs);
723#endif
724 return 0;
725}
726
727static struct trace_event_functions uprobe_funcs = {
728 .trace = print_uprobe_event
729};
730
731static int register_uprobe_event(struct trace_uprobe *tu)
732{
733 struct ftrace_event_call *call = &tu->call;
734 int ret;
735
736 /* Initialize ftrace_event_call */
737 INIT_LIST_HEAD(&call->class->fields);
738 call->event.funcs = &uprobe_funcs;
739 call->class->define_fields = uprobe_event_define_fields;
740
741 if (set_print_fmt(tu) < 0)
742 return -ENOMEM;
743
744 ret = register_ftrace_event(&call->event);
745 if (!ret) {
746 kfree(call->print_fmt);
747 return -ENODEV;
748 }
749 call->flags = 0;
750 call->class->reg = trace_uprobe_register;
751 call->data = tu;
752 ret = trace_add_event_call(call);
753
754 if (ret) {
755 pr_info("Failed to register uprobe event: %s\n", call->name);
756 kfree(call->print_fmt);
757 unregister_ftrace_event(&call->event);
758 }
759
760 return ret;
761}
762
763static void unregister_uprobe_event(struct trace_uprobe *tu)
764{
765 /* tu->event is unregistered in trace_remove_event_call() */
766 trace_remove_event_call(&tu->call);
767 kfree(tu->call.print_fmt);
768 tu->call.print_fmt = NULL;
769}
770
771/* Make a trace interface for controlling probe points */
772static __init int init_uprobe_trace(void)
773{
774 struct dentry *d_tracer;
775
776 d_tracer = tracing_init_dentry();
777 if (!d_tracer)
778 return 0;
779
780 trace_create_file("uprobe_events", 0644, d_tracer,
781 NULL, &uprobe_events_ops);
782 /* Profile interface */
783 trace_create_file("uprobe_profile", 0444, d_tracer,
784 NULL, &uprobe_profile_ops);
785 return 0;
786}
787
788fs_initcall(init_uprobe_trace);
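The deleted file above backed the debugfs "uprobe_events" control file, whose accepted syntax is documented near its top ("p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]" to add, "-:[GRP/]EVENT" to remove); in practice create_trace_uprobe() parses the text after the colon as a numeric offset. As a rough user-space sketch of driving that interface, not taken from this patch: the binary path, offset, group and event names below are invented, and the debugfs mount point may differ on a given system.

/* Hypothetical sketch of writing to the control file removed above. */
#include <stdio.h>

int main(void)
{
        const char *ctl = "/sys/kernel/debug/tracing/uprobe_events";
        FILE *f;

        /* Append rather than truncate: probes_open() above clears all
         * existing probes when the file is opened with O_TRUNC. */
        f = fopen(ctl, "a");
        if (!f) {
                perror("uprobe_events");
                return 1;
        }
        fprintf(f, "p:myprobes/readline /bin/bash:0x8f2c0\n"); /* add a probe */
        fclose(f);

        f = fopen(ctl, "a");
        if (!f) {
                perror("uprobe_events");
                return 1;
        }
        fprintf(f, "-:myprobes/readline\n"); /* remove it again */
        fclose(f);
        return 0;
}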
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabf..b219f1449c5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/static_key.h> 28#include <linux/jump_label.h>
29 29
30extern struct tracepoint * const __start___tracepoints_ptrs[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint * const __stop___tracepoints_ptrs[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -34,16 +34,11 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * Tracepoints mutex protects the builtin and module tracepoints and the hash 37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
38 * table, as well as the local module list. 38 * builtin and module tracepoints and the hash table.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
47/* 42/*
48 * Tracepoint hash table, containing the active tracepoints. 43 * Tracepoint hash table, containing the active tracepoints.
49 * Protected by tracepoints_mutex. 44 * Protected by tracepoints_mutex.
@@ -256,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
256{ 251{
257 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
258 253
259 if (elem->regfunc && !static_key_enabled(&elem->key) && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
260 elem->regfunc(); 255 elem->regfunc();
261 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
262 elem->unregfunc(); 257 elem->unregfunc();
263 258
264 /* 259 /*
@@ -269,10 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
269 * is used. 264 * is used.
270 */ 265 */
271 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
272 if (active && !static_key_enabled(&elem->key)) 267 if (active && !jump_label_enabled(&elem->key))
273 static_key_slow_inc(&elem->key); 268 jump_label_inc(&elem->key);
274 else if (!active && static_key_enabled(&elem->key)) 269 else if (!active && jump_label_enabled(&elem->key))
275 static_key_slow_dec(&elem->key); 270 jump_label_dec(&elem->key);
276} 271}
277 272
278/* 273/*
@@ -283,11 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
283 */ 278 */
284static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
285{ 280{
286 if (elem->unregfunc && static_key_enabled(&elem->key)) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
287 elem->unregfunc(); 282 elem->unregfunc();
288 283
289 if (static_key_enabled(&elem->key)) 284 if (jump_label_enabled(&elem->key))
290 static_key_slow_dec(&elem->key); 285 jump_label_dec(&elem->key);
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
@@ -297,10 +292,9 @@ static void disable_tracepoint(struct tracepoint *elem)
297 * @end: end of the range 292 * @end: end of the range
298 * 293 *
299 * Updates the probe callback corresponding to a range of tracepoints. 294 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
301 */ 295 */
302static void tracepoint_update_probe_range(struct tracepoint * const *begin, 296void tracepoint_update_probe_range(struct tracepoint * const *begin,
303 struct tracepoint * const *end) 297 struct tracepoint * const *end)
304{ 298{
305 struct tracepoint * const *iter; 299 struct tracepoint * const *iter;
306 struct tracepoint_entry *mark_entry; 300 struct tracepoint_entry *mark_entry;
@@ -308,6 +302,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin,
308 if (!begin) 302 if (!begin)
309 return; 303 return;
310 304
305 mutex_lock(&tracepoints_mutex);
311 for (iter = begin; iter < end; iter++) { 306 for (iter = begin; iter < end; iter++) {
312 mark_entry = get_tracepoint((*iter)->name); 307 mark_entry = get_tracepoint((*iter)->name);
313 if (mark_entry) { 308 if (mark_entry) {
@@ -317,27 +312,11 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin,
317 disable_tracepoint(*iter); 312 disable_tracepoint(*iter);
318 } 313 }
319 } 314 }
315 mutex_unlock(&tracepoints_mutex);
320} 316}
321 317
322#ifdef CONFIG_MODULES
323void module_update_tracepoints(void)
324{
325 struct tp_module *tp_mod;
326
327 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
328 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
329 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
330}
331#else /* CONFIG_MODULES */
332void module_update_tracepoints(void)
333{
334}
335#endif /* CONFIG_MODULES */
336
337
338/* 318/*
339 * Update probes, removing the faulty probes. 319 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
341 */ 320 */
342static void tracepoint_update_probes(void) 321static void tracepoint_update_probes(void)
343{ 322{
@@ -380,12 +359,11 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
380 359
381 mutex_lock(&tracepoints_mutex); 360 mutex_lock(&tracepoints_mutex);
382 old = tracepoint_add_probe(name, probe, data); 361 old = tracepoint_add_probe(name, probe, data);
383 if (IS_ERR(old)) { 362 mutex_unlock(&tracepoints_mutex);
384 mutex_unlock(&tracepoints_mutex); 363 if (IS_ERR(old))
385 return PTR_ERR(old); 364 return PTR_ERR(old);
386 } 365
387 tracepoint_update_probes(); /* may update entry */ 366 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
389 release_probes(old); 367 release_probes(old);
390 return 0; 368 return 0;
391} 369}
@@ -424,12 +402,11 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
424 402
425 mutex_lock(&tracepoints_mutex); 403 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_remove_probe(name, probe, data); 404 old = tracepoint_remove_probe(name, probe, data);
427 if (IS_ERR(old)) { 405 mutex_unlock(&tracepoints_mutex);
428 mutex_unlock(&tracepoints_mutex); 406 if (IS_ERR(old))
429 return PTR_ERR(old); 407 return PTR_ERR(old);
430 } 408
431 tracepoint_update_probes(); /* may update entry */ 409 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
433 release_probes(old); 410 release_probes(old);
434 return 0; 411 return 0;
435} 412}
@@ -512,8 +489,9 @@ void tracepoint_probe_update_all(void)
512 if (!list_empty(&old_probes)) 489 if (!list_empty(&old_probes))
513 list_replace_init(&old_probes, &release_probes); 490 list_replace_init(&old_probes, &release_probes);
514 need_update = 0; 491 need_update = 0;
515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex); 492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes();
517 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 495 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
518 list_del(&pos->u.list); 496 list_del(&pos->u.list);
519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -531,7 +509,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
531 * Will return the first tracepoint in the range if the input tracepoint is 509 * Will return the first tracepoint in the range if the input tracepoint is
532 * NULL. 510 * NULL.
533 */ 511 */
534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
535 struct tracepoint * const *begin, struct tracepoint * const *end) 513 struct tracepoint * const *begin, struct tracepoint * const *end)
536{ 514{
537 if (!*tracepoint && begin != end) { 515 if (!*tracepoint && begin != end) {
@@ -542,12 +520,11 @@ static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
542 return 1; 520 return 1;
543 return 0; 521 return 0;
544} 522}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
545 524
546#ifdef CONFIG_MODULES
547static void tracepoint_get_iter(struct tracepoint_iter *iter) 525static void tracepoint_get_iter(struct tracepoint_iter *iter)
548{ 526{
549 int found = 0; 527 int found = 0;
550 struct tp_module *iter_mod;
551 528
552 /* Core kernel tracepoints */ 529 /* Core kernel tracepoints */
553 if (!iter->module) { 530 if (!iter->module) {
@@ -557,43 +534,12 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
557 if (found) 534 if (found)
558 goto end; 535 goto end;
559 } 536 }
560 /* Tracepoints in modules */ 537 /* tracepoints in modules. */
561 mutex_lock(&tracepoints_mutex); 538 found = module_get_iter_tracepoints(iter);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
580end: 539end:
581 if (!found) 540 if (!found)
582 tracepoint_iter_reset(iter); 541 tracepoint_iter_reset(iter);
583} 542}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
597 543
598void tracepoint_iter_start(struct tracepoint_iter *iter) 544void tracepoint_iter_start(struct tracepoint_iter *iter)
599{ 545{
@@ -620,99 +566,26 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
620 566
621void tracepoint_iter_reset(struct tracepoint_iter *iter) 567void tracepoint_iter_reset(struct tracepoint_iter *iter)
622{ 568{
623#ifdef CONFIG_MODULES
624 iter->module = NULL; 569 iter->module = NULL;
625#endif /* CONFIG_MODULES */
626 iter->tracepoint = NULL; 570 iter->tracepoint = NULL;
627} 571}
628EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 572EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
629 573
630#ifdef CONFIG_MODULES 574#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
637 * We skip modules that taint the kernel, especially those with different
638 * module headers (for forced load), to make sure we don't cause a crash.
639 * Staging and out-of-tree GPL modules are fine.
640 */
641 if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
642 return 0;
643 mutex_lock(&tracepoints_mutex);
644 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
645 if (!tp_mod) {
646 ret = -ENOMEM;
647 goto end;
648 }
649 tp_mod->num_tracepoints = mod->num_tracepoints;
650 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
651
652 /*
653 * tracepoint_module_list is kept sorted by struct module pointer
654 * address for iteration on tracepoints from a seq_file that can release
655 * the mutex between calls.
656 */
657 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
658 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
659 if (iter < tp_mod) {
660 /* We belong to the location right after iter. */
661 list_add(&tp_mod->list, &iter->list);
662 goto module_added;
663 }
664 }
665 /* We belong to the beginning of the list */
666 list_add(&tp_mod->list, &tracepoint_module_list);
667module_added:
668 tracepoint_update_probe_range(mod->tracepoints_ptrs,
669 mod->tracepoints_ptrs + mod->num_tracepoints);
670end:
671 mutex_unlock(&tracepoints_mutex);
672 return ret;
673}
674
675static int tracepoint_module_going(struct module *mod)
676{
677 struct tp_module *pos;
678
679 mutex_lock(&tracepoints_mutex);
680 tracepoint_update_probe_range(mod->tracepoints_ptrs,
681 mod->tracepoints_ptrs + mod->num_tracepoints);
682 list_for_each_entry(pos, &tracepoint_module_list, list) {
683 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
684 list_del(&pos->list);
685 kfree(pos);
686 break;
687 }
688 }
689 /*
690 * In the case of modules that were tainted at "coming", we'll simply
691 * walk through the list without finding it. We cannot use the "tainted"
692 * flag on "going", in case a module taints the kernel only after being
693 * loaded.
694 */
695 mutex_unlock(&tracepoints_mutex);
696 return 0;
697}
698 575
699int tracepoint_module_notify(struct notifier_block *self, 576int tracepoint_module_notify(struct notifier_block *self,
700 unsigned long val, void *data) 577 unsigned long val, void *data)
701{ 578{
702 struct module *mod = data; 579 struct module *mod = data;
703 int ret = 0;
704 580
705 switch (val) { 581 switch (val) {
706 case MODULE_STATE_COMING: 582 case MODULE_STATE_COMING:
707 ret = tracepoint_module_coming(mod);
708 break;
709 case MODULE_STATE_LIVE:
710 break;
711 case MODULE_STATE_GOING: 583 case MODULE_STATE_GOING:
712 ret = tracepoint_module_going(mod); 584 tracepoint_update_probe_range(mod->tracepoints_ptrs,
585 mod->tracepoints_ptrs + mod->num_tracepoints);
713 break; 586 break;
714 } 587 }
715 return ret; 588 return 0;
716} 589}
717 590
718struct notifier_block tracepoint_module_nb = { 591struct notifier_block tracepoint_module_nb = {
@@ -725,6 +598,7 @@ static int init_tracepoints(void)
725 return register_module_notifier(&tracepoint_module_nb); 598 return register_module_notifier(&tracepoint_module_nb);
726} 599}
727__initcall(init_tracepoints); 600__initcall(init_tracepoints);
601
728#endif /* CONFIG_MODULES */ 602#endif /* CONFIG_MODULES */
729 603
730#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
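The tracepoint.c hunks above are largely a move from the static_key API (left column) back to the older jump_label API (right column), plus removal of the per-module tracepoint list. A minimal sketch of the enable/disable pattern written against the newer API on the left-hand side; the key and function names are invented and only mirror the calls visible in set_tracepoint() and disable_tracepoint().

/* Sketch only: an enable/disable pair using the same static_key calls
 * as the left-hand column above. The old API names being restored by
 * this patch are noted in the comments. */
#include <linux/static_key.h>

static struct static_key example_key = STATIC_KEY_INIT_FALSE;

static void example_enable(void)
{
        if (!static_key_enabled(&example_key))     /* old API: jump_label_enabled() */
                static_key_slow_inc(&example_key); /* old API: jump_label_inc() */
}

static void example_disable(void)
{
        if (static_key_enabled(&example_key))
                static_key_slow_dec(&example_key); /* old API: jump_label_dec() */
}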
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b4469..5bbfac85866 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -26,9 +26,7 @@
26/* 26/*
27 * fill in basic accounting fields 27 * fill in basic accounting fields
28 */ 28 */
29void bacct_add_tsk(struct user_namespace *user_ns, 29void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
30 struct pid_namespace *pid_ns,
31 struct taskstats *stats, struct task_struct *tsk)
32{ 30{
33 const struct cred *tcred; 31 const struct cred *tcred;
34 struct timespec uptime, ts; 32 struct timespec uptime, ts;
@@ -57,13 +55,13 @@ void bacct_add_tsk(struct user_namespace *user_ns,
57 stats->ac_flag |= AXSIG; 55 stats->ac_flag |= AXSIG;
58 stats->ac_nice = task_nice(tsk); 56 stats->ac_nice = task_nice(tsk);
59 stats->ac_sched = tsk->policy; 57 stats->ac_sched = tsk->policy;
60 stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); 58 stats->ac_pid = tsk->pid;
61 rcu_read_lock(); 59 rcu_read_lock();
62 tcred = __task_cred(tsk); 60 tcred = __task_cred(tsk);
63 stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); 61 stats->ac_uid = tcred->uid;
64 stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); 62 stats->ac_gid = tcred->gid;
65 stats->ac_ppid = pid_alive(tsk) ? 63 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 64 rcu_dereference(tsk->real_parent)->tgid : 0;
67 rcu_read_unlock(); 65 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 66 stats->ac_utime = cputime_to_usecs(tsk->utime);
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 67 stats->ac_stime = cputime_to_usecs(tsk->stime);
@@ -129,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
129 127
130 local_irq_save(flags); 128 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
132 dtime = time - tsk->acct_timexpd; 130 dtime = cputime_sub(time, tsk->acct_timexpd);
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 132 delta = value.tv_sec;
135 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
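The acct_update_integrals() hunk swaps a plain subtraction for cputime_sub(). On architectures using the generic jiffies-based cputime_t of this era the helper is believed to expand to ordinary integer subtraction, so both forms compute the same delta; a stand-alone illustration, with the typedef and values invented for the example:

#include <stdio.h>

typedef unsigned long cputime_t;                /* generic (jiffies-based) flavour */
#define cputime_sub(__a, __b)   ((__a) - (__b)) /* assumed asm-generic definition of this vintage */

int main(void)
{
        cputime_t time = 1500, acct_timexpd = 300;

        printf("plain subtraction: %lu\n", time - acct_timexpd);
        printf("cputime_sub():     %lu\n", cputime_sub(time, acct_timexpd));
        return 0;
}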
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb1022..51c6e89e861 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -81,19 +81,14 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
81 return ret; 81 return ret;
82} 82}
83 83
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) 84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
85{ 85{
86 const struct cred *cred = current_cred(); 86 const struct cred *cred = current_cred();
87 int retval; 87 int retval;
88 old_uid_t ruid, euid, suid;
89 88
90 ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); 89 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
91 euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); 90 !(retval = put_user(high2lowuid(cred->euid), euid)))
92 suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); 91 retval = put_user(high2lowuid(cred->suid), suid);
93
94 if (!(retval = put_user(ruid, ruidp)) &&
95 !(retval = put_user(euid, euidp)))
96 retval = put_user(suid, suidp);
97 92
98 return retval; 93 return retval;
99} 94}
@@ -108,19 +103,14 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
108} 103}
109 104
110 105
111SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) 106SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
112{ 107{
113 const struct cred *cred = current_cred(); 108 const struct cred *cred = current_cred();
114 int retval; 109 int retval;
115 old_gid_t rgid, egid, sgid;
116
117 rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
118 egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
119 sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
120 110
121 if (!(retval = put_user(rgid, rgidp)) && 111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
122 !(retval = put_user(egid, egidp))) 112 !(retval = put_user(high2lowgid(cred->egid), egid)))
123 retval = put_user(sgid, sgidp); 113 retval = put_user(high2lowgid(cred->sgid), sgid);
124 114
125 return retval; 115 return retval;
126} 116}
@@ -144,14 +134,11 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
144static int groups16_to_user(old_gid_t __user *grouplist, 134static int groups16_to_user(old_gid_t __user *grouplist,
145 struct group_info *group_info) 135 struct group_info *group_info)
146{ 136{
147 struct user_namespace *user_ns = current_user_ns();
148 int i; 137 int i;
149 old_gid_t group; 138 old_gid_t group;
150 kgid_t kgid;
151 139
152 for (i = 0; i < group_info->ngroups; i++) { 140 for (i = 0; i < group_info->ngroups; i++) {
153 kgid = GROUP_AT(group_info, i); 141 group = high2lowgid(GROUP_AT(group_info, i));
154 group = high2lowgid(from_kgid_munged(user_ns, kgid));
155 if (put_user(group, grouplist+i)) 142 if (put_user(group, grouplist+i))
156 return -EFAULT; 143 return -EFAULT;
157 } 144 }
@@ -162,20 +149,13 @@ static int groups16_to_user(old_gid_t __user *grouplist,
162static int groups16_from_user(struct group_info *group_info, 149static int groups16_from_user(struct group_info *group_info,
163 old_gid_t __user *grouplist) 150 old_gid_t __user *grouplist)
164{ 151{
165 struct user_namespace *user_ns = current_user_ns();
166 int i; 152 int i;
167 old_gid_t group; 153 old_gid_t group;
168 kgid_t kgid;
169 154
170 for (i = 0; i < group_info->ngroups; i++) { 155 for (i = 0; i < group_info->ngroups; i++) {
171 if (get_user(group, grouplist+i)) 156 if (get_user(group, grouplist+i))
172 return -EFAULT; 157 return -EFAULT;
173 158 GROUP_AT(group_info, i) = low2highgid(group);
174 kgid = make_kgid(user_ns, low2highgid(group));
175 if (!gid_valid(kgid))
176 return -EINVAL;
177
178 GROUP_AT(group_info, i) = kgid;
179 } 159 }
180 160
181 return 0; 161 return 0;
@@ -231,20 +211,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
231 211
232SYSCALL_DEFINE0(getuid16) 212SYSCALL_DEFINE0(getuid16)
233{ 213{
234 return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); 214 return high2lowuid(current_uid());
235} 215}
236 216
237SYSCALL_DEFINE0(geteuid16) 217SYSCALL_DEFINE0(geteuid16)
238{ 218{
239 return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); 219 return high2lowuid(current_euid());
240} 220}
241 221
242SYSCALL_DEFINE0(getgid16) 222SYSCALL_DEFINE0(getgid16)
243{ 223{
244 return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); 224 return high2lowgid(current_gid());
245} 225}
246 226
247SYSCALL_DEFINE0(getegid16) 227SYSCALL_DEFINE0(getegid16)
248{ 228{
249 return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); 229 return high2lowgid(current_egid());
250} 230}
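The uid16.c hunks drop the namespace-aware kuid_t/kgid_t conversions (left column) in favour of raw uid_t/gid_t arithmetic (right column). For reference, the conversion pattern on the newer side of the diff looks roughly like the sketch below; the example_* helpers are invented, while from_kgid_munged(), make_kgid(), gid_valid(), high2lowgid() and low2highgid() are the same calls visible in the hunks.

/* Sketch: legacy 16-bit gid <-> kernel-internal kgid_t, as on the
 * left-hand side of the hunks above. Not taken from this patch. */
#include <linux/errno.h>
#include <linux/highuid.h>
#include <linux/uidgid.h>

static old_gid_t example_to_old_gid(struct user_namespace *user_ns, kgid_t kgid)
{
        /* from_kgid_munged() never fails; unmapped ids become overflowgid. */
        return high2lowgid(from_kgid_munged(user_ns, kgid));
}

static int example_from_old_gid(struct user_namespace *user_ns,
                                old_gid_t group, kgid_t *out)
{
        kgid_t kgid = make_kgid(user_ns, low2highgid(group));

        if (!gid_valid(kgid))
                return -EINVAL;
        *out = kgid;
        return 0;
}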
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf..1ff27a28bb7 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
4 4
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/export.h> 7#include <linux/module.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1f..92cb706c7fc 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
2#include <linux/user-return-notifier.h> 2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 3#include <linux/percpu.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/export.h> 5#include <linux/module.h>
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
diff --git a/kernel/user.c b/kernel/user.c
index 33acb5e53a5..9e03e9c1df8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,45 +14,18 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
20 19
21/* 20/*
22 * userns count is 1 for root user, 1 for init_uts_ns, 21 * userns count is 1 for root user, 1 for init_uts_ns,
23 * and 1 for... ? 22 * and 1 for... ?
24 */ 23 */
25struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
26 .uid_map = {
27 .nr_extents = 1,
28 .extent[0] = {
29 .first = 0,
30 .lower_first = 0,
31 .count = 4294967295U,
32 },
33 },
34 .gid_map = {
35 .nr_extents = 1,
36 .extent[0] = {
37 .first = 0,
38 .lower_first = 0,
39 .count = 4294967295U,
40 },
41 },
42 .projid_map = {
43 .nr_extents = 1,
44 .extent[0] = {
45 .first = 0,
46 .lower_first = 0,
47 .count = 4294967295U,
48 },
49 },
50 .kref = { 25 .kref = {
51 .refcount = ATOMIC_INIT(3), 26 .refcount = ATOMIC_INIT(3),
52 }, 27 },
53 .owner = GLOBAL_ROOT_UID, 28 .creator = &root_user,
54 .group = GLOBAL_ROOT_GID,
55 .proc_inum = PROC_USER_INIT_INO,
56}; 29};
57EXPORT_SYMBOL_GPL(init_user_ns); 30EXPORT_SYMBOL_GPL(init_user_ns);
58 31
@@ -61,14 +34,11 @@ EXPORT_SYMBOL_GPL(init_user_ns);
61 * when changing user ID's (ie setuid() and friends). 34 * when changing user ID's (ie setuid() and friends).
62 */ 35 */
63 36
64#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
65#define UIDHASH_SZ (1 << UIDHASH_BITS)
66#define UIDHASH_MASK (UIDHASH_SZ - 1) 37#define UIDHASH_MASK (UIDHASH_SZ - 1)
67#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 38#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
68#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) 39#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
69 40
70static struct kmem_cache *uid_cachep; 41static struct kmem_cache *uid_cachep;
71struct hlist_head uidhash_table[UIDHASH_SZ];
72 42
73/* 43/*
74 * The uidhash_lock is mostly taken from process context, but it is 44 * The uidhash_lock is mostly taken from process context, but it is
@@ -81,14 +51,14 @@ struct hlist_head uidhash_table[UIDHASH_SZ];
81 */ 51 */
82static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
83 53
84/* root_user.__count is 1, for init task cred */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
85struct user_struct root_user = { 55struct user_struct root_user = {
86 .__count = ATOMIC_INIT(1), 56 .__count = ATOMIC_INIT(2),
87 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
88 .files = ATOMIC_INIT(0), 58 .files = ATOMIC_INIT(0),
89 .sigpending = ATOMIC_INIT(0), 59 .sigpending = ATOMIC_INIT(0),
90 .locked_shm = 0, 60 .locked_shm = 0,
91 .uid = GLOBAL_ROOT_UID, 61 .user_ns = &init_user_ns,
92}; 62};
93 63
94/* 64/*
@@ -102,15 +72,16 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
102static void uid_hash_remove(struct user_struct *up) 72static void uid_hash_remove(struct user_struct *up)
103{ 73{
104 hlist_del_init(&up->uidhash_node); 74 hlist_del_init(&up->uidhash_node);
75 put_user_ns(up->user_ns);
105} 76}
106 77
107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{ 79{
109 struct user_struct *user; 80 struct user_struct *user;
110 struct hlist_node *h; 81 struct hlist_node *h;
111 82
112 hlist_for_each_entry(user, h, hashent, uidhash_node) { 83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (uid_eq(user->uid, uid)) { 84 if (user->uid == uid) {
114 atomic_inc(&user->__count); 85 atomic_inc(&user->__count);
115 return user; 86 return user;
116 } 87 }
@@ -139,13 +110,14 @@ static void free_user(struct user_struct *up, unsigned long flags)
139 * 110 *
140 * If the user_struct could not be found, return NULL. 111 * If the user_struct could not be found, return NULL.
141 */ 112 */
142struct user_struct *find_user(kuid_t uid) 113struct user_struct *find_user(uid_t uid)
143{ 114{
144 struct user_struct *ret; 115 struct user_struct *ret;
145 unsigned long flags; 116 unsigned long flags;
117 struct user_namespace *ns = current_user_ns();
146 118
147 spin_lock_irqsave(&uidhash_lock, flags); 119 spin_lock_irqsave(&uidhash_lock, flags);
148 ret = uid_hash_find(uid, uidhashentry(uid)); 120 ret = uid_hash_find(uid, uidhashentry(ns, uid));
149 spin_unlock_irqrestore(&uidhash_lock, flags); 121 spin_unlock_irqrestore(&uidhash_lock, flags);
150 return ret; 122 return ret;
151} 123}
@@ -164,9 +136,9 @@ void free_uid(struct user_struct *up)
164 local_irq_restore(flags); 136 local_irq_restore(flags);
165} 137}
166 138
167struct user_struct *alloc_uid(kuid_t uid) 139struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
168{ 140{
169 struct hlist_head *hashent = uidhashentry(uid); 141 struct hlist_head *hashent = uidhashentry(ns, uid);
170 struct user_struct *up, *new; 142 struct user_struct *up, *new;
171 143
172 spin_lock_irq(&uidhash_lock); 144 spin_lock_irq(&uidhash_lock);
@@ -181,6 +153,8 @@ struct user_struct *alloc_uid(kuid_t uid)
181 new->uid = uid; 153 new->uid = uid;
182 atomic_set(&new->__count, 1); 154 atomic_set(&new->__count, 1);
183 155
156 new->user_ns = get_user_ns(ns);
157
184 /* 158 /*
185 * Before adding this, check whether we raced 159 * Before adding this, check whether we raced
186 * on adding the same user already.. 160 * on adding the same user already..
@@ -188,6 +162,7 @@ struct user_struct *alloc_uid(kuid_t uid)
188 spin_lock_irq(&uidhash_lock); 162 spin_lock_irq(&uidhash_lock);
189 up = uid_hash_find(uid, hashent); 163 up = uid_hash_find(uid, hashent);
190 if (up) { 164 if (up) {
165 put_user_ns(ns);
191 key_put(new->uid_keyring); 166 key_put(new->uid_keyring);
192 key_put(new->session_keyring); 167 key_put(new->session_keyring);
193 kmem_cache_free(uid_cachep, new); 168 kmem_cache_free(uid_cachep, new);
@@ -212,11 +187,11 @@ static int __init uid_cache_init(void)
212 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 187 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
213 188
214 for(n = 0; n < UIDHASH_SZ; ++n) 189 for(n = 0; n < UIDHASH_SZ; ++n)
215 INIT_HLIST_HEAD(uidhash_table + n); 190 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
216 191
217 /* Insert the root user immediately (init already runs as root) */ 192 /* Insert the root user immediately (init already runs as root) */
218 spin_lock_irq(&uidhash_lock); 193 spin_lock_irq(&uidhash_lock);
219 uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); 194 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
220 spin_unlock_irq(&uidhash_lock); 195 spin_unlock_irq(&uidhash_lock);
221 196
222 return 0; 197 return 0;
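The user.c hunks move the uid hash back from a single global table keyed by kuid_t to the per-user-namespace table addressed by uidhashentry(ns, uid). The bucket arithmetic is identical on both sides of the diff; a stand-alone, user-space check of it could look like the following, assuming UIDHASH_BITS = 7 (the !CONFIG_BASE_SMALL value shown in the removed lines).

#include <stdio.h>

/* The three derived macros are copied from the hunk above;
 * UIDHASH_BITS = 7 is an assumption. Demonstration only. */
#define UIDHASH_BITS    7
#define UIDHASH_SZ      (1 << UIDHASH_BITS)
#define UIDHASH_MASK    (UIDHASH_SZ - 1)
#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)

int main(void)
{
        unsigned int uid;

        for (uid = 0; uid < 3; uid++)
                printf("uid %u -> bucket %u\n", uid, __uidhashfn(uid));
        printf("uid 1000  -> bucket %u\n", __uidhashfn(1000u));
        printf("uid 65534 -> bucket %u\n", __uidhashfn(65534u));
        return 0;
}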
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2b042c42fbc..9da289c34f2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,46 +5,15 @@
5 * License. 5 * License.
6 */ 6 */
7 7
8#include <linux/export.h> 8#include <linux/module.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
13#include <linux/highuid.h> 12#include <linux/highuid.h>
14#include <linux/cred.h> 13#include <linux/cred.h>
15#include <linux/securebits.h>
16#include <linux/keyctl.h>
17#include <linux/key-type.h>
18#include <keys/user-type.h>
19#include <linux/seq_file.h>
20#include <linux/fs.h>
21#include <linux/uaccess.h>
22#include <linux/ctype.h>
23#include <linux/projid.h>
24 14
25static struct kmem_cache *user_ns_cachep __read_mostly; 15static struct kmem_cache *user_ns_cachep __read_mostly;
26 16
27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
28 struct uid_gid_map *map);
29
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
31{
32 /* Start with the same capabilities as init but useless for doing
33 * anything as the capabilities are bound to the new user namespace.
34 */
35 cred->securebits = SECUREBITS_DEFAULT;
36 cred->cap_inheritable = CAP_EMPTY_SET;
37 cred->cap_permitted = CAP_FULL_SET;
38 cred->cap_effective = CAP_FULL_SET;
39 cred->cap_bset = CAP_FULL_SET;
40#ifdef CONFIG_KEYS
41 key_put(cred->request_key_auth);
42 cred->request_key_auth = NULL;
43#endif
44 /* tgcred will be cleared in our caller because CLONE_THREAD won't be set */
45 cred->user_ns = user_ns;
46}
47
48/* 17/*
49 * Create a new user namespace, deriving the creator from the user in the 18 * Create a new user namespace, deriving the creator from the user in the
50 * passed credentials, and replacing that user with the new root user for the 19 * passed credentials, and replacing that user with the new root user for the
@@ -55,782 +24,111 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
55 */ 24 */
56int create_user_ns(struct cred *new) 25int create_user_ns(struct cred *new)
57{ 26{
58 struct user_namespace *ns, *parent_ns = new->user_ns; 27 struct user_namespace *ns;
59 kuid_t owner = new->euid; 28 struct user_struct *root_user;
60 kgid_t group = new->egid; 29 int n;
61 int ret;
62 30
63 /* The creator needs a mapping in the parent user namespace 31 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
64 * or else we won't be able to reasonably tell userspace who
65 * created a user_namespace.
66 */
67 if (!kuid_has_mapping(parent_ns, owner) ||
68 !kgid_has_mapping(parent_ns, group))
69 return -EPERM;
70
71 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
72 if (!ns) 32 if (!ns)
73 return -ENOMEM; 33 return -ENOMEM;
74 34
75 ret = proc_alloc_inum(&ns->proc_inum);
76 if (ret) {
77 kmem_cache_free(user_ns_cachep, ns);
78 return ret;
79 }
80
81 kref_init(&ns->kref); 35 kref_init(&ns->kref);
82 /* Leave the new->user_ns reference with the new user namespace. */
83 ns->parent = parent_ns;
84 ns->owner = owner;
85 ns->group = group;
86 36
87 set_cred_user_ns(new, ns); 37 for (n = 0; n < UIDHASH_SZ; ++n)
38 INIT_HLIST_HEAD(ns->uidhash_table + n);
88 39
89 return 0; 40 /* Alloc new root user. */
90} 41 root_user = alloc_uid(ns, 0);
91 42 if (!root_user) {
92int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 43 kmem_cache_free(user_ns_cachep, ns);
93{
94 struct cred *cred;
95
96 if (!(unshare_flags & CLONE_NEWUSER))
97 return 0;
98
99 cred = prepare_creds();
100 if (!cred)
101 return -ENOMEM; 44 return -ENOMEM;
102
103 *new_cred = cred;
104 return create_user_ns(cred);
105}
106
107void free_user_ns(struct kref *kref)
108{
109 struct user_namespace *parent, *ns =
110 container_of(kref, struct user_namespace, kref);
111
112 parent = ns->parent;
113 proc_free_inum(ns->proc_inum);
114 kmem_cache_free(user_ns_cachep, ns);
115 put_user_ns(parent);
116}
117EXPORT_SYMBOL(free_user_ns);
118
119static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
120{
121 unsigned idx, extents;
122 u32 first, last, id2;
123
124 id2 = id + count - 1;
125
126 /* Find the matching extent */
127 extents = map->nr_extents;
128 smp_read_barrier_depends();
129 for (idx = 0; idx < extents; idx++) {
130 first = map->extent[idx].first;
131 last = first + map->extent[idx].count - 1;
132 if (id >= first && id <= last &&
133 (id2 >= first && id2 <= last))
134 break;
135 } 45 }
136 /* Map the id or note failure */
137 if (idx < extents)
138 id = (id - first) + map->extent[idx].lower_first;
139 else
140 id = (u32) -1;
141
142 return id;
143}
144
145static u32 map_id_down(struct uid_gid_map *map, u32 id)
146{
147 unsigned idx, extents;
148 u32 first, last;
149
150 /* Find the matching extent */
151 extents = map->nr_extents;
152 smp_read_barrier_depends();
153 for (idx = 0; idx < extents; idx++) {
154 first = map->extent[idx].first;
155 last = first + map->extent[idx].count - 1;
156 if (id >= first && id <= last)
157 break;
158 }
159 /* Map the id or note failure */
160 if (idx < extents)
161 id = (id - first) + map->extent[idx].lower_first;
162 else
163 id = (u32) -1;
164
165 return id;
166}
167
168static u32 map_id_up(struct uid_gid_map *map, u32 id)
169{
170 unsigned idx, extents;
171 u32 first, last;
172
173 /* Find the matching extent */
174 extents = map->nr_extents;
175 smp_read_barrier_depends();
176 for (idx = 0; idx < extents; idx++) {
177 first = map->extent[idx].lower_first;
178 last = first + map->extent[idx].count - 1;
179 if (id >= first && id <= last)
180 break;
181 }
182 /* Map the id or note failure */
183 if (idx < extents)
184 id = (id - first) + map->extent[idx].first;
185 else
186 id = (u32) -1;
187
188 return id;
189}
190
191/**
192 * make_kuid - Map a user-namespace uid pair into a kuid.
193 * @ns: User namespace that the uid is in
194 * @uid: User identifier
195 *
196 * Maps a user-namespace uid pair into a kernel internal kuid,
197 * and returns that kuid.
198 *
199 * When there is no mapping defined for the user-namespace uid
200 * pair INVALID_UID is returned. Callers are expected to test
201 * for and handle INVALID_UID being returned. INVALID_UID
202 * may be tested for using uid_valid().
203 */
204kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
205{
206 /* Map the uid to a global kernel uid */
207 return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
208}
209EXPORT_SYMBOL(make_kuid);
210
211/**
212 * from_kuid - Create a uid from a kuid user-namespace pair.
213 * @targ: The user namespace we want a uid in.
214 * @kuid: The kernel internal uid to start with.
215 *
216 * Map @kuid into the user-namespace specified by @targ and
217 * return the resulting uid.
218 *
219 * There is always a mapping into the initial user_namespace.
220 *
221 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
222 */
223uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
224{
225 /* Map the uid from a global kernel uid */
226 return map_id_up(&targ->uid_map, __kuid_val(kuid));
227}
228EXPORT_SYMBOL(from_kuid);
229
230/**
231 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
232 * @targ: The user namespace we want a uid in.
233 * @kuid: The kernel internal uid to start with.
234 *
235 * Map @kuid into the user-namespace specified by @targ and
236 * return the resulting uid.
237 *
238 * There is always a mapping into the initial user_namespace.
239 *
240 * Unlike from_kuid from_kuid_munged never fails and always
241 * returns a valid uid. This makes from_kuid_munged appropriate
242 * for use in syscalls like stat and getuid where failing the
243 * system call and failing to provide a valid uid are not
244 * options.
245 *
246 * If @kuid has no mapping in @targ overflowuid is returned.
247 */
248uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
249{
250 uid_t uid;
251 uid = from_kuid(targ, kuid);
252
253 if (uid == (uid_t) -1)
254 uid = overflowuid;
255 return uid;
256}
257EXPORT_SYMBOL(from_kuid_munged);
258
259/**
260 * make_kgid - Map a user-namespace gid pair into a kgid.
261 * @ns: User namespace that the gid is in
262 * @gid: Group identifier
263 *
264 * Maps a user-namespace gid pair into a kernel internal kgid,
265 * and returns that kgid.
266 *
267 * When there is no mapping defined for the user-namespace gid
268 * pair INVALID_GID is returned. Callers are expected to test
269 * for and handle INVALID_GID being returned. INVALID_GID may be
270 * tested for using gid_valid().
271 */
272kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
273{
274 /* Map the gid to a global kernel gid */
275 return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
276}
277EXPORT_SYMBOL(make_kgid);
278
279/**
280 * from_kgid - Create a gid from a kgid user-namespace pair.
281 * @targ: The user namespace we want a gid in.
282 * @kgid: The kernel internal gid to start with.
283 *
284 * Map @kgid into the user-namespace specified by @targ and
285 * return the resulting gid.
286 *
287 * There is always a mapping into the initial user_namespace.
288 *
289 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
290 */
291gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
292{
293 /* Map the gid from a global kernel gid */
294 return map_id_up(&targ->gid_map, __kgid_val(kgid));
295}
296EXPORT_SYMBOL(from_kgid);
297
298/**
299 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
300 * @targ: The user namespace we want a gid in.
301 * @kgid: The kernel internal gid to start with.
302 *
303 * Map @kgid into the user-namespace specified by @targ and
304 * return the resulting gid.
305 *
306 * There is always a mapping into the initial user_namespace.
307 *
308 * Unlike from_kgid, from_kgid_munged never fails and always
309 * returns a valid gid. This makes from_kgid_munged appropriate
310 * for use in syscalls like stat and getgid where failing the
311 * system call and failing to provide a valid gid are not options.
312 *
313 * If @kgid has no mapping in @targ overflowgid is returned.
314 */
315gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
316{
317 gid_t gid;
318 gid = from_kgid(targ, kgid);
319
320 if (gid == (gid_t) -1)
321 gid = overflowgid;
322 return gid;
323}
324EXPORT_SYMBOL(from_kgid_munged);
325
326/**
327 * make_kprojid - Map a user-namespace projid pair into a kprojid.
328 * @ns: User namespace that the projid is in
329 * @projid: Project identifier
330 *
331 * Maps a user-namespace projid pair into a kernel internal kprojid,
332 * and returns that kprojid.
333 *
334 * When there is no mapping defined for the user-namespace projid
335 * pair INVALID_PROJID is returned. Callers are expected to test
336 * for and handle INVALID_PROJID being returned. INVALID_PROJID
337 * may be tested for using projid_valid().
338 */
339kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
340{
341 /* Map the projid to a global kernel projid */
342 return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
343}
344EXPORT_SYMBOL(make_kprojid);
345
346/**
347 * from_kprojid - Create a projid from a kprojid user-namespace pair.
348 * @targ: The user namespace we want a projid in.
349 * @kprojid: The kernel internal project identifier to start with.
350 *
351 * Map @kprojid into the user-namespace specified by @targ and
352 * return the resulting projid.
353 *
354 * There is always a mapping into the initial user_namespace.
355 *
356 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
357 */
358projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
359{
360 /* Map the projid from a global kernel projid */
361 return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
362}
363EXPORT_SYMBOL(from_kprojid);
364
365/**
366 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
367 * @targ: The user namespace we want a projid in.
368 * @kprojid: The kernel internal projid to start with.
369 *
370 * Map @kprojid into the user-namespace specified by @targ and
371 * return the resulting projid.
372 *
373 * There is always a mapping into the initial user_namespace.
374 *
375 * Unlike from_kprojid, from_kprojid_munged never fails and always
376 * returns a valid projid. This makes from_kprojid_munged
377 * appropriate for use in syscalls like stat, where
378 * failing the system call and failing to provide a valid projid are
379 * not options.
380 *
381 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
382 */
383projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
384{
385 projid_t projid;
386 projid = from_kprojid(targ, kprojid);
387
388 if (projid == (projid_t) -1)
389 projid = OVERFLOW_PROJID;
390 return projid;
391}
392EXPORT_SYMBOL(from_kprojid_munged);
393
394
395static int uid_m_show(struct seq_file *seq, void *v)
396{
397 struct user_namespace *ns = seq->private;
398 struct uid_gid_extent *extent = v;
399 struct user_namespace *lower_ns;
400 uid_t lower;
401
402 lower_ns = seq_user_ns(seq);
403 if ((lower_ns == ns) && lower_ns->parent)
404 lower_ns = lower_ns->parent;
405
406 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
407
408 seq_printf(seq, "%10u %10u %10u\n",
409 extent->first,
410 lower,
411 extent->count);
412
413 return 0;
414}
415
416static int gid_m_show(struct seq_file *seq, void *v)
417{
418 struct user_namespace *ns = seq->private;
419 struct uid_gid_extent *extent = v;
420 struct user_namespace *lower_ns;
421 gid_t lower;
422
423 lower_ns = seq_user_ns(seq);
424 if ((lower_ns == ns) && lower_ns->parent)
425 lower_ns = lower_ns->parent;
426
427 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
428
429 seq_printf(seq, "%10u %10u %10u\n",
430 extent->first,
431 lower,
432 extent->count);
433
434 return 0;
435}
436 46
437static int projid_m_show(struct seq_file *seq, void *v) 47 /* set the new root user in the credentials under preparation */
438{ 48 ns->creator = new->user;
439 struct user_namespace *ns = seq->private; 49 new->user = root_user;
440 struct uid_gid_extent *extent = v; 50 new->uid = new->euid = new->suid = new->fsuid = 0;
441 struct user_namespace *lower_ns; 51 new->gid = new->egid = new->sgid = new->fsgid = 0;
442 projid_t lower; 52 put_group_info(new->group_info);
443 53 new->group_info = get_group_info(&init_groups);
444 lower_ns = seq_user_ns(seq); 54#ifdef CONFIG_KEYS
445 if ((lower_ns == ns) && lower_ns->parent) 55 key_put(new->request_key_auth);
446 lower_ns = lower_ns->parent; 56 new->request_key_auth = NULL;
447 57#endif
448 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); 58 /* tgcred will be cleared in our caller because CLONE_THREAD won't be set */
449 59
450 seq_printf(seq, "%10u %10u %10u\n", 60 /* root_user holds a reference to ns, our reference can be dropped */
451 extent->first, 61 put_user_ns(ns);
452 lower,
453 extent->count);
454 62
455 return 0; 63 return 0;
456} 64}
457 65
458static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 66/*
459{ 67 * Deferred destructor for a user namespace. This is required because
460 struct uid_gid_extent *extent = NULL; 68 * free_user_ns() may be called with uidhash_lock held, but we need to call
461 loff_t pos = *ppos; 69 * back to free_uid() which will want to take the lock again.
462 70 */
463 if (pos < map->nr_extents) 71static void free_user_ns_work(struct work_struct *work)
464 extent = &map->extent[pos];
465
466 return extent;
467}
468
469static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
470{
471 struct user_namespace *ns = seq->private;
472
473 return m_start(seq, ppos, &ns->uid_map);
474}
475
476static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
477{
478 struct user_namespace *ns = seq->private;
479
480 return m_start(seq, ppos, &ns->gid_map);
481}
482
483static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
484{ 72{
485 struct user_namespace *ns = seq->private; 73 struct user_namespace *ns =
486 74 container_of(work, struct user_namespace, destroyer);
487 return m_start(seq, ppos, &ns->projid_map); 75 free_uid(ns->creator);
76 kmem_cache_free(user_ns_cachep, ns);
488} 77}
489 78
490static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 79void free_user_ns(struct kref *kref)
491{ 80{
492 (*pos)++; 81 struct user_namespace *ns =
493 return seq->op->start(seq, pos); 82 container_of(kref, struct user_namespace, kref);
494}
495 83
496static void m_stop(struct seq_file *seq, void *v) 84 INIT_WORK(&ns->destroyer, free_user_ns_work);
497{ 85 schedule_work(&ns->destroyer);
498 return;
499} 86}
87EXPORT_SYMBOL(free_user_ns);
500 88
501struct seq_operations proc_uid_seq_operations = { 89uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
502 .start = uid_m_start,
503 .stop = m_stop,
504 .next = m_next,
505 .show = uid_m_show,
506};
507
508struct seq_operations proc_gid_seq_operations = {
509 .start = gid_m_start,
510 .stop = m_stop,
511 .next = m_next,
512 .show = gid_m_show,
513};
514
515struct seq_operations proc_projid_seq_operations = {
516 .start = projid_m_start,
517 .stop = m_stop,
518 .next = m_next,
519 .show = projid_m_show,
520};
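From userspace, the seq_file output produced by uid_m_show()/gid_m_show() above appears as three right-aligned columns ("%10u %10u %10u": first, lower_first, count). A trivially runnable reader follows; the sample line in the comment is the identity mapping of the initial namespace and is only illustrative.

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/uid_map", "r");

	if (!f) {
		perror("/proc/self/uid_map");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "         0          0 4294967295" */
	fclose(f);
	return 0;
}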
521
522static DEFINE_MUTEX(id_map_mutex);
523
524static ssize_t map_write(struct file *file, const char __user *buf,
525 size_t count, loff_t *ppos,
526 int cap_setid,
527 struct uid_gid_map *map,
528 struct uid_gid_map *parent_map)
529{ 90{
530 struct seq_file *seq = file->private_data; 91 struct user_namespace *tmp;
531 struct user_namespace *ns = seq->private;
532 struct uid_gid_map new_map;
533 unsigned idx;
534 struct uid_gid_extent *extent, *last = NULL;
535 unsigned long page = 0;
536 char *kbuf, *pos, *next_line;
537 ssize_t ret = -EINVAL;
538 92
539 /* 93 if (likely(to == cred->user->user_ns))
540 * The id_map_mutex serializes all writes to any given map. 94 return uid;
541 *
542 * Any map is only ever written once.
543 *
544 * An id map fits within 1 cache line on most architectures.
545 *
546 * On read nothing needs to be done unless you are on an
547 * architecture with a crazy cache coherency model like alpha.
548 *
549 * There is a one time data dependency between reading the
550 * count of the extents and the values of the extents. The
551 * desired behavior is to see the values of the extents that
552 * were written before the count of the extents.
553 *
554 * To achieve this, smp_wmb() is used to guarantee the write
555 * order and smp_read_barrier_depends() guarantees that we
556 * don't have crazy architectures returning stale data.
557 *
558 */
559 mutex_lock(&id_map_mutex);
560 95
561 ret = -EPERM;
562 /* Only allow one successful write to the map */
563 if (map->nr_extents != 0)
564 goto out;
565 96
566 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 97 /* Is cred->user the creator of the target user_ns
567 * over the user namespace in order to set the id mapping. 98 * or the creator of one of its parents?
568 */ 99 */
569 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) 100 for ( tmp = to; tmp != &init_user_ns;
570 goto out; 101 tmp = tmp->creator->user_ns ) {
571 102 if (cred->user == tmp->creator) {
572 /* Get a buffer */ 103 return (uid_t)0;
573 ret = -ENOMEM;
574 page = __get_free_page(GFP_TEMPORARY);
575 kbuf = (char *) page;
576 if (!page)
577 goto out;
578
579 /* Only allow <= page size writes at the beginning of the file */
580 ret = -EINVAL;
581 if ((*ppos != 0) || (count >= PAGE_SIZE))
582 goto out;
583
584 /* Slurp in the user data */
585 ret = -EFAULT;
586 if (copy_from_user(kbuf, buf, count))
587 goto out;
588 kbuf[count] = '\0';
589
590 /* Parse the user data */
591 ret = -EINVAL;
592 pos = kbuf;
593 new_map.nr_extents = 0;
594 for (;pos; pos = next_line) {
595 extent = &new_map.extent[new_map.nr_extents];
596
597 /* Find the end of line and ensure I don't look past it */
598 next_line = strchr(pos, '\n');
599 if (next_line) {
600 *next_line = '\0';
601 next_line++;
602 if (*next_line == '\0')
603 next_line = NULL;
604 } 104 }
605
606 pos = skip_spaces(pos);
607 extent->first = simple_strtoul(pos, &pos, 10);
608 if (!isspace(*pos))
609 goto out;
610
611 pos = skip_spaces(pos);
612 extent->lower_first = simple_strtoul(pos, &pos, 10);
613 if (!isspace(*pos))
614 goto out;
615
616 pos = skip_spaces(pos);
617 extent->count = simple_strtoul(pos, &pos, 10);
618 if (*pos && !isspace(*pos))
619 goto out;
620
621 /* Verify there is no trailing junk on the line */
622 pos = skip_spaces(pos);
623 if (*pos != '\0')
624 goto out;
625
626 /* Verify we have been given valid starting values */
627 if ((extent->first == (u32) -1) ||
628 (extent->lower_first == (u32) -1 ))
629 goto out;
630
631 /* Verify count is not zero and does not cause the extent to wrap */
632 if ((extent->first + extent->count) <= extent->first)
633 goto out;
634 if ((extent->lower_first + extent->count) <= extent->lower_first)
635 goto out;
636
637 /* For now only accept extents that are strictly in order */
638 if (last &&
639 (((last->first + last->count) > extent->first) ||
640 ((last->lower_first + last->count) > extent->lower_first)))
641 goto out;
642
643 new_map.nr_extents++;
644 last = extent;
645
646 /* Fail if the file contains too many extents */
647 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
648 (next_line != NULL))
649 goto out;
650 } 105 }
651 /* Be very certain the new map actually exists */
652 if (new_map.nr_extents == 0)
653 goto out;
654
655 ret = -EPERM;
656 /* Validate that the user is allowed to use the user ids being mapped to. */
657 if (!new_idmap_permitted(ns, cap_setid, &new_map))
658 goto out;
659
660 /* Map the lower ids from the parent user namespace to the
661 * kernel global id space.
662 */
663 for (idx = 0; idx < new_map.nr_extents; idx++) {
664 u32 lower_first;
665 extent = &new_map.extent[idx];
666 106
667 lower_first = map_id_range_down(parent_map, 107 /* No useful relationship so no mapping */
668 extent->lower_first, 108 return overflowuid;
669 extent->count);
670
671 /* Fail if we can not map the specified extent to
672 * the kernel global id space.
673 */
674 if (lower_first == (u32) -1)
675 goto out;
676
677 extent->lower_first = lower_first;
678 }
679
680 /* Install the map */
681 memcpy(map->extent, new_map.extent,
682 new_map.nr_extents*sizeof(new_map.extent[0]));
683 smp_wmb();
684 map->nr_extents = new_map.nr_extents;
685
686 *ppos = count;
687 ret = count;
688out:
689 mutex_unlock(&id_map_mutex);
690 if (page)
691 free_page(page);
692 return ret;
693} 109}
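The ordering contract spelled out at the top of map_write() -- store the extents first, publish the extent count last, and on the read side order the count load before the extent loads -- can be expressed in portable userspace C11 as a release store paired with an acquire load. This is only an analogue for illustration: the kernel code uses smp_wmb() and smp_read_barrier_depends(), not <stdatomic.h>, and the 5-extent limit below merely mirrors UID_GID_MAP_MAX_EXTENTS.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_EXTENTS 5

struct demo_extent { uint32_t first, lower_first, count; };

static struct demo_extent extents[DEMO_MAX_EXTENTS];
static atomic_uint nr_extents;		/* zero until the map is published */

/* Writer: fill in the extents, then publish the count (release). */
static void demo_publish(const struct demo_extent *src, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++)
		extents[i] = src[i];
	atomic_store_explicit(&nr_extents, n, memory_order_release);
}

/* Reader: load the count (acquire); only then read the extents. */
static uint32_t demo_lookup(uint32_t id)
{
	unsigned int n = atomic_load_explicit(&nr_extents, memory_order_acquire);

	for (unsigned int i = 0; i < n; i++) {
		uint32_t last = extents[i].first + extents[i].count - 1;

		if (id >= extents[i].first && id <= last)
			return (id - extents[i].first) + extents[i].lower_first;
	}
	return (uint32_t) -1;
}

int main(void)
{
	struct demo_extent e = { .first = 0, .lower_first = 100000, .count = 65536 };

	demo_publish(&e, 1);
	printf("%u\n", demo_lookup(0));		/* 100000 */
	return 0;
}

Release/acquire is a slightly stronger pairing than wmb/read_barrier_depends, but it conveys the same "extent values become visible no later than the count" publication order the comment describes.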
694 110
695ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 111gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
696{ 112{
697 struct seq_file *seq = file->private_data; 113 struct user_namespace *tmp;
698 struct user_namespace *ns = seq->private;
699 struct user_namespace *seq_ns = seq_user_ns(seq);
700
701 if (!ns->parent)
702 return -EPERM;
703 114
704 if ((seq_ns != ns) && (seq_ns != ns->parent)) 115 if (likely(to == cred->user->user_ns))
705 return -EPERM; 116 return gid;
706 117
707 return map_write(file, buf, size, ppos, CAP_SETUID, 118 /* Is cred->user the creator of the target user_ns
708 &ns->uid_map, &ns->parent->uid_map); 119 * or the creator of one of its parents?
709} 120 */
710 121 for ( tmp = to; tmp != &init_user_ns;
711ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 122 tmp = tmp->creator->user_ns ) {
712{ 123 if (cred->user == tmp->creator) {
713 struct seq_file *seq = file->private_data; 124 return (gid_t)0;
714 struct user_namespace *ns = seq->private;
715 struct user_namespace *seq_ns = seq_user_ns(seq);
716
717 if (!ns->parent)
718 return -EPERM;
719
720 if ((seq_ns != ns) && (seq_ns != ns->parent))
721 return -EPERM;
722
723 return map_write(file, buf, size, ppos, CAP_SETGID,
724 &ns->gid_map, &ns->parent->gid_map);
725}
726
727ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
728{
729 struct seq_file *seq = file->private_data;
730 struct user_namespace *ns = seq->private;
731 struct user_namespace *seq_ns = seq_user_ns(seq);
732
733 if (!ns->parent)
734 return -EPERM;
735
736 if ((seq_ns != ns) && (seq_ns != ns->parent))
737 return -EPERM;
738
739 /* Anyone can set any valid project id; no capability needed */
740 return map_write(file, buf, size, ppos, -1,
741 &ns->projid_map, &ns->parent->projid_map);
742}
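What a write to one of these files looks like from userspace: a single "<first> <lower_first> <count>" line, written once, before the map is set. The pid comes from argv and the ids below are placeholders; per new_idmap_permitted() below, an unprivileged writer may only map its own fsuid with a count of 1, and anything else requires CAP_SETUID (or CAP_SETGID) over the parent namespace.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64], buf[64];
	int fd, len;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/uid_map", argv[1]);
	/* map uid 0 inside the namespace to the caller's own uid outside */
	len = snprintf(buf, sizeof(buf), "0 %u 1\n", (unsigned)getuid());

	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, buf, len) != len) {
		perror(path);
		return 1;
	}
	close(fd);
	return 0;
}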
743
744static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
745 struct uid_gid_map *new_map)
746{
747 /* Allow mapping to your own filesystem ids */
748 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
749 u32 id = new_map->extent[0].lower_first;
750 if (cap_setid == CAP_SETUID) {
751 kuid_t uid = make_kuid(ns->parent, id);
752 if (uid_eq(uid, current_fsuid()))
753 return true;
754 }
755 else if (cap_setid == CAP_SETGID) {
756 kgid_t gid = make_kgid(ns->parent, id);
757 if (gid_eq(gid, current_fsgid()))
758 return true;
759 } 125 }
760 } 126 }
761 127
762 /* Allow anyone to set a mapping that doesn't require privilege */ 128 /* No useful relationship so no mapping */
763 if (!cap_valid(cap_setid)) 129 return overflowgid;
764 return true;
765
766 /* Allow the specified ids if we have the appropriate capability
767 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
768 */
769 if (ns_capable(ns->parent, cap_setid))
770 return true;
771
772 return false;
773} 130}
774 131
775static void *userns_get(struct task_struct *task)
776{
777 struct user_namespace *user_ns;
778
779 rcu_read_lock();
780 user_ns = get_user_ns(__task_cred(task)->user_ns);
781 rcu_read_unlock();
782
783 return user_ns;
784}
785
786static void userns_put(void *ns)
787{
788 put_user_ns(ns);
789}
790
791static int userns_install(struct nsproxy *nsproxy, void *ns)
792{
793 struct user_namespace *user_ns = ns;
794 struct cred *cred;
795
796 /* Don't allow gaining capabilities by reentering
797 * the same user namespace.
798 */
799 if (user_ns == current_user_ns())
800 return -EINVAL;
801
802 /* Threaded processes may not enter a different user namespace */
803 if (atomic_read(&current->mm->mm_users) > 1)
804 return -EINVAL;
805
806 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
807 return -EPERM;
808
809 cred = prepare_creds();
810 if (!cred)
811 return -ENOMEM;
812
813 put_user_ns(cred->user_ns);
814 set_cred_user_ns(cred, get_user_ns(user_ns));
815
816 return commit_creds(cred);
817}
818
819static unsigned int userns_inum(void *ns)
820{
821 struct user_namespace *user_ns = ns;
822 return user_ns->proc_inum;
823}
824
825const struct proc_ns_operations userns_operations = {
826 .name = "user",
827 .type = CLONE_NEWUSER,
828 .get = userns_get,
829 .put = userns_put,
830 .install = userns_install,
831 .inum = userns_inum,
832};
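These proc_ns_operations back setns(2) on a file descriptor opened from /proc/<pid>/ns/user; userns_install() above runs on that path and enforces the restrictions (no re-entering the current namespace, single-threaded caller, CAP_SYS_ADMIN in the target namespace). A minimal userspace sketch of joining another process's user namespace; the pid is a placeholder and the call fails unless those checks pass.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/user", argv[1]);

	fd = open(path, O_RDONLY);
	if (fd < 0 || setns(fd, CLONE_NEWUSER) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	/* from here on we hold the credentials granted by the target namespace */
	return 0;
}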
833
834static __init int user_namespaces_init(void) 132static __init int user_namespaces_init(void)
835{ 133{
836 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 134 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 08b197e8c48..bff131b9510 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
@@ -32,25 +32,18 @@ static struct uts_namespace *create_uts_ns(void)
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return NULL on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
40 39
41 ns = create_uts_ns(); 40 ns = create_uts_ns();
42 if (!ns) 41 if (!ns)
43 return ERR_PTR(-ENOMEM); 42 return ERR_PTR(-ENOMEM);
44 43
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
51 down_read(&uts_sem); 44 down_read(&uts_sem);
52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
53 ns->user_ns = get_user_ns(user_ns); 46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
54 up_read(&uts_sem); 47 up_read(&uts_sem);
55 return ns; 48 return ns;
56} 49}
@@ -62,8 +55,9 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
62 * versa. 55 * versa.
63 */ 56 */
64struct uts_namespace *copy_utsname(unsigned long flags, 57struct uts_namespace *copy_utsname(unsigned long flags,
65 struct user_namespace *user_ns, struct uts_namespace *old_ns) 58 struct task_struct *tsk)
66{ 59{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
67 struct uts_namespace *new_ns; 61 struct uts_namespace *new_ns;
68 62
69 BUG_ON(!old_ns); 63 BUG_ON(!old_ns);
@@ -72,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
72 if (!(flags & CLONE_NEWUTS)) 66 if (!(flags & CLONE_NEWUTS))
73 return old_ns; 67 return old_ns;
74 68
75 new_ns = clone_uts_ns(user_ns, old_ns); 69 new_ns = clone_uts_ns(tsk, old_ns);
76 70
77 put_uts_ns(old_ns); 71 put_uts_ns(old_ns);
78 return new_ns; 72 return new_ns;
@@ -84,7 +78,6 @@ void free_uts_ns(struct kref *kref)
84 78
85 ns = container_of(kref, struct uts_namespace, kref); 79 ns = container_of(kref, struct uts_namespace, kref);
86 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
88 kfree(ns); 81 kfree(ns);
89} 82}
90 83
@@ -109,32 +102,19 @@ static void utsns_put(void *ns)
109 put_uts_ns(ns); 102 put_uts_ns(ns);
110} 103}
111 104
112static int utsns_install(struct nsproxy *nsproxy, void *new) 105static int utsns_install(struct nsproxy *nsproxy, void *ns)
113{ 106{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
120 get_uts_ns(ns); 107 get_uts_ns(ns);
121 put_uts_ns(nsproxy->uts_ns); 108 put_uts_ns(nsproxy->uts_ns);
122 nsproxy->uts_ns = ns; 109 nsproxy->uts_ns = ns;
123 return 0; 110 return 0;
124} 111}
125 112
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
133const struct proc_ns_operations utsns_operations = { 113const struct proc_ns_operations utsns_operations = {
134 .name = "uts", 114 .name = "uts",
135 .type = CLONE_NEWUTS, 115 .type = CLONE_NEWUTS,
136 .get = utsns_get, 116 .get = utsns_get,
137 .put = utsns_put, 117 .put = utsns_put,
138 .install = utsns_install, 118 .install = utsns_install,
139 .inum = utsns_inum,
140}; 119};
120
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d82..a2cd77e70d4 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,11 +9,10 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/export.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h>
17 16
18static void *get_uts(ctl_table *table, int write) 17static void *get_uts(ctl_table *table, int write)
19{ 18{
@@ -52,19 +51,12 @@ static int proc_do_uts_string(ctl_table *table, int write,
52 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
53 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
54 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
55
56 if (write)
57 proc_sys_poll_notify(table->poll);
58
59 return r; 54 return r;
60} 55}
61#else 56#else
62#define proc_do_uts_string NULL 57#define proc_do_uts_string NULL
63#endif 58#endif
64 59
65static DEFINE_CTL_TABLE_POLL(hostname_poll);
66static DEFINE_CTL_TABLE_POLL(domainname_poll);
67
68static struct ctl_table uts_kern_table[] = { 60static struct ctl_table uts_kern_table[] = {
69 { 61 {
70 .procname = "ostype", 62 .procname = "ostype",
@@ -93,7 +85,6 @@ static struct ctl_table uts_kern_table[] = {
93 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
94 .mode = 0644, 86 .mode = 0644,
95 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
96 .poll = &hostname_poll,
97 }, 88 },
98 { 89 {
99 .procname = "domainname", 90 .procname = "domainname",
@@ -101,7 +92,6 @@ static struct ctl_table uts_kern_table[] = {
101 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
102 .mode = 0644, 93 .mode = 0644,
103 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
104 .poll = &domainname_poll,
105 }, 95 },
106 {} 96 {}
107}; 97};
@@ -115,19 +105,6 @@ static struct ctl_table uts_root_table[] = {
115 {} 105 {}
116}; 106};
117 107
118#ifdef CONFIG_PROC_SYSCTL
119/*
120 * Notify userspace about a change in a certain entry of uts_kern_table,
121 * identified by the parameter proc.
122 */
123void uts_proc_notify(enum uts_proc proc)
124{
125 struct ctl_table *table = &uts_kern_table[proc];
126
127 proc_sys_poll_notify(table->poll);
128}
129#endif
130
131static int __init utsname_sysctl_init(void) 108static int __init utsname_sysctl_init(void)
132{ 109{
133 register_sysctl_table(uts_root_table); 110 register_sysctl_table(uts_root_table);
diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c04ea..f45ea8d2a1c 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,19 +1,19 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class_and_name(&q->lock, key, name); 16 lockdep_set_class(&q->lock, key);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b0..36491cd5b7d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,14 +3,15 @@
3 * 3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. 4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 * 5 *
6 * Note: Most of this code is borrowed heavily from the original softlockup 6 * this code detects hard lockups: incidents where on a CPU
7 * detector, so thanks to Ingo for the initial implementation. 7 * the kernel does not respond to anything except NMI.
8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks 8 *
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
9 * to those contributors as well. 12 * to those contributors as well.
10 */ 13 */
11 14
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/cpu.h> 16#include <linux/cpu.h>
16#include <linux/nmi.h> 17#include <linux/nmi.h>
@@ -22,27 +23,22 @@
22#include <linux/notifier.h> 23#include <linux/notifier.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/sysctl.h> 25#include <linux/sysctl.h>
25#include <linux/smpboot.h>
26 26
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/kvm_para.h>
29#include <linux/perf_event.h> 28#include <linux/perf_event.h>
30 29
31int watchdog_enabled = 1; 30int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 31int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled;
34static u64 __read_mostly sample_period;
35 32
36static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
37static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
38static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 35static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
39static DEFINE_PER_CPU(bool, softlockup_touch_sync); 36static DEFINE_PER_CPU(bool, softlockup_touch_sync);
40static DEFINE_PER_CPU(bool, soft_watchdog_warn); 37static DEFINE_PER_CPU(bool, soft_watchdog_warn);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
42static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
43#ifdef CONFIG_HARDLOCKUP_DETECTOR 38#ifdef CONFIG_HARDLOCKUP_DETECTOR
44static DEFINE_PER_CPU(bool, hard_watchdog_warn); 39static DEFINE_PER_CPU(bool, hard_watchdog_warn);
45static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 40static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
41static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
46static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 42static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
47static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48#endif 44#endif
@@ -117,16 +113,15 @@ static unsigned long get_timestamp(int this_cpu)
117 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 113 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
118} 114}
119 115
120static void set_sample_period(void) 116static unsigned long get_sample_period(void)
121{ 117{
122 /* 118 /*
123 * convert watchdog_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
124 * the divide by 5 is to give hrtimer several chances (two 120 * the divide by 5 is to give hrtimer 5 chances to
125 * or three with the current relation between the soft 121 * increment before the hardlockup detector generates
126 * and hard thresholds) to increment before the 122 * a warning
127 * hardlockup detector generates a warning
128 */ 123 */
129 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
130} 125}
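Worked numbers for the period above, assuming the default watchdog_thresh of 10 seconds and get_softlockup_thresh() returning 2 * watchdog_thresh (as it does in kernels of this vintage): 20 * (10^9 / 5) ns = 4 * 10^9 ns, i.e. the 4-second sample period mentioned in the watchdog-thread comment later in this hunk. A quick userspace check of the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long long nsec_per_sec = 1000000000ULL;
	int watchdog_thresh = 10;	/* default threshold, seconds */

	/* assumes get_softlockup_thresh() == 2 * watchdog_thresh */
	unsigned long long sample_ns = 2ULL * watchdog_thresh * (nsec_per_sec / 5);

	printf("sample period: %llu ns (%.1f s)\n", sample_ns, sample_ns / 1e9);
	return 0;
}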
131 126
132/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
@@ -252,15 +247,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
252 __this_cpu_write(hard_watchdog_warn, false); 247 __this_cpu_write(hard_watchdog_warn, false);
253 return; 248 return;
254} 249}
255#endif /* CONFIG_HARDLOCKUP_DETECTOR */
256
257static void watchdog_interrupt_count(void) 250static void watchdog_interrupt_count(void)
258{ 251{
259 __this_cpu_inc(hrtimer_interrupts); 252 __this_cpu_inc(hrtimer_interrupts);
260} 253}
261 254#else
262static int watchdog_nmi_enable(unsigned int cpu); 255static inline void watchdog_interrupt_count(void) { return; }
263static void watchdog_nmi_disable(unsigned int cpu); 256#endif /* CONFIG_HARDLOCKUP_DETECTOR */
264 257
265/* watchdog kicker functions */ 258/* watchdog kicker functions */
266static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 259static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -276,7 +269,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
276 wake_up_process(__this_cpu_read(softlockup_watchdog)); 269 wake_up_process(__this_cpu_read(softlockup_watchdog));
277 270
278 /* .. and repeat */ 271 /* .. and repeat */
279 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); 272 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
280 273
281 if (touch_ts == 0) { 274 if (touch_ts == 0) {
282 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 275 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -287,9 +280,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
287 __this_cpu_write(softlockup_touch_sync, false); 280 __this_cpu_write(softlockup_touch_sync, false);
288 sched_clock_tick(); 281 sched_clock_tick();
289 } 282 }
290
291 /* Clear the guest paused flag on watchdog reset */
292 kvm_check_and_clear_guest_paused();
293 __touch_watchdog(); 283 __touch_watchdog();
294 return HRTIMER_RESTART; 284 return HRTIMER_RESTART;
295 } 285 }
@@ -302,19 +292,11 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
302 */ 292 */
303 duration = is_softlockup(touch_ts); 293 duration = is_softlockup(touch_ts);
304 if (unlikely(duration)) { 294 if (unlikely(duration)) {
305 /*
306 * If a virtual machine is stopped by the host it can look to
307 * the watchdog like a soft lockup, check to see if the host
308 * stopped the vm before we issue the warning
309 */
310 if (kvm_check_and_clear_guest_paused())
311 return HRTIMER_RESTART;
312
313 /* only warn once */ 295 /* only warn once */
314 if (__this_cpu_read(soft_watchdog_warn) == true) 296 if (__this_cpu_read(soft_watchdog_warn) == true)
315 return HRTIMER_RESTART; 297 return HRTIMER_RESTART;
316 298
317 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 299 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
318 smp_processor_id(), duration, 300 smp_processor_id(), duration,
319 current->comm, task_pid_nr(current)); 301 current->comm, task_pid_nr(current));
320 print_modules(); 302 print_modules();
@@ -333,78 +315,48 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
333 return HRTIMER_RESTART; 315 return HRTIMER_RESTART;
334} 316}
335 317
336static void watchdog_set_prio(unsigned int policy, unsigned int prio)
337{
338 struct sched_param param = { .sched_priority = prio };
339
340 sched_setscheduler(current, policy, &param);
341}
342 318
343static void watchdog_enable(unsigned int cpu) 319/*
320 * The watchdog thread - touches the timestamp.
321 */
322static int watchdog(void *unused)
344{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
345 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
346 326
347 /* kick off the timer for the hardlockup detector */ 327 sched_setscheduler(current, SCHED_FIFO, &param);
348 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
349 hrtimer->function = watchdog_timer_fn;
350
351 if (!watchdog_enabled) {
352 kthread_park(current);
353 return;
354 }
355 328
356 /* Enable the perf event */ 329 /* initialize timestamp */
357 watchdog_nmi_enable(cpu); 330 __touch_watchdog();
358 331
332 /* kick off the timer for the hardlockup detector */
359 /* done here because hrtimer_start can only pin to smp_processor_id() */ 333 /* done here because hrtimer_start can only pin to smp_processor_id() */
360 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 334 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
361 HRTIMER_MODE_REL_PINNED); 335 HRTIMER_MODE_REL_PINNED);
362 336
363 /* initialize timestamp */ 337 set_current_state(TASK_INTERRUPTIBLE);
364 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); 338 /*
365 __touch_watchdog(); 339 * Run briefly once per second to reset the softlockup timestamp.
366} 340 * If this gets delayed for more than 60 seconds then the
341 * debug-printout triggers in watchdog_timer_fn().
342 */
343 while (!kthread_should_stop()) {
344 __touch_watchdog();
345 schedule();
367 346
368static void watchdog_disable(unsigned int cpu) 347 if (kthread_should_stop())
369{ 348 break;
370 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
371 349
372 watchdog_set_prio(SCHED_NORMAL, 0); 350 set_current_state(TASK_INTERRUPTIBLE);
373 hrtimer_cancel(hrtimer); 351 }
374 /* disable the perf event */ 352 __set_current_state(TASK_RUNNING);
375 watchdog_nmi_disable(cpu);
376}
377 353
378static int watchdog_should_run(unsigned int cpu) 354 return 0;
379{
380 return __this_cpu_read(hrtimer_interrupts) !=
381 __this_cpu_read(soft_lockup_hrtimer_cnt);
382} 355}
383 356
384/*
385 * The watchdog thread function - touches the timestamp.
386 *
387 * It only runs once every sample_period seconds (4 seconds by
388 * default) to reset the softlockup timestamp. If this gets delayed
389 * for more than 2*watchdog_thresh seconds then the debug-printout
390 * triggers in watchdog_timer_fn().
391 */
392static void watchdog(unsigned int cpu)
393{
394 __this_cpu_write(soft_lockup_hrtimer_cnt,
395 __this_cpu_read(hrtimer_interrupts));
396 __touch_watchdog();
397}
398 357
399#ifdef CONFIG_HARDLOCKUP_DETECTOR 358#ifdef CONFIG_HARDLOCKUP_DETECTOR
400/* 359static int watchdog_nmi_enable(int cpu)
401 * People like the simple clean cpu node info on boot.
402 * Reduce the watchdog noise by only printing messages
403 * that are different from what cpu0 displayed.
404 */
405static unsigned long cpu0_err;
406
407static int watchdog_nmi_enable(unsigned int cpu)
408{ 360{
409 struct perf_event_attr *wd_attr; 361 struct perf_event_attr *wd_attr;
410 struct perf_event *event = per_cpu(watchdog_ev, cpu); 362 struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -422,31 +374,19 @@ static int watchdog_nmi_enable(unsigned int cpu)
422 374
423 /* Try to register using hardware perf events */ 375 /* Try to register using hardware perf events */
424 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
425
426 /* save cpu0 error for future comparison */
427 if (cpu == 0 && IS_ERR(event))
428 cpu0_err = PTR_ERR(event);
429
430 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
431 /* only print for cpu0 or different than cpu0 */ 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
432 if (cpu == 0 || cpu0_err)
433 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
434 goto out_save; 379 goto out_save;
435 } 380 }
436 381
437 /* skip displaying the same error again */
438 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
439 return PTR_ERR(event);
440 382
441 /* vary the KERN level based on the returned errno */ 383 /* vary the KERN level based on the returned errno */
442 if (PTR_ERR(event) == -EOPNOTSUPP) 384 if (PTR_ERR(event) == -EOPNOTSUPP)
443 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 385 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
444 else if (PTR_ERR(event) == -ENOENT) 386 else if (PTR_ERR(event) == -ENOENT)
445 pr_warning("disabled (cpu%i): hardware events not enabled\n", 387 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
446 cpu);
447 else 388 else
448 pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 389 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
449 cpu, PTR_ERR(event));
450 return PTR_ERR(event); 390 return PTR_ERR(event);
451 391
452 /* success path */ 392 /* success path */
@@ -458,7 +398,7 @@ out:
458 return 0; 398 return 0;
459} 399}
460 400
461static void watchdog_nmi_disable(unsigned int cpu) 401static void watchdog_nmi_disable(int cpu)
462{ 402{
463 struct perf_event *event = per_cpu(watchdog_ev, cpu); 403 struct perf_event *event = per_cpu(watchdog_ev, cpu);
464 404
@@ -472,35 +412,105 @@ static void watchdog_nmi_disable(unsigned int cpu)
472 return; 412 return;
473} 413}
474#else 414#else
475static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 415static int watchdog_nmi_enable(int cpu) { return 0; }
476static void watchdog_nmi_disable(unsigned int cpu) { return; } 416static void watchdog_nmi_disable(int cpu) { return; }
477#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 417#endif /* CONFIG_HARDLOCKUP_DETECTOR */
478 418
479/* prepare/enable/disable routines */ 419/* prepare/enable/disable routines */
480/* sysctl functions */ 420static void watchdog_prepare_cpu(int cpu)
481#ifdef CONFIG_SYSCTL
482static void watchdog_enable_all_cpus(void)
483{ 421{
484 unsigned int cpu; 422 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
485 423
486 if (watchdog_disabled) { 424 WARN_ON(per_cpu(softlockup_watchdog, cpu));
487 watchdog_disabled = 0; 425 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
488 for_each_online_cpu(cpu) 426 hrtimer->function = watchdog_timer_fn;
489 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 427}
428
429static int watchdog_enable(int cpu)
430{
431 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
432 int err = 0;
433
434 /* enable the perf event */
435 err = watchdog_nmi_enable(cpu);
436
437 /* Regardless of err above, fall through and start softlockup */
438
439 /* create the watchdog thread */
440 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
442 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) {
445 /* if hardlockup hasn't already set this */
446 err = PTR_ERR(p);
447 /* and disable the perf event */
448 watchdog_nmi_disable(cpu);
449 }
450 goto out;
451 }
452 kthread_bind(p, cpu);
453 per_cpu(watchdog_touch_ts, cpu) = 0;
454 per_cpu(softlockup_watchdog, cpu) = p;
455 wake_up_process(p);
490 } 456 }
457
458out:
459 return err;
491} 460}
492 461
493static void watchdog_disable_all_cpus(void) 462static void watchdog_disable(int cpu)
494{ 463{
495 unsigned int cpu; 464 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
465 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
466
467 /*
468 * cancel the timer first to stop incrementing the stats
469 * and waking up the kthread
470 */
471 hrtimer_cancel(hrtimer);
472
473 /* disable the perf event */
474 watchdog_nmi_disable(cpu);
496 475
497 if (!watchdog_disabled) { 476 /* stop the watchdog thread */
498 watchdog_disabled = 1; 477 if (p) {
499 for_each_online_cpu(cpu) 478 per_cpu(softlockup_watchdog, cpu) = NULL;
500 kthread_park(per_cpu(softlockup_watchdog, cpu)); 479 kthread_stop(p);
501 } 480 }
502} 481}
503 482
483static void watchdog_enable_all_cpus(void)
484{
485 int cpu;
486
487 watchdog_enabled = 0;
488
489 for_each_online_cpu(cpu)
490 if (!watchdog_enable(cpu))
491 /* if any cpu succeeds, watchdog is considered
492 enabled for the system */
493 watchdog_enabled = 1;
494
495 if (!watchdog_enabled)
496 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
497
498}
499
500static void watchdog_disable_all_cpus(void)
501{
502 int cpu;
503
504 for_each_online_cpu(cpu)
505 watchdog_disable(cpu);
506
507 /* if all watchdogs are disabled, then they are disabled for the system */
508 watchdog_enabled = 0;
509}
510
511
512/* sysctl functions */
513#ifdef CONFIG_SYSCTL
504/* 514/*
505 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
506 */ 516 */
@@ -510,38 +520,73 @@ int proc_dowatchdog(struct ctl_table *table, int write,
510{ 520{
511 int ret; 521 int ret;
512 522
513 if (watchdog_disabled < 0)
514 return -ENODEV;
515
516 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 523 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
517 if (ret || !write) 524 if (ret || !write)
518 return ret; 525 goto out;
519 526
520 set_sample_period();
521 if (watchdog_enabled && watchdog_thresh) 527 if (watchdog_enabled && watchdog_thresh)
522 watchdog_enable_all_cpus(); 528 watchdog_enable_all_cpus();
523 else 529 else
524 watchdog_disable_all_cpus(); 530 watchdog_disable_all_cpus();
525 531
532out:
526 return ret; 533 return ret;
527} 534}
528#endif /* CONFIG_SYSCTL */ 535#endif /* CONFIG_SYSCTL */
529 536
530static struct smp_hotplug_thread watchdog_threads = { 537
531 .store = &softlockup_watchdog, 538/*
532 .thread_should_run = watchdog_should_run, 539 * Create/destroy watchdog threads as CPUs come and go:
533 .thread_fn = watchdog, 540 */
534 .thread_comm = "watchdog/%u", 541static int __cpuinit
535 .setup = watchdog_enable, 542cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
536 .park = watchdog_disable, 543{
537 .unpark = watchdog_enable, 544 int hotcpu = (unsigned long)hcpu;
545
546 switch (action) {
547 case CPU_UP_PREPARE:
548 case CPU_UP_PREPARE_FROZEN:
549 watchdog_prepare_cpu(hotcpu);
550 break;
551 case CPU_ONLINE:
552 case CPU_ONLINE_FROZEN:
553 if (watchdog_enabled)
554 watchdog_enable(hotcpu);
555 break;
556#ifdef CONFIG_HOTPLUG_CPU
557 case CPU_UP_CANCELED:
558 case CPU_UP_CANCELED_FROZEN:
559 watchdog_disable(hotcpu);
560 break;
561 case CPU_DEAD:
562 case CPU_DEAD_FROZEN:
563 watchdog_disable(hotcpu);
564 break;
565#endif /* CONFIG_HOTPLUG_CPU */
566 }
567
568 /*
569 * hardlockup and softlockup are not important enough
570 * to block cpu bring up. Just always succeed and
571 * rely on printk output to flag problems.
572 */
573 return NOTIFY_OK;
574}
575
576static struct notifier_block __cpuinitdata cpu_nfb = {
577 .notifier_call = cpu_callback
538}; 578};
539 579
540void __init lockup_detector_init(void) 580void __init lockup_detector_init(void)
541{ 581{
542 set_sample_period(); 582 void *cpu = (void *)(long)smp_processor_id();
543 if (smpboot_register_percpu_thread(&watchdog_threads)) { 583 int err;
544 pr_err("Failed to create watchdog threads, disabled\n"); 584
545 watchdog_disabled = -ENODEV; 585 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
546 } 586 WARN_ON(notifier_to_errno(err));
587
588 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
589 register_cpu_notifier(&cpu_nfb);
590
591 return;
547} 592}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c..1783aabc612 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
23 * Please read Documentation/workqueue.txt for details. 23 * Please read Documentation/workqueue.txt for details.
24 */ 24 */
25 25
26#include <linux/export.h> 26#include <linux/module.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/init.h> 29#include <linux/init.h>
@@ -45,41 +45,32 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* 48 /* global_cwq flags */
49 * global_cwq flags 49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
50 * 50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
51 * A bound gcwq is either associated or disassociated with its CPU. 51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
52 * While associated (!DISASSOCIATED), all workers are bound to the 52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
70 54
71 /* worker flags */ 55 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 56 WORKER_STARTED = 1 << 0, /* started */
73 WORKER_DIE = 1 << 1, /* die die die */ 57 WORKER_DIE = 1 << 1, /* die die die */
74 WORKER_IDLE = 1 << 2, /* is idle */ 58 WORKER_IDLE = 1 << 2, /* is idle */
75 WORKER_PREP = 1 << 3, /* preparing to run works */ 59 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */
76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
78 64
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
80 WORKER_CPU_INTENSIVE, 66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
81 67
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 68 /* gcwq->trustee_state */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
83 74
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -93,13 +84,13 @@ enum {
93 (min two ticks) */ 84 (min two ticks) */
94 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
95 CREATE_COOLDOWN = HZ, /* time to breathe after fail */ 86 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
96 88
97 /* 89 /*
99 * Rescue workers are used only in emergencies and shared by 91 * Rescue workers are used only in emergencies and shared by
99 * all cpus. Give -20. 91 * all cpus. Give -20.
100 */ 92 */
101 RESCUER_NICE_LEVEL = -20, 93 RESCUER_NICE_LEVEL = -20,
102 HIGHPRI_NICE_LEVEL = -20,
103}; 94};
104 95
105/* 96/*
@@ -124,7 +115,6 @@ enum {
124 */ 115 */
125 116
126struct global_cwq; 117struct global_cwq;
127struct worker_pool;
128 118
129/* 119/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers 120 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -141,32 +131,12 @@ struct worker {
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */ 132 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */ 133 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */ 134 struct global_cwq *gcwq; /* I: the associated gcwq */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */ 135 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */ 136 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */ 137 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */ 138 int id; /* I: worker id */
149 139 struct work_struct rebind_work; /* L: rebind worker to cpu */
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153
154struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */
156 unsigned int flags; /* X: flags */
157
158 struct list_head worklist; /* L: list of pending works */
159 int nr_workers; /* L: total number of workers */
160
161 /* nr_idle includes the ones off idle_list for rebinding */
162 int nr_idle; /* L: currently idle ones */
163
164 struct list_head idle_list; /* X: list of idle workers */
165 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */
167
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */
169 struct ida worker_ida; /* L: for worker IDs */
170}; 140};
171 141
172/* 142/*
@@ -176,15 +146,27 @@ struct worker_pool {
176 */ 146 */
177struct global_cwq { 147struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */ 148 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
179 unsigned int cpu; /* I: the associated cpu */ 150 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */ 151 unsigned int flags; /* L: GCWQ_* flags */
181 152
182 /* workers are chained either in busy_hash or pool idle_list */ 153 int nr_workers; /* L: total number of workers */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 159 /* L: hash of busy workers */
185 160
186 struct worker_pool pools[NR_WORKER_POOLS]; 161 struct timer_list idle_timer; /* L: worker idle timeout */
187 /* normal and highpri pools */ 162 struct timer_list mayday_timer; /* L: SOS timer for workers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165
166 struct task_struct *trustee; /* L: for gcwq shutdown */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
188} ____cacheline_aligned_in_smp; 170} ____cacheline_aligned_in_smp;
189 171
190/* 172/*
@@ -193,7 +175,7 @@ struct global_cwq {
193 * aligned at two's power of the number of flag bits. 175 * aligned at two's power of the number of flag bits.
194 */ 176 */
195struct cpu_workqueue_struct { 177struct cpu_workqueue_struct {
196 struct worker_pool *pool; /* I: the associated pool */ 178 struct global_cwq *gcwq; /* I: the associated gcwq */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 179 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 180 int work_color; /* L: current color */
199 int flush_color; /* L: flushing color */ 181 int flush_color; /* L: flushing color */
@@ -260,30 +242,26 @@ struct workqueue_struct {
260 242
261 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
263#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 247 struct lockdep_map lockdep_map;
265#endif 248#endif
266 char name[]; /* I: workqueue name */
267}; 249};
268 250
269struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
270EXPORT_SYMBOL_GPL(system_wq);
271struct workqueue_struct *system_highpri_wq __read_mostly;
272EXPORT_SYMBOL_GPL(system_highpri_wq);
273struct workqueue_struct *system_long_wq __read_mostly; 252struct workqueue_struct *system_long_wq __read_mostly;
274EXPORT_SYMBOL_GPL(system_long_wq); 253struct workqueue_struct *system_nrt_wq __read_mostly;
275struct workqueue_struct *system_unbound_wq __read_mostly; 254struct workqueue_struct *system_unbound_wq __read_mostly;
276EXPORT_SYMBOL_GPL(system_unbound_wq);
277struct workqueue_struct *system_freezable_wq __read_mostly; 255struct workqueue_struct *system_freezable_wq __read_mostly;
256EXPORT_SYMBOL_GPL(system_wq);
257EXPORT_SYMBOL_GPL(system_long_wq);
258EXPORT_SYMBOL_GPL(system_nrt_wq);
259EXPORT_SYMBOL_GPL(system_unbound_wq);
278EXPORT_SYMBOL_GPL(system_freezable_wq); 260EXPORT_SYMBOL_GPL(system_freezable_wq);
279 261
280#define CREATE_TRACE_POINTS 262#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 263#include <trace/events/workqueue.h>
282 264
283#define for_each_worker_pool(pool, gcwq) \
284 for ((pool) = &(gcwq)->pools[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
286
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 265#define for_each_busy_worker(worker, i, pos, gcwq) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 266 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 267 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -464,7 +442,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
464 * try_to_wake_up(). Put it in a separate cacheline. 442 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 443 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 444static DEFINE_PER_CPU(struct global_cwq, global_cwq);
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 445static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
468 446
469/* 447/*
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 448 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -472,17 +450,10 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS])
  * workers have WORKER_UNBOUND set.
  */
 static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
-	[0 ... NR_WORKER_POOLS - 1]	= ATOMIC_INIT(0),	/* always 0 */
-};
+static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);	/* always 0 */
 
 static int worker_thread(void *__worker);
 
-static int worker_pool_pri(struct worker_pool *pool)
-{
-	return pool - pool->gcwq->pools;
-}
-
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
 	if (cpu != WORK_CPU_UNBOUND)
@@ -491,23 +462,25 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
 		return &unbound_global_cwq;
 }
 
-static atomic_t *get_pool_nr_running(struct worker_pool *pool)
+static atomic_t *get_gcwq_nr_running(unsigned int cpu)
 {
-	int cpu = pool->gcwq->cpu;
-	int idx = worker_pool_pri(pool);
-
 	if (cpu != WORK_CPU_UNBOUND)
-		return &per_cpu(pool_nr_running, cpu)[idx];
+		return &per_cpu(gcwq_nr_running, cpu);
 	else
-		return &unbound_pool_nr_running[idx];
+		return &unbound_gcwq_nr_running;
 }
 
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 					    struct workqueue_struct *wq)
 {
 	if (!(wq->flags & WQ_UNBOUND)) {
-		if (likely(cpu < nr_cpu_ids))
+		if (likely(cpu < nr_cpu_ids)) {
+#ifdef CONFIG_SMP
 			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+#else
+			return wq->cpu_wq.single;
+#endif
+		}
 	} else if (likely(cpu == WORK_CPU_UNBOUND))
 		return wq->cpu_wq.single;
 	return NULL;
@@ -530,24 +503,18 @@ static int work_next_color(int color)
530} 503}
531 504
532/* 505/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 506 * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
534 * contain the pointer to the queued cwq. Once execution starts, the flag 507 * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
535 * is cleared and the high bits contain OFFQ flags and CPU number. 508 * cleared and the work data contains the cpu number it was last on.
536 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear
539 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set.
541 * 509 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 510 * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
543 * a work. gcwq is available once the work has been queued anywhere after 511 * cwq, cpu or clear work->data. These functions should only be
544 * initialization until it is sync canceled. cwq is available only while 512 * called while the work is owned - ie. while the PENDING bit is set.
545 * the work item is queued.
546 * 513 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 514 * get_work_[g]cwq() can be used to obtain the gcwq or cwq
548 * canceled. While being canceled, a work item may have its PENDING set 515 * corresponding to a work. gcwq is available once the work has been
549 * but stay off timer and worklist for arbitrarily long and nobody should 516 * queued anywhere after initialization. cwq is available only from
550 * try to steal the PENDING bit. 517 * queueing until execution starts.
551 */ 518 */
552static inline void set_work_data(struct work_struct *work, unsigned long data, 519static inline void set_work_data(struct work_struct *work, unsigned long data,
553 unsigned long flags) 520 unsigned long flags)
@@ -564,22 +531,13 @@ static void set_work_cwq(struct work_struct *work,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 531 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
565} 532}
566 533
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 534static void set_work_cpu(struct work_struct *work, unsigned int cpu)
568 unsigned int cpu)
569{ 535{
570 /* 536 set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
571 * The following wmb is paired with the implied mb in
572 * test_and_set_bit(PENDING) and ensures all updates to @work made
573 * here are visible to and precede any updates by the next PENDING
574 * owner.
575 */
576 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
578} 537}
579 538
580static void clear_work_data(struct work_struct *work) 539static void clear_work_data(struct work_struct *work)
581{ 540{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 541 set_work_data(work, WORK_STRUCT_NO_CPU, 0);
584} 542}
585 543
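The comment block rewritten above describes how work->data multiplexes a cwq pointer with a few low-order flag bits while a work item is queued. A rough userspace sketch of that packing trick follows; the flag names, the number of flag bits and the fake_cwq type are invented for illustration and are not the kernel's real WORK_STRUCT_* definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_BITS	4				/* low bits reserved for flags */
#define FLAG_MASK	((1UL << FLAG_BITS) - 1)
#define FLAG_PENDING	(1UL << 0)			/* work is pending execution */
#define FLAG_CWQ	(1UL << 1)			/* data holds a cwq pointer */

struct fake_cwq { int dummy; };

/* Pack a cwq pointer and flags into one word.  This only works because the
 * cwq is aligned to at least 1 << FLAG_BITS, so its low bits are zero. */
static uintptr_t pack_cwq(struct fake_cwq *cwq, uintptr_t flags)
{
	uintptr_t p = (uintptr_t)cwq;

	assert((p & FLAG_MASK) == 0);		/* alignment requirement */
	return p | flags;
}

static struct fake_cwq *unpack_cwq(uintptr_t data)
{
	return (data & FLAG_CWQ) ? (struct fake_cwq *)(data & ~FLAG_MASK) : NULL;
}

int main(void)
{
	static struct fake_cwq cwq __attribute__((aligned(1 << FLAG_BITS)));
	uintptr_t data = pack_cwq(&cwq, FLAG_PENDING | FLAG_CWQ);

	printf("cwq recovered: %s\n", unpack_cwq(data) == &cwq ? "yes" : "no");
	return 0;
}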
@@ -600,9 +558,9 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
600 558
601 if (data & WORK_STRUCT_CWQ) 559 if (data & WORK_STRUCT_CWQ)
602 return ((struct cpu_workqueue_struct *) 560 return ((struct cpu_workqueue_struct *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 561 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
604 562
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 563 cpu = data >> WORK_STRUCT_FLAG_BITS;
606 if (cpu == WORK_CPU_NONE) 564 if (cpu == WORK_CPU_NONE)
607 return NULL; 565 return NULL;
608 566
@@ -610,86 +568,61 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
610 return get_gcwq(cpu); 568 return get_gcwq(cpu);
611} 569}
612 570
613static void mark_work_canceling(struct work_struct *work)
614{
615 struct global_cwq *gcwq = get_work_gcwq(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
619 WORK_STRUCT_PENDING);
620}
621
622static bool work_is_canceling(struct work_struct *work)
623{
624 unsigned long data = atomic_long_read(&work->data);
625
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
627}
628
629/* 571/*
630 * Policy functions. These define the policies on how the global worker 572 * Policy functions. These define the policies on how the global
631 * pools are managed. Unless noted otherwise, these functions assume that 573 * worker pool is managed. Unless noted otherwise, these functions
632 * they're being called with gcwq->lock held. 574 * assume that they're being called with gcwq->lock held.
633 */ 575 */
634 576
635static bool __need_more_worker(struct worker_pool *pool) 577static bool __need_more_worker(struct global_cwq *gcwq)
636{ 578{
637 return !atomic_read(get_pool_nr_running(pool)); 579 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
580 gcwq->flags & GCWQ_HIGHPRI_PENDING;
638} 581}
639 582
640/* 583/*
641 * Need to wake up a worker? Called from anything but currently 584 * Need to wake up a worker? Called from anything but currently
642 * running workers. 585 * running workers.
643 *
644 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the
646 * worklist isn't empty.
647 */ 586 */
648static bool need_more_worker(struct worker_pool *pool) 587static bool need_more_worker(struct global_cwq *gcwq)
649{ 588{
650 return !list_empty(&pool->worklist) && __need_more_worker(pool); 589 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
651} 590}
652 591
653/* Can I start working? Called from busy but !running workers. */ 592/* Can I start working? Called from busy but !running workers. */
654static bool may_start_working(struct worker_pool *pool) 593static bool may_start_working(struct global_cwq *gcwq)
655{ 594{
656 return pool->nr_idle; 595 return gcwq->nr_idle;
657} 596}
658 597
659/* Do I need to keep working? Called from currently running workers. */ 598/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 599static bool keep_working(struct global_cwq *gcwq)
661{ 600{
662 atomic_t *nr_running = get_pool_nr_running(pool); 601 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
663 602
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1; 603 return !list_empty(&gcwq->worklist) &&
604 (atomic_read(nr_running) <= 1 ||
605 gcwq->flags & GCWQ_HIGHPRI_PENDING);
665} 606}
666 607
667/* Do we need a new worker? Called from manager. */ 608/* Do we need a new worker? Called from manager. */
668static bool need_to_create_worker(struct worker_pool *pool) 609static bool need_to_create_worker(struct global_cwq *gcwq)
669{ 610{
670 return need_more_worker(pool) && !may_start_working(pool); 611 return need_more_worker(gcwq) && !may_start_working(gcwq);
671} 612}
672 613
673/* Do I need to be the manager? */ 614/* Do I need to be the manager? */
674static bool need_to_manage_workers(struct worker_pool *pool) 615static bool need_to_manage_workers(struct global_cwq *gcwq)
675{ 616{
676 return need_to_create_worker(pool) || 617 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
677 (pool->flags & POOL_MANAGE_WORKERS);
678} 618}
679 619
680/* Do we have too many workers and should some go away? */ 620/* Do we have too many workers and should some go away? */
681static bool too_many_workers(struct worker_pool *pool) 621static bool too_many_workers(struct global_cwq *gcwq)
682{ 622{
683 bool managing = pool->flags & POOL_MANAGING_WORKERS; 623 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
684 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 624 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
685 int nr_busy = pool->nr_workers - nr_idle; 625 int nr_busy = gcwq->nr_workers - nr_idle;
686
687 /*
688 * nr_idle and idle_list may disagree if idle rebinding is in
689 * progress. Never return %true if idle_list is empty.
690 */
691 if (list_empty(&pool->idle_list))
692 return false;
693 626
694 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 627 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
695} 628}
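The policy helpers above gate worker creation and destruction; too_many_workers() in particular tolerates two spare idle workers plus one idle worker per MAX_IDLE_WORKERS_RATIO busy ones. A standalone sketch of that check follows; the ratio value here is a placeholder, since the constant's definition sits outside this hunk.

#include <stdbool.h>
#include <stdio.h>

#define MAX_IDLE_WORKERS_RATIO	4	/* placeholder value for illustration */

/* Mirror of the check in too_many_workers(): allow up to two spare idle
 * workers, plus one idle worker for every MAX_IDLE_WORKERS_RATIO busy ones. */
static bool too_many_workers(int nr_workers, int nr_idle, bool managing)
{
	int idle = nr_idle + (managing ? 1 : 0);	/* manager counts as idle */
	int busy = nr_workers - idle;

	return idle > 2 && (idle - 2) * MAX_IDLE_WORKERS_RATIO >= busy;
}

int main(void)
{
	printf("%d\n", too_many_workers(10, 3, false));	/* 1 spare over the 2 allowed, 7 busy -> 0 */
	printf("%d\n", too_many_workers(10, 8, false));	/* 6 spares, only 2 busy -> 1 */
	return 0;
}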
@@ -699,26 +632,26 @@ static bool too_many_workers(struct worker_pool *pool)
699 */ 632 */
700 633
701/* Return the first worker. Safe with preemption disabled */ 634/* Return the first worker. Safe with preemption disabled */
702static struct worker *first_worker(struct worker_pool *pool) 635static struct worker *first_worker(struct global_cwq *gcwq)
703{ 636{
704 if (unlikely(list_empty(&pool->idle_list))) 637 if (unlikely(list_empty(&gcwq->idle_list)))
705 return NULL; 638 return NULL;
706 639
707 return list_first_entry(&pool->idle_list, struct worker, entry); 640 return list_first_entry(&gcwq->idle_list, struct worker, entry);
708} 641}
709 642
710/** 643/**
711 * wake_up_worker - wake up an idle worker 644 * wake_up_worker - wake up an idle worker
712 * @pool: worker pool to wake worker from 645 * @gcwq: gcwq to wake worker for
713 * 646 *
714 * Wake up the first idle worker of @pool. 647 * Wake up the first idle worker of @gcwq.
715 * 648 *
716 * CONTEXT: 649 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 650 * spin_lock_irq(gcwq->lock).
718 */ 651 */
719static void wake_up_worker(struct worker_pool *pool) 652static void wake_up_worker(struct global_cwq *gcwq)
720{ 653{
721 struct worker *worker = first_worker(pool); 654 struct worker *worker = first_worker(gcwq);
722 655
723 if (likely(worker)) 656 if (likely(worker))
724 wake_up_process(worker->task); 657 wake_up_process(worker->task);
@@ -739,10 +672,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739{ 672{
740 struct worker *worker = kthread_data(task); 673 struct worker *worker = kthread_data(task);
741 674
742 if (!(worker->flags & WORKER_NOT_RUNNING)) { 675 if (!(worker->flags & WORKER_NOT_RUNNING))
743 WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); 676 atomic_inc(get_gcwq_nr_running(cpu));
744 atomic_inc(get_pool_nr_running(worker->pool));
745 }
746} 677}
747 678
748/** 679/**
@@ -764,8 +695,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
764 unsigned int cpu) 695 unsigned int cpu)
765{ 696{
766 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 697 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
767 struct worker_pool *pool = worker->pool; 698 struct global_cwq *gcwq = get_gcwq(cpu);
768 atomic_t *nr_running = get_pool_nr_running(pool); 699 atomic_t *nr_running = get_gcwq_nr_running(cpu);
769 700
770 if (worker->flags & WORKER_NOT_RUNNING) 701 if (worker->flags & WORKER_NOT_RUNNING)
771 return NULL; 702 return NULL;
@@ -778,14 +709,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
778 * worklist not empty test sequence is in insert_work(). 709 * worklist not empty test sequence is in insert_work().
779 * Please read comment there. 710 * Please read comment there.
780 * 711 *
781 * NOT_RUNNING is clear. This means that we're bound to and 712 * NOT_RUNNING is clear. This means that trustee is not in
782 * running on the local cpu w/ rq lock held and preemption 713 * charge and we're running on the local cpu w/ rq lock held
783 * disabled, which in turn means that none else could be 714 * and preemption disabled, which in turn means that none else
784 * manipulating idle_list, so dereferencing idle_list without gcwq 715 * could be manipulating idle_list, so dereferencing idle_list
785 * lock is safe. 716 * without gcwq lock is safe.
786 */ 717 */
787 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 718 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
788 to_wakeup = first_worker(pool); 719 to_wakeup = first_worker(gcwq);
789 return to_wakeup ? to_wakeup->task : NULL; 720 return to_wakeup ? to_wakeup->task : NULL;
790} 721}
791 722
@@ -805,7 +736,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
805static inline void worker_set_flags(struct worker *worker, unsigned int flags, 736static inline void worker_set_flags(struct worker *worker, unsigned int flags,
806 bool wakeup) 737 bool wakeup)
807{ 738{
808 struct worker_pool *pool = worker->pool; 739 struct global_cwq *gcwq = worker->gcwq;
809 740
810 WARN_ON_ONCE(worker->task != current); 741 WARN_ON_ONCE(worker->task != current);
811 742
@@ -816,12 +747,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
816 */ 747 */
817 if ((flags & WORKER_NOT_RUNNING) && 748 if ((flags & WORKER_NOT_RUNNING) &&
818 !(worker->flags & WORKER_NOT_RUNNING)) { 749 !(worker->flags & WORKER_NOT_RUNNING)) {
819 atomic_t *nr_running = get_pool_nr_running(pool); 750 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
820 751
821 if (wakeup) { 752 if (wakeup) {
822 if (atomic_dec_and_test(nr_running) && 753 if (atomic_dec_and_test(nr_running) &&
823 !list_empty(&pool->worklist)) 754 !list_empty(&gcwq->worklist))
824 wake_up_worker(pool); 755 wake_up_worker(gcwq);
825 } else 756 } else
826 atomic_dec(nr_running); 757 atomic_dec(nr_running);
827 } 758 }
@@ -841,7 +772,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
841 */ 772 */
842static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 773static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
843{ 774{
844 struct worker_pool *pool = worker->pool; 775 struct global_cwq *gcwq = worker->gcwq;
845 unsigned int oflags = worker->flags; 776 unsigned int oflags = worker->flags;
846 777
847 WARN_ON_ONCE(worker->task != current); 778 WARN_ON_ONCE(worker->task != current);
@@ -855,7 +786,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
855 */ 786 */
856 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 787 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
857 if (!(worker->flags & WORKER_NOT_RUNNING)) 788 if (!(worker->flags & WORKER_NOT_RUNNING))
858 atomic_inc(get_pool_nr_running(pool)); 789 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
859} 790}
860 791
861/** 792/**
@@ -939,203 +870,40 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
939} 870}
940 871
941/** 872/**
942 * move_linked_works - move linked works to a list 873 * gcwq_determine_ins_pos - find insertion position
943 * @work: start of series of works to be scheduled 874 * @gcwq: gcwq of interest
944 * @head: target list to append @work to 875 * @cwq: cwq a work is being queued for
945 * @nextp: out paramter for nested worklist walking
946 *
947 * Schedule linked works starting from @work to @head. Work series to
948 * be scheduled starts at @work and includes any consecutive work with
949 * WORK_STRUCT_LINKED set in its predecessor.
950 *
951 * If @nextp is not NULL, it's updated to point to the next work of
952 * the last scheduled work. This allows move_linked_works() to be
953 * nested inside outer list_for_each_entry_safe().
954 *
955 * CONTEXT:
956 * spin_lock_irq(gcwq->lock).
957 */
958static void move_linked_works(struct work_struct *work, struct list_head *head,
959 struct work_struct **nextp)
960{
961 struct work_struct *n;
962
963 /*
964 * Linked worklist will always end before the end of the list,
965 * use NULL for list head.
966 */
967 list_for_each_entry_safe_from(work, n, NULL, entry) {
968 list_move_tail(&work->entry, head);
969 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
970 break;
971 }
972
973 /*
974 * If we're already inside safe list traversal and have moved
975 * multiple works to the scheduled queue, the next position
976 * needs to be updated.
977 */
978 if (nextp)
979 *nextp = n;
980}
981
982static void cwq_activate_delayed_work(struct work_struct *work)
983{
984 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
985
986 trace_workqueue_activate_work(work);
987 move_linked_works(work, &cwq->pool->worklist, NULL);
988 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
989 cwq->nr_active++;
990}
991
992static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
993{
994 struct work_struct *work = list_first_entry(&cwq->delayed_works,
995 struct work_struct, entry);
996
997 cwq_activate_delayed_work(work);
998}
999
1000/**
1001 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1002 * @cwq: cwq of interest
1003 * @color: color of work which left the queue
1004 * 876 *
1005 * A work either has completed or is removed from pending queue, 877 * A work for @cwq is about to be queued on @gcwq, determine insertion
1006 * decrement nr_in_flight of its cwq and handle workqueue flushing. 878 * position for the work. If @cwq is for HIGHPRI wq, the work is
879 * queued at the head of the queue but in FIFO order with respect to
880 * other HIGHPRI works; otherwise, at the end of the queue. This
881 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
882 * there are HIGHPRI works pending.
1007 * 883 *
1008 * CONTEXT: 884 * CONTEXT:
1009 * spin_lock_irq(gcwq->lock). 885 * spin_lock_irq(gcwq->lock).
1010 */
1011static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1012{
1013 /* ignore uncolored works */
1014 if (color == WORK_NO_COLOR)
1015 return;
1016
1017 cwq->nr_in_flight[color]--;
1018
1019 cwq->nr_active--;
1020 if (!list_empty(&cwq->delayed_works)) {
1021 /* one down, submit a delayed one */
1022 if (cwq->nr_active < cwq->max_active)
1023 cwq_activate_first_delayed(cwq);
1024 }
1025
1026 /* is flush in progress and are we at the flushing tip? */
1027 if (likely(cwq->flush_color != color))
1028 return;
1029
1030 /* are there still in-flight works? */
1031 if (cwq->nr_in_flight[color])
1032 return;
1033
1034 /* this cwq is done, clear flush_color */
1035 cwq->flush_color = -1;
1036
1037 /*
1038 * If this was the last cwq, wake up the first flusher. It
1039 * will handle the rest.
1040 */
1041 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1042 complete(&cwq->wq->first_flusher->done);
1043}
1044
1045/**
1046 * try_to_grab_pending - steal work item from worklist and disable irq
1047 * @work: work item to steal
1048 * @is_dwork: @work is a delayed_work
1049 * @flags: place to store irq state
1050 *
1051 * Try to grab PENDING bit of @work. This function can handle @work in any
1052 * stable state - idle, on timer or on worklist. Return values are
1053 *
1054 * 1 if @work was pending and we successfully stole PENDING
1055 * 0 if @work was idle and we claimed PENDING
1056 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1057 * -ENOENT if someone else is canceling @work, this state may persist
1058 * for arbitrarily long
1059 * 886 *
1060 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 887 * RETURNS:
1061 * interrupted while holding PENDING and @work off queue, irq must be 888 * Pointer to inserstion position.
1062 * disabled on entry. This, combined with delayed_work->timer being
1063 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
1064 *
1065 * On successful return, >= 0, irq is disabled and the caller is
1066 * responsible for releasing it using local_irq_restore(*@flags).
1067 *
1068 * This function is safe to call from any context including IRQ handler.
1069 */ 889 */
1070static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 890static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
1071 unsigned long *flags) 891 struct cpu_workqueue_struct *cwq)
1072{ 892{
1073 struct global_cwq *gcwq; 893 struct work_struct *twork;
1074 894
1075 local_irq_save(*flags); 895 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
896 return &gcwq->worklist;
1076 897
1077 /* try to steal the timer if it exists */ 898 list_for_each_entry(twork, &gcwq->worklist, entry) {
1078 if (is_dwork) { 899 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
1079 struct delayed_work *dwork = to_delayed_work(work);
1080 900
1081 /* 901 if (!(tcwq->wq->flags & WQ_HIGHPRI))
1082 * dwork->timer is irqsafe. If del_timer() fails, it's 902 break;
1083 * guaranteed that the timer is not queued anywhere and not
1084 * running on the local CPU.
1085 */
1086 if (likely(del_timer(&dwork->timer)))
1087 return 1;
1088 } 903 }
1089 904
1090 /* try to claim PENDING the normal way */ 905 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
1091 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) 906 return &twork->entry;
1092 return 0;
1093
1094 /*
1095 * The queueing is in progress, or it is already queued. Try to
1096 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1097 */
1098 gcwq = get_work_gcwq(work);
1099 if (!gcwq)
1100 goto fail;
1101
1102 spin_lock(&gcwq->lock);
1103 if (!list_empty(&work->entry)) {
1104 /*
1105 * This work is queued, but perhaps we locked the wrong gcwq.
1106 * In that case we must see the new value after rmb(), see
1107 * insert_work()->wmb().
1108 */
1109 smp_rmb();
1110 if (gcwq == get_work_gcwq(work)) {
1111 debug_work_deactivate(work);
1112
1113 /*
1114 * A delayed work item cannot be grabbed directly
1115 * because it might have linked NO_COLOR work items
1116 * which, if left on the delayed_list, will confuse
1117 * cwq->nr_active management later on and cause
1118 * stall. Make sure the work item is activated
1119 * before grabbing.
1120 */
1121 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1122 cwq_activate_delayed_work(work);
1123
1124 list_del_init(&work->entry);
1125 cwq_dec_nr_in_flight(get_work_cwq(work),
1126 get_work_color(work));
1127
1128 spin_unlock(&gcwq->lock);
1129 return 1;
1130 }
1131 }
1132 spin_unlock(&gcwq->lock);
1133fail:
1134 local_irq_restore(*flags);
1135 if (work_is_canceling(work))
1136 return -ENOENT;
1137 cpu_relax();
1138 return -EAGAIN;
1139} 907}
1140 908
1141/** 909/**
@@ -1155,7 +923,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1155 struct work_struct *work, struct list_head *head, 923 struct work_struct *work, struct list_head *head,
1156 unsigned int extra_flags) 924 unsigned int extra_flags)
1157{ 925{
1158 struct worker_pool *pool = cwq->pool; 926 struct global_cwq *gcwq = cwq->gcwq;
1159 927
1160 /* we own @work, set data and link */ 928 /* we own @work, set data and link */
1161 set_work_cwq(work, cwq, extra_flags); 929 set_work_cwq(work, cwq, extra_flags);
@@ -1175,8 +943,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1175 */ 943 */
1176 smp_mb(); 944 smp_mb();
1177 945
1178 if (__need_more_worker(pool)) 946 if (__need_more_worker(gcwq))
1179 wake_up_worker(pool); 947 wake_up_worker(gcwq);
1180} 948}
1181 949
1182/* 950/*
@@ -1218,15 +986,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1218 struct cpu_workqueue_struct *cwq; 986 struct cpu_workqueue_struct *cwq;
1219 struct list_head *worklist; 987 struct list_head *worklist;
1220 unsigned int work_flags; 988 unsigned int work_flags;
1221 unsigned int req_cpu = cpu; 989 unsigned long flags;
1222
1223 /*
1224 * While a work item is PENDING && off queue, a task trying to
1225 * steal the PENDING will busy-loop waiting for it to either get
1226 * queued or lose PENDING. Grabbing PENDING and queueing should
1227 * happen with IRQ disabled.
1228 */
1229 WARN_ON_ONCE(!irqs_disabled());
1230 990
1231 debug_work_activate(work); 991 debug_work_activate(work);
1232 992
@@ -1239,22 +999,21 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1239 if (!(wq->flags & WQ_UNBOUND)) { 999 if (!(wq->flags & WQ_UNBOUND)) {
1240 struct global_cwq *last_gcwq; 1000 struct global_cwq *last_gcwq;
1241 1001
1242 if (cpu == WORK_CPU_UNBOUND) 1002 if (unlikely(cpu == WORK_CPU_UNBOUND))
1243 cpu = raw_smp_processor_id(); 1003 cpu = raw_smp_processor_id();
1244 1004
1245 /* 1005 /*
1246 * It's multi cpu. If @work was previously on a different 1006 * It's multi cpu. If @wq is non-reentrant and @work
1247 * cpu, it might still be running there, in which case the 1007 * was previously on a different cpu, it might still
1248 * work needs to be queued on that cpu to guarantee 1008 * be running there, in which case the work needs to
1249 * non-reentrancy. 1009 * be queued on that cpu to guarantee non-reentrance.
1250 */ 1010 */
1251 gcwq = get_gcwq(cpu); 1011 gcwq = get_gcwq(cpu);
1252 last_gcwq = get_work_gcwq(work); 1012 if (wq->flags & WQ_NON_REENTRANT &&
1253 1013 (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
1254 if (last_gcwq && last_gcwq != gcwq) {
1255 struct worker *worker; 1014 struct worker *worker;
1256 1015
1257 spin_lock(&last_gcwq->lock); 1016 spin_lock_irqsave(&last_gcwq->lock, flags);
1258 1017
1259 worker = find_worker_executing_work(last_gcwq, work); 1018 worker = find_worker_executing_work(last_gcwq, work);
1260 1019
@@ -1262,25 +1021,21 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1262 gcwq = last_gcwq; 1021 gcwq = last_gcwq;
1263 else { 1022 else {
1264 /* meh... not running there, queue here */ 1023 /* meh... not running there, queue here */
1265 spin_unlock(&last_gcwq->lock); 1024 spin_unlock_irqrestore(&last_gcwq->lock, flags);
1266 spin_lock(&gcwq->lock); 1025 spin_lock_irqsave(&gcwq->lock, flags);
1267 } 1026 }
1268 } else { 1027 } else
1269 spin_lock(&gcwq->lock); 1028 spin_lock_irqsave(&gcwq->lock, flags);
1270 }
1271 } else { 1029 } else {
1272 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1030 gcwq = get_gcwq(WORK_CPU_UNBOUND);
1273 spin_lock(&gcwq->lock); 1031 spin_lock_irqsave(&gcwq->lock, flags);
1274 } 1032 }
1275 1033
1276 /* gcwq determined, get cwq and queue */ 1034 /* gcwq determined, get cwq and queue */
1277 cwq = get_cwq(gcwq->cpu, wq); 1035 cwq = get_cwq(gcwq->cpu, wq);
1278 trace_workqueue_queue_work(req_cpu, cwq, work); 1036 trace_workqueue_queue_work(cpu, cwq, work);
1279 1037
1280 if (WARN_ON(!list_empty(&work->entry))) { 1038 BUG_ON(!list_empty(&work->entry));
1281 spin_unlock(&gcwq->lock);
1282 return;
1283 }
1284 1039
1285 cwq->nr_in_flight[cwq->work_color]++; 1040 cwq->nr_in_flight[cwq->work_color]++;
1286 work_flags = work_color_to_flags(cwq->work_color); 1041 work_flags = work_color_to_flags(cwq->work_color);
@@ -1288,7 +1043,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1288 if (likely(cwq->nr_active < cwq->max_active)) { 1043 if (likely(cwq->nr_active < cwq->max_active)) {
1289 trace_workqueue_activate_work(work); 1044 trace_workqueue_activate_work(work);
1290 cwq->nr_active++; 1045 cwq->nr_active++;
1291 worklist = &cwq->pool->worklist; 1046 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1292 } else { 1047 } else {
1293 work_flags |= WORK_STRUCT_DELAYED; 1048 work_flags |= WORK_STRUCT_DELAYED;
1294 worklist = &cwq->delayed_works; 1049 worklist = &cwq->delayed_works;
@@ -1296,152 +1051,61 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1296 1051
1297 insert_work(cwq, work, worklist, work_flags); 1052 insert_work(cwq, work, worklist, work_flags);
1298 1053
1299 spin_unlock(&gcwq->lock); 1054 spin_unlock_irqrestore(&gcwq->lock, flags);
1300} 1055}
1301 1056
1302/** 1057/**
1303 * queue_work_on - queue work on specific cpu 1058 * queue_work - queue work on a workqueue
1304 * @cpu: CPU number to execute work on
1305 * @wq: workqueue to use 1059 * @wq: workqueue to use
1306 * @work: work to queue 1060 * @work: work to queue
1307 * 1061 *
1308 * Returns %false if @work was already on a queue, %true otherwise. 1062 * Returns 0 if @work was already on a queue, non-zero otherwise.
1309 * 1063 *
1310 * We queue the work to a specific CPU, the caller must ensure it 1064 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1311 * can't go away. 1065 * it can be processed by another CPU.
1312 */ 1066 */
1313bool queue_work_on(int cpu, struct workqueue_struct *wq, 1067int queue_work(struct workqueue_struct *wq, struct work_struct *work)
1314 struct work_struct *work)
1315{ 1068{
1316 bool ret = false; 1069 int ret;
1317 unsigned long flags;
1318
1319 local_irq_save(flags);
1320 1070
1321 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { 1071 ret = queue_work_on(get_cpu(), wq, work);
1322 __queue_work(cpu, wq, work); 1072 put_cpu();
1323 ret = true;
1324 }
1325 1073
1326 local_irq_restore(flags);
1327 return ret; 1074 return ret;
1328} 1075}
1329EXPORT_SYMBOL_GPL(queue_work_on); 1076EXPORT_SYMBOL_GPL(queue_work);
1330 1077
1331/** 1078/**
1332 * queue_work - queue work on a workqueue 1079 * queue_work_on - queue work on specific cpu
1080 * @cpu: CPU number to execute work on
1333 * @wq: workqueue to use 1081 * @wq: workqueue to use
1334 * @work: work to queue 1082 * @work: work to queue
1335 * 1083 *
1336 * Returns %false if @work was already on a queue, %true otherwise. 1084 * Returns 0 if @work was already on a queue, non-zero otherwise.
1337 * 1085 *
1338 * We queue the work to the CPU on which it was submitted, but if the CPU dies 1086 * We queue the work to a specific CPU, the caller must ensure it
1339 * it can be processed by another CPU. 1087 * can't go away.
1340 */ 1088 */
1341bool queue_work(struct workqueue_struct *wq, struct work_struct *work) 1089int
1090queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1342{ 1091{
1343 return queue_work_on(WORK_CPU_UNBOUND, wq, work); 1092 int ret = 0;
1344}
1345EXPORT_SYMBOL_GPL(queue_work);
1346
1347void delayed_work_timer_fn(unsigned long __data)
1348{
1349 struct delayed_work *dwork = (struct delayed_work *)__data;
1350 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1351
1352 /* should have been called from irqsafe timer with irq already off */
1353 __queue_work(dwork->cpu, cwq->wq, &dwork->work);
1354}
1355EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
1356
1357static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1358 struct delayed_work *dwork, unsigned long delay)
1359{
1360 struct timer_list *timer = &dwork->timer;
1361 struct work_struct *work = &dwork->work;
1362 unsigned int lcpu;
1363
1364 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1365 timer->data != (unsigned long)dwork);
1366 WARN_ON_ONCE(timer_pending(timer));
1367 WARN_ON_ONCE(!list_empty(&work->entry));
1368 1093
1369 /* 1094 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1370 * If @delay is 0, queue @dwork->work immediately. This is for 1095 __queue_work(cpu, wq, work);
1371 * both optimization and correctness. The earliest @timer can 1096 ret = 1;
1372 * expire is on the closest next tick and delayed_work users depend
1373 * on that there's no such delay when @delay is 0.
1374 */
1375 if (!delay) {
1376 __queue_work(cpu, wq, &dwork->work);
1377 return;
1378 }
1379
1380 timer_stats_timer_set_start_info(&dwork->timer);
1381
1382 /*
1383 * This stores cwq for the moment, for the timer_fn. Note that the
1384 * work's gcwq is preserved to allow reentrance detection for
1385 * delayed works.
1386 */
1387 if (!(wq->flags & WQ_UNBOUND)) {
1388 struct global_cwq *gcwq = get_work_gcwq(work);
1389
1390 /*
1391 * If we cannot get the last gcwq from @work directly,
1392 * select the last CPU such that it avoids unnecessarily
1393 * triggering non-reentrancy check in __queue_work().
1394 */
1395 lcpu = cpu;
1396 if (gcwq)
1397 lcpu = gcwq->cpu;
1398 if (lcpu == WORK_CPU_UNBOUND)
1399 lcpu = raw_smp_processor_id();
1400 } else {
1401 lcpu = WORK_CPU_UNBOUND;
1402 } 1097 }
1403 1098 return ret;
1404 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1405
1406 dwork->cpu = cpu;
1407 timer->expires = jiffies + delay;
1408
1409 if (unlikely(cpu != WORK_CPU_UNBOUND))
1410 add_timer_on(timer, cpu);
1411 else
1412 add_timer(timer);
1413} 1099}
1100EXPORT_SYMBOL_GPL(queue_work_on);
1414 1101
1415/** 1102static void delayed_work_timer_fn(unsigned long __data)
1416 * queue_delayed_work_on - queue work on specific CPU after delay
1417 * @cpu: CPU number to execute work on
1418 * @wq: workqueue to use
1419 * @dwork: work to queue
1420 * @delay: number of jiffies to wait before queueing
1421 *
1422 * Returns %false if @work was already on a queue, %true otherwise. If
1423 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1424 * execution.
1425 */
1426bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1427 struct delayed_work *dwork, unsigned long delay)
1428{ 1103{
1429 struct work_struct *work = &dwork->work; 1104 struct delayed_work *dwork = (struct delayed_work *)__data;
1430 bool ret = false; 1105 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1431 unsigned long flags;
1432
1433 /* read the comment in __queue_work() */
1434 local_irq_save(flags);
1435
1436 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1437 __queue_delayed_work(cpu, wq, dwork, delay);
1438 ret = true;
1439 }
1440 1106
1441 local_irq_restore(flags); 1107 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1442 return ret;
1443} 1108}
1444EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1445 1109
1446/** 1110/**
1447 * queue_delayed_work - queue work on a workqueue after delay 1111 * queue_delayed_work - queue work on a workqueue after delay
@@ -1449,67 +1113,72 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1449 * @dwork: delayable work to queue 1113 * @dwork: delayable work to queue
1450 * @delay: number of jiffies to wait before queueing 1114 * @delay: number of jiffies to wait before queueing
1451 * 1115 *
1452 * Equivalent to queue_delayed_work_on() but tries to use the local CPU. 1116 * Returns 0 if @work was already on a queue, non-zero otherwise.
1453 */ 1117 */
1454bool queue_delayed_work(struct workqueue_struct *wq, 1118int queue_delayed_work(struct workqueue_struct *wq,
1455 struct delayed_work *dwork, unsigned long delay) 1119 struct delayed_work *dwork, unsigned long delay)
1456{ 1120{
1457 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); 1121 if (delay == 0)
1122 return queue_work(wq, &dwork->work);
1123
1124 return queue_delayed_work_on(-1, wq, dwork, delay);
1458} 1125}
1459EXPORT_SYMBOL_GPL(queue_delayed_work); 1126EXPORT_SYMBOL_GPL(queue_delayed_work);
1460 1127
1461/** 1128/**
1462 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1129 * queue_delayed_work_on - queue work on specific CPU after delay
1463 * @cpu: CPU number to execute work on 1130 * @cpu: CPU number to execute work on
1464 * @wq: workqueue to use 1131 * @wq: workqueue to use
1465 * @dwork: work to queue 1132 * @dwork: work to queue
1466 * @delay: number of jiffies to wait before queueing 1133 * @delay: number of jiffies to wait before queueing
1467 * 1134 *
1468 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, 1135 * Returns 0 if @work was already on a queue, non-zero otherwise.
1469 * modify @dwork's timer so that it expires after @delay. If @delay is
1470 * zero, @work is guaranteed to be scheduled immediately regardless of its
1471 * current state.
1472 *
1473 * Returns %false if @dwork was idle and queued, %true if @dwork was
1474 * pending and its timer was modified.
1475 *
1476 * This function is safe to call from any context including IRQ handler.
1477 * See try_to_grab_pending() for details.
1478 */ 1136 */
1479bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, 1137int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1480 struct delayed_work *dwork, unsigned long delay) 1138 struct delayed_work *dwork, unsigned long delay)
1481{ 1139{
1482 unsigned long flags; 1140 int ret = 0;
1483 int ret; 1141 struct timer_list *timer = &dwork->timer;
1142 struct work_struct *work = &dwork->work;
1484 1143
1485 do { 1144 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1486 ret = try_to_grab_pending(&dwork->work, true, &flags); 1145 unsigned int lcpu;
1487 } while (unlikely(ret == -EAGAIN));
1488 1146
1489 if (likely(ret >= 0)) { 1147 BUG_ON(timer_pending(timer));
1490 __queue_delayed_work(cpu, wq, dwork, delay); 1148 BUG_ON(!list_empty(&work->entry));
1491 local_irq_restore(flags);
1492 }
1493 1149
1494 /* -ENOENT from try_to_grab_pending() becomes %true */ 1150 timer_stats_timer_set_start_info(&dwork->timer);
1495 return ret;
1496}
1497EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1498 1151
1499/** 1152 /*
1500 * mod_delayed_work - modify delay of or queue a delayed work 1153 * This stores cwq for the moment, for the timer_fn.
1501 * @wq: workqueue to use 1154 * Note that the work's gcwq is preserved to allow
1502 * @dwork: work to queue 1155 * reentrance detection for delayed works.
1503 * @delay: number of jiffies to wait before queueing 1156 */
1504 * 1157 if (!(wq->flags & WQ_UNBOUND)) {
1505 * mod_delayed_work_on() on local CPU. 1158 struct global_cwq *gcwq = get_work_gcwq(work);
1506 */ 1159
1507bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, 1160 if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1508 unsigned long delay) 1161 lcpu = gcwq->cpu;
1509{ 1162 else
1510 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); 1163 lcpu = raw_smp_processor_id();
1164 } else
1165 lcpu = WORK_CPU_UNBOUND;
1166
1167 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1168
1169 timer->expires = jiffies + delay;
1170 timer->data = (unsigned long)dwork;
1171 timer->function = delayed_work_timer_fn;
1172
1173 if (unlikely(cpu >= 0))
1174 add_timer_on(timer, cpu);
1175 else
1176 add_timer(timer);
1177 ret = 1;
1178 }
1179 return ret;
1511} 1180}
1512EXPORT_SYMBOL_GPL(mod_delayed_work); 1181EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1513 1182
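Both sides of the hunks above expose the same basic queueing API: queue_work(), queue_delayed_work() and their _on variants. A minimal, hypothetical module showing how a driver normally consumes that API follows; the workqueue name and handler are invented, and details such as queue_work()'s return type differ between the two kernel versions shown in this diff.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_fn(struct work_struct *work)
{
	pr_info("example work item ran\n");
}

static DECLARE_WORK(example_work, example_fn);
static DECLARE_DELAYED_WORK(example_dwork, example_fn);

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example", 0, 0);
	if (!example_wq)
		return -ENOMEM;

	queue_work(example_wq, &example_work);			/* run as soon as possible */
	queue_delayed_work(example_wq, &example_dwork, HZ);	/* run after ~1 second */
	return 0;
}

static void __exit example_exit(void)
{
	cancel_delayed_work_sync(&example_dwork);
	flush_workqueue(example_wq);		/* make sure example_work has finished */
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");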
1514/** 1183/**
1515 * worker_enter_idle - enter idle state 1184 * worker_enter_idle - enter idle state
@@ -1523,8 +1192,7 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1523 */ 1192 */
1524static void worker_enter_idle(struct worker *worker) 1193static void worker_enter_idle(struct worker *worker)
1525{ 1194{
1526 struct worker_pool *pool = worker->pool; 1195 struct global_cwq *gcwq = worker->gcwq;
1527 struct global_cwq *gcwq = pool->gcwq;
1528 1196
1529 BUG_ON(worker->flags & WORKER_IDLE); 1197 BUG_ON(worker->flags & WORKER_IDLE);
1530 BUG_ON(!list_empty(&worker->entry) && 1198 BUG_ON(!list_empty(&worker->entry) &&
@@ -1532,24 +1200,22 @@ static void worker_enter_idle(struct worker *worker)
1532 1200
1533 /* can't use worker_set_flags(), also called from start_worker() */ 1201 /* can't use worker_set_flags(), also called from start_worker() */
1534 worker->flags |= WORKER_IDLE; 1202 worker->flags |= WORKER_IDLE;
1535 pool->nr_idle++; 1203 gcwq->nr_idle++;
1536 worker->last_active = jiffies; 1204 worker->last_active = jiffies;
1537 1205
1538 /* idle_list is LIFO */ 1206 /* idle_list is LIFO */
1539 list_add(&worker->entry, &pool->idle_list); 1207 list_add(&worker->entry, &gcwq->idle_list);
1540 1208
1541 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) 1209 if (likely(!(worker->flags & WORKER_ROGUE))) {
1542 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1543 1215
1544 /* 1216 /* sanity check nr_running */
1545 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1217 WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1546 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1218 atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1547 * nr_running, the warning may trigger spuriously. Check iff
1548 * unbind is not in progress.
1549 */
1550 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1551 pool->nr_workers == pool->nr_idle &&
1552 atomic_read(get_pool_nr_running(pool)));
1553} 1219}
1554 1220
1555/** 1221/**
@@ -1563,11 +1229,11 @@ static void worker_enter_idle(struct worker *worker)
1563 */ 1229 */
1564static void worker_leave_idle(struct worker *worker) 1230static void worker_leave_idle(struct worker *worker)
1565{ 1231{
1566 struct worker_pool *pool = worker->pool; 1232 struct global_cwq *gcwq = worker->gcwq;
1567 1233
1568 BUG_ON(!(worker->flags & WORKER_IDLE)); 1234 BUG_ON(!(worker->flags & WORKER_IDLE));
1569 worker_clr_flags(worker, WORKER_IDLE); 1235 worker_clr_flags(worker, WORKER_IDLE);
1570 pool->nr_idle--; 1236 gcwq->nr_idle--;
1571 list_del_init(&worker->entry); 1237 list_del_init(&worker->entry);
1572} 1238}
1573 1239
@@ -1587,11 +1253,11 @@ static void worker_leave_idle(struct worker *worker)
1587 * verbatim as it's best effort and blocking and gcwq may be 1253 * verbatim as it's best effort and blocking and gcwq may be
1588 * [dis]associated in the meantime. 1254 * [dis]associated in the meantime.
1589 * 1255 *
1590 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1256 * This function tries set_cpus_allowed() and locks gcwq and verifies
1591 * binding against %GCWQ_DISASSOCIATED which is set during 1257 * the binding against GCWQ_DISASSOCIATED which is set during
1592 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1258 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1593 * enters idle state or fetches works without dropping lock, it can 1259 * idle state or fetches works without dropping lock, it can guarantee
1594 * guarantee the scheduling requirement described in the first paragraph. 1260 * the scheduling requirement described in the first paragraph.
1595 * 1261 *
1596 * CONTEXT: 1262 * CONTEXT:
1597 * Might sleep. Called without any lock but returns with gcwq->lock 1263 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1604,7 +1270,7 @@ static void worker_leave_idle(struct worker *worker)
1604static bool worker_maybe_bind_and_lock(struct worker *worker) 1270static bool worker_maybe_bind_and_lock(struct worker *worker)
1605__acquires(&gcwq->lock) 1271__acquires(&gcwq->lock)
1606{ 1272{
1607 struct global_cwq *gcwq = worker->pool->gcwq; 1273 struct global_cwq *gcwq = worker->gcwq;
1608 struct task_struct *task = worker->task; 1274 struct task_struct *task = worker->task;
1609 1275
1610 while (true) { 1276 while (true) {
@@ -1638,117 +1304,22 @@ __acquires(&gcwq->lock)
1638} 1304}
1639 1305
1640/* 1306/*
1641 * Rebind an idle @worker to its CPU. worker_thread() will test 1307 * Function for worker->rebind_work used to rebind rogue busy workers
1642 * list_empty(@worker->entry) before leaving idle and call this function. 1308 * to the associated cpu which is coming back online. This is
1643 */ 1309 * scheduled by cpu up but can race with other cpu hotplug operations
1644static void idle_worker_rebind(struct worker *worker) 1310 * and may be executed twice without intervening cpu down.
1645{
1646 struct global_cwq *gcwq = worker->pool->gcwq;
1647
1648 /* CPU may go down again inbetween, clear UNBOUND only on success */
1649 if (worker_maybe_bind_and_lock(worker))
1650 worker_clr_flags(worker, WORKER_UNBOUND);
1651
1652 /* rebind complete, become available again */
1653 list_add(&worker->entry, &worker->pool->idle_list);
1654 spin_unlock_irq(&gcwq->lock);
1655}
1656
1657/*
1658 * Function for @worker->rebind.work used to rebind unbound busy workers to
1659 * the associated cpu which is coming back online. This is scheduled by
1660 * cpu up but can race with other cpu hotplug operations and may be
1661 * executed twice without intervening cpu down.
1662 */ 1311 */
1663static void busy_worker_rebind_fn(struct work_struct *work) 1312static void worker_rebind_fn(struct work_struct *work)
1664{ 1313{
1665 struct worker *worker = container_of(work, struct worker, rebind_work); 1314 struct worker *worker = container_of(work, struct worker, rebind_work);
1666 struct global_cwq *gcwq = worker->pool->gcwq; 1315 struct global_cwq *gcwq = worker->gcwq;
1667 1316
1668 if (worker_maybe_bind_and_lock(worker)) 1317 if (worker_maybe_bind_and_lock(worker))
1669 worker_clr_flags(worker, WORKER_UNBOUND); 1318 worker_clr_flags(worker, WORKER_REBIND);
1670 1319
1671 spin_unlock_irq(&gcwq->lock); 1320 spin_unlock_irq(&gcwq->lock);
1672} 1321}
1673 1322
1674/**
1675 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1676 * @gcwq: gcwq of interest
1677 *
1678 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1679 * is different for idle and busy ones.
1680 *
1681 * Idle ones will be removed from the idle_list and woken up. They will
1682 * add themselves back after completing rebind. This ensures that the
1683 * idle_list doesn't contain any unbound workers when re-bound busy workers
1684 * try to perform local wake-ups for concurrency management.
1685 *
1686 * Busy workers can rebind after they finish their current work items.
1687 * Queueing the rebind work item at the head of the scheduled list is
1688 * enough. Note that nr_running will be properly bumped as busy workers
1689 * rebind.
1690 *
1691 * On return, all non-manager workers are scheduled for rebind - see
1692 * manage_workers() for the manager special case. Any idle worker
1693 * including the manager will not appear on @idle_list until rebind is
1694 * complete, making local wake-ups safe.
1695 */
1696static void rebind_workers(struct global_cwq *gcwq)
1697{
1698 struct worker_pool *pool;
1699 struct worker *worker, *n;
1700 struct hlist_node *pos;
1701 int i;
1702
1703 lockdep_assert_held(&gcwq->lock);
1704
1705 for_each_worker_pool(pool, gcwq)
1706 lockdep_assert_held(&pool->assoc_mutex);
1707
1708 /* dequeue and kick idle ones */
1709 for_each_worker_pool(pool, gcwq) {
1710 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1711 /*
1712 * idle workers should be off @pool->idle_list
1713 * until rebind is complete to avoid receiving
1714 * premature local wake-ups.
1715 */
1716 list_del_init(&worker->entry);
1717
1718 /*
1719 * worker_thread() will see the above dequeuing
1720 * and call idle_worker_rebind().
1721 */
1722 wake_up_process(worker->task);
1723 }
1724 }
1725
1726 /* rebind busy workers */
1727 for_each_busy_worker(worker, i, pos, gcwq) {
1728 struct work_struct *rebind_work = &worker->rebind_work;
1729 struct workqueue_struct *wq;
1730
1731 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1732 work_data_bits(rebind_work)))
1733 continue;
1734
1735 debug_work_activate(rebind_work);
1736
1737 /*
1738 * wq doesn't really matter but let's keep @worker->pool
1739 * and @cwq->pool consistent for sanity.
1740 */
1741 if (worker_pool_pri(worker->pool))
1742 wq = system_highpri_wq;
1743 else
1744 wq = system_wq;
1745
1746 insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
1747 worker->scheduled.next,
1748 work_color_to_flags(WORK_NO_COLOR));
1749 }
1750}
1751
1752static struct worker *alloc_worker(void) 1323static struct worker *alloc_worker(void)
1753{ 1324{
1754 struct worker *worker; 1325 struct worker *worker;
@@ -1757,7 +1328,7 @@ static struct worker *alloc_worker(void)
1757 if (worker) { 1328 if (worker) {
1758 INIT_LIST_HEAD(&worker->entry); 1329 INIT_LIST_HEAD(&worker->entry);
1759 INIT_LIST_HEAD(&worker->scheduled); 1330 INIT_LIST_HEAD(&worker->scheduled);
1760 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); 1331 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1761 /* on creation a worker is in !idle && prep state */ 1332 /* on creation a worker is in !idle && prep state */
1762 worker->flags = WORKER_PREP; 1333 worker->flags = WORKER_PREP;
1763 } 1334 }
@@ -1766,9 +1337,10 @@ static struct worker *alloc_worker(void)
1766 1337
1767/** 1338/**
1768 * create_worker - create a new workqueue worker 1339 * create_worker - create a new workqueue worker
1769 * @pool: pool the new worker will belong to 1340 * @gcwq: gcwq the new worker will belong to
1341 * @bind: whether to set affinity to @cpu or not
1770 * 1342 *
1771 * Create a new worker which is bound to @pool. The returned worker 1343 * Create a new worker which is bound to @gcwq. The returned worker
1772 * can be started by calling start_worker() or destroyed using 1344 * can be started by calling start_worker() or destroyed using
1773 * destroy_worker(). 1345 * destroy_worker().
1774 * 1346 *
@@ -1778,17 +1350,16 @@ static struct worker *alloc_worker(void)
1778 * RETURNS: 1350 * RETURNS:
1779 * Pointer to the newly created worker. 1351 * Pointer to the newly created worker.
1780 */ 1352 */
1781static struct worker *create_worker(struct worker_pool *pool) 1353static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1782{ 1354{
1783 struct global_cwq *gcwq = pool->gcwq; 1355 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1784 const char *pri = worker_pool_pri(pool) ? "H" : "";
1785 struct worker *worker = NULL; 1356 struct worker *worker = NULL;
1786 int id = -1; 1357 int id = -1;
1787 1358
1788 spin_lock_irq(&gcwq->lock); 1359 spin_lock_irq(&gcwq->lock);
1789 while (ida_get_new(&pool->worker_ida, &id)) { 1360 while (ida_get_new(&gcwq->worker_ida, &id)) {
1790 spin_unlock_irq(&gcwq->lock); 1361 spin_unlock_irq(&gcwq->lock);
1791 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1362 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1792 goto fail; 1363 goto fail;
1793 spin_lock_irq(&gcwq->lock); 1364 spin_lock_irq(&gcwq->lock);
1794 } 1365 }
@@ -1798,43 +1369,38 @@ static struct worker *create_worker(struct worker_pool *pool)
1798 if (!worker) 1369 if (!worker)
1799 goto fail; 1370 goto fail;
1800 1371
1801 worker->pool = pool; 1372 worker->gcwq = gcwq;
1802 worker->id = id; 1373 worker->id = id;
1803 1374
1804 if (gcwq->cpu != WORK_CPU_UNBOUND) 1375 if (!on_unbound_cpu)
1805 worker->task = kthread_create_on_node(worker_thread, 1376 worker->task = kthread_create_on_node(worker_thread,
1806 worker, cpu_to_node(gcwq->cpu), 1377 worker,
1807 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1378 cpu_to_node(gcwq->cpu),
1379 "kworker/%u:%d", gcwq->cpu, id);
1808 else 1380 else
1809 worker->task = kthread_create(worker_thread, worker, 1381 worker->task = kthread_create(worker_thread, worker,
1810 "kworker/u:%d%s", id, pri); 1382 "kworker/u:%d", id);
1811 if (IS_ERR(worker->task)) 1383 if (IS_ERR(worker->task))
1812 goto fail; 1384 goto fail;
1813 1385
1814 if (worker_pool_pri(pool))
1815 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1816
1817 /* 1386 /*
1818 * Determine CPU binding of the new worker depending on 1387 * A rogue worker will become a regular one if CPU comes
1819 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1388 * online later on. Make sure every worker has
1820 * flag remains stable across this function. See the comments 1389 * PF_THREAD_BOUND set.
1821 * above the flag definition for details.
1822 *
1823 * As an unbound worker may later become a regular one if CPU comes
1824 * online, make sure every worker has %PF_THREAD_BOUND set.
1825 */ 1390 */
1826 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1391 if (bind && !on_unbound_cpu)
1827 kthread_bind(worker->task, gcwq->cpu); 1392 kthread_bind(worker->task, gcwq->cpu);
1828 } else { 1393 else {
1829 worker->task->flags |= PF_THREAD_BOUND; 1394 worker->task->flags |= PF_THREAD_BOUND;
1830 worker->flags |= WORKER_UNBOUND; 1395 if (on_unbound_cpu)
1396 worker->flags |= WORKER_UNBOUND;
1831 } 1397 }
1832 1398
1833 return worker; 1399 return worker;
1834fail: 1400fail:
1835 if (id >= 0) { 1401 if (id >= 0) {
1836 spin_lock_irq(&gcwq->lock); 1402 spin_lock_irq(&gcwq->lock);
1837 ida_remove(&pool->worker_ida, id); 1403 ida_remove(&gcwq->worker_ida, id);
1838 spin_unlock_irq(&gcwq->lock); 1404 spin_unlock_irq(&gcwq->lock);
1839 } 1405 }
1840 kfree(worker); 1406 kfree(worker);
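create_worker() above follows the common pattern for CPU-affine kernel threads: allocate the thread with kthread_create_on_node() so its stack comes from the target CPU's node, bind it with kthread_bind() before it first runs, then wake it. A minimal sketch of that pattern, independent of the workqueue code and with invented names, is shown below.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/err.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* park until stopped */
	return 0;
}

/* Create a thread whose stack is allocated on @cpu's node and which may only
 * run on @cpu.  Hypothetical helper, not part of this patch. */
static struct task_struct *example_create_bound_thread(int cpu)
{
	struct task_struct *task;

	task = kthread_create_on_node(example_thread_fn, NULL,
				      cpu_to_node(cpu), "example/%d", cpu);
	if (IS_ERR(task))
		return task;

	kthread_bind(task, cpu);	/* must happen before the first wakeup */
	wake_up_process(task);
	return task;
}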
@@ -1853,7 +1419,7 @@ fail:
1853static void start_worker(struct worker *worker) 1419static void start_worker(struct worker *worker)
1854{ 1420{
1855 worker->flags |= WORKER_STARTED; 1421 worker->flags |= WORKER_STARTED;
1856 worker->pool->nr_workers++; 1422 worker->gcwq->nr_workers++;
1857 worker_enter_idle(worker); 1423 worker_enter_idle(worker);
1858 wake_up_process(worker->task); 1424 wake_up_process(worker->task);
1859} 1425}
@@ -1869,8 +1435,7 @@ static void start_worker(struct worker *worker)
1869 */ 1435 */
1870static void destroy_worker(struct worker *worker) 1436static void destroy_worker(struct worker *worker)
1871{ 1437{
1872 struct worker_pool *pool = worker->pool; 1438 struct global_cwq *gcwq = worker->gcwq;
1873 struct global_cwq *gcwq = pool->gcwq;
1874 int id = worker->id; 1439 int id = worker->id;
1875 1440
1876 /* sanity check frenzy */ 1441 /* sanity check frenzy */
@@ -1878,9 +1443,9 @@ static void destroy_worker(struct worker *worker)
1878 BUG_ON(!list_empty(&worker->scheduled)); 1443 BUG_ON(!list_empty(&worker->scheduled));
1879 1444
1880 if (worker->flags & WORKER_STARTED) 1445 if (worker->flags & WORKER_STARTED)
1881 pool->nr_workers--; 1446 gcwq->nr_workers--;
1882 if (worker->flags & WORKER_IDLE) 1447 if (worker->flags & WORKER_IDLE)
1883 pool->nr_idle--; 1448 gcwq->nr_idle--;
1884 1449
1885 list_del_init(&worker->entry); 1450 list_del_init(&worker->entry);
1886 worker->flags |= WORKER_DIE; 1451 worker->flags |= WORKER_DIE;
@@ -1891,30 +1456,29 @@ static void destroy_worker(struct worker *worker)
1891 kfree(worker); 1456 kfree(worker);
1892 1457
1893 spin_lock_irq(&gcwq->lock); 1458 spin_lock_irq(&gcwq->lock);
1894 ida_remove(&pool->worker_ida, id); 1459 ida_remove(&gcwq->worker_ida, id);
1895} 1460}
1896 1461
1897static void idle_worker_timeout(unsigned long __pool) 1462static void idle_worker_timeout(unsigned long __gcwq)
1898{ 1463{
1899 struct worker_pool *pool = (void *)__pool; 1464 struct global_cwq *gcwq = (void *)__gcwq;
1900 struct global_cwq *gcwq = pool->gcwq;
1901 1465
1902 spin_lock_irq(&gcwq->lock); 1466 spin_lock_irq(&gcwq->lock);
1903 1467
1904 if (too_many_workers(pool)) { 1468 if (too_many_workers(gcwq)) {
1905 struct worker *worker; 1469 struct worker *worker;
1906 unsigned long expires; 1470 unsigned long expires;
1907 1471
1908 /* idle_list is kept in LIFO order, check the last one */ 1472 /* idle_list is kept in LIFO order, check the last one */
1909 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1473 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1910 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1474 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1911 1475
1912 if (time_before(jiffies, expires)) 1476 if (time_before(jiffies, expires))
1913 mod_timer(&pool->idle_timer, expires); 1477 mod_timer(&gcwq->idle_timer, expires);
1914 else { 1478 else {
1915 /* it's been idle for too long, wake up manager */ 1479 /* it's been idle for too long, wake up manager */
1916 pool->flags |= POOL_MANAGE_WORKERS; 1480 gcwq->flags |= GCWQ_MANAGE_WORKERS;
1917 wake_up_worker(pool); 1481 wake_up_worker(gcwq);
1918 } 1482 }
1919 } 1483 }
1920 1484
@@ -1931,7 +1495,7 @@ static bool send_mayday(struct work_struct *work)
1931 return false; 1495 return false;
1932 1496
1933 /* mayday mayday mayday */ 1497 /* mayday mayday mayday */
1934 cpu = cwq->pool->gcwq->cpu; 1498 cpu = cwq->gcwq->cpu;
1935 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1499 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1936 if (cpu == WORK_CPU_UNBOUND) 1500 if (cpu == WORK_CPU_UNBOUND)
1937 cpu = 0; 1501 cpu = 0;
@@ -1940,38 +1504,37 @@ static bool send_mayday(struct work_struct *work)
1940 return true; 1504 return true;
1941} 1505}
1942 1506
1943static void gcwq_mayday_timeout(unsigned long __pool) 1507static void gcwq_mayday_timeout(unsigned long __gcwq)
1944{ 1508{
1945 struct worker_pool *pool = (void *)__pool; 1509 struct global_cwq *gcwq = (void *)__gcwq;
1946 struct global_cwq *gcwq = pool->gcwq;
1947 struct work_struct *work; 1510 struct work_struct *work;
1948 1511
1949 spin_lock_irq(&gcwq->lock); 1512 spin_lock_irq(&gcwq->lock);
1950 1513
1951 if (need_to_create_worker(pool)) { 1514 if (need_to_create_worker(gcwq)) {
1952 /* 1515 /*
1953 * We've been trying to create a new worker but 1516 * We've been trying to create a new worker but
1954 * haven't been successful. We might be hitting an 1517 * haven't been successful. We might be hitting an
1955 * allocation deadlock. Send distress signals to 1518 * allocation deadlock. Send distress signals to
1956 * rescuers. 1519 * rescuers.
1957 */ 1520 */
1958 list_for_each_entry(work, &pool->worklist, entry) 1521 list_for_each_entry(work, &gcwq->worklist, entry)
1959 send_mayday(work); 1522 send_mayday(work);
1960 } 1523 }
1961 1524
1962 spin_unlock_irq(&gcwq->lock); 1525 spin_unlock_irq(&gcwq->lock);
1963 1526
1964 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1527 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1965} 1528}
1966 1529
1967/** 1530/**
1968 * maybe_create_worker - create a new worker if necessary 1531 * maybe_create_worker - create a new worker if necessary
1969 * @pool: pool to create a new worker for 1532 * @gcwq: gcwq to create a new worker for
1970 * 1533 *
1971 * Create a new worker for @pool if necessary. @pool is guaranteed to 1534 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
1972 * have at least one idle worker on return from this function. If 1535 * have at least one idle worker on return from this function. If
1973 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1536 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1974 * sent to all rescuers with works scheduled on @pool to resolve 1537 * sent to all rescuers with works scheduled on @gcwq to resolve
1975 * possible allocation deadlock. 1538 * possible allocation deadlock.
1976 * 1539 *
1977 * On return, need_to_create_worker() is guaranteed to be false and 1540 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1986,54 +1549,52 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1986 * false if no action was taken and gcwq->lock stayed locked, true 1549 * false if no action was taken and gcwq->lock stayed locked, true
1987 * otherwise. 1550 * otherwise.
1988 */ 1551 */
1989static bool maybe_create_worker(struct worker_pool *pool) 1552static bool maybe_create_worker(struct global_cwq *gcwq)
1990__releases(&gcwq->lock) 1553__releases(&gcwq->lock)
1991__acquires(&gcwq->lock) 1554__acquires(&gcwq->lock)
1992{ 1555{
1993 struct global_cwq *gcwq = pool->gcwq; 1556 if (!need_to_create_worker(gcwq))
1994
1995 if (!need_to_create_worker(pool))
1996 return false; 1557 return false;
1997restart: 1558restart:
1998 spin_unlock_irq(&gcwq->lock); 1559 spin_unlock_irq(&gcwq->lock);
1999 1560
2000 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1561 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
2001 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1562 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
2002 1563
2003 while (true) { 1564 while (true) {
2004 struct worker *worker; 1565 struct worker *worker;
2005 1566
2006 worker = create_worker(pool); 1567 worker = create_worker(gcwq, true);
2007 if (worker) { 1568 if (worker) {
2008 del_timer_sync(&pool->mayday_timer); 1569 del_timer_sync(&gcwq->mayday_timer);
2009 spin_lock_irq(&gcwq->lock); 1570 spin_lock_irq(&gcwq->lock);
2010 start_worker(worker); 1571 start_worker(worker);
2011 BUG_ON(need_to_create_worker(pool)); 1572 BUG_ON(need_to_create_worker(gcwq));
2012 return true; 1573 return true;
2013 } 1574 }
2014 1575
2015 if (!need_to_create_worker(pool)) 1576 if (!need_to_create_worker(gcwq))
2016 break; 1577 break;
2017 1578
2018 __set_current_state(TASK_INTERRUPTIBLE); 1579 __set_current_state(TASK_INTERRUPTIBLE);
2019 schedule_timeout(CREATE_COOLDOWN); 1580 schedule_timeout(CREATE_COOLDOWN);
2020 1581
2021 if (!need_to_create_worker(pool)) 1582 if (!need_to_create_worker(gcwq))
2022 break; 1583 break;
2023 } 1584 }
2024 1585
2025 del_timer_sync(&pool->mayday_timer); 1586 del_timer_sync(&gcwq->mayday_timer);
2026 spin_lock_irq(&gcwq->lock); 1587 spin_lock_irq(&gcwq->lock);
2027 if (need_to_create_worker(pool)) 1588 if (need_to_create_worker(gcwq))
2028 goto restart; 1589 goto restart;
2029 return true; 1590 return true;
2030} 1591}
2031 1592
2032/** 1593/**
2033 * maybe_destroy_worker - destroy workers which have been idle for a while 1594 * maybe_destroy_worker - destroy workers which have been idle for a while
2034 * @pool: pool to destroy workers for 1595 * @gcwq: gcwq to destroy workers for
2035 * 1596 *
2036 * Destroy @pool workers which have been idle for longer than 1597 * Destroy @gcwq workers which have been idle for longer than
2037 * IDLE_WORKER_TIMEOUT. 1598 * IDLE_WORKER_TIMEOUT.
2038 * 1599 *
2039 * LOCKING: 1600 * LOCKING:
@@ -2044,19 +1605,19 @@ restart:
2044 * false if no action was taken and gcwq->lock stayed locked, true 1605 * false if no action was taken and gcwq->lock stayed locked, true
2045 * otherwise. 1606 * otherwise.
2046 */ 1607 */
2047static bool maybe_destroy_workers(struct worker_pool *pool) 1608static bool maybe_destroy_workers(struct global_cwq *gcwq)
2048{ 1609{
2049 bool ret = false; 1610 bool ret = false;
2050 1611
2051 while (too_many_workers(pool)) { 1612 while (too_many_workers(gcwq)) {
2052 struct worker *worker; 1613 struct worker *worker;
2053 unsigned long expires; 1614 unsigned long expires;
2054 1615
2055 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1616 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
2056 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1617 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2057 1618
2058 if (time_before(jiffies, expires)) { 1619 if (time_before(jiffies, expires)) {
2059 mod_timer(&pool->idle_timer, expires); 1620 mod_timer(&gcwq->idle_timer, expires);
2060 break; 1621 break;
2061 } 1622 }
2062 1623
@@ -2089,63 +1650,137 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2089 */ 1650 */
2090static bool manage_workers(struct worker *worker) 1651static bool manage_workers(struct worker *worker)
2091{ 1652{
2092 struct worker_pool *pool = worker->pool; 1653 struct global_cwq *gcwq = worker->gcwq;
2093 bool ret = false; 1654 bool ret = false;
2094 1655
2095 if (pool->flags & POOL_MANAGING_WORKERS) 1656 if (gcwq->flags & GCWQ_MANAGING_WORKERS)
2096 return ret; 1657 return ret;
2097 1658
2098 pool->flags |= POOL_MANAGING_WORKERS; 1659 gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1660 gcwq->flags |= GCWQ_MANAGING_WORKERS;
2099 1661
2100 /* 1662 /*
2101 * To simplify both worker management and CPU hotplug, hold off 1663 * Destroy and then create so that may_start_working() is true
2102 * management while hotplug is in progress. CPU hotplug path can't 1664 * on return.
2103 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2104 * lead to idle worker depletion (all become busy thinking someone
2105 * else is managing) which in turn can result in deadlock under
2106 * extreme circumstances. Use @pool->assoc_mutex to synchronize
2107 * manager against CPU hotplug.
2108 *
2109 * assoc_mutex would always be free unless CPU hotplug is in
2110 * progress. trylock first without dropping @gcwq->lock.
2111 */ 1665 */
2112 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 1666 ret |= maybe_destroy_workers(gcwq);
2113 spin_unlock_irq(&pool->gcwq->lock); 1667 ret |= maybe_create_worker(gcwq);
2114 mutex_lock(&pool->assoc_mutex);
2115 /*
2116 * CPU hotplug could have happened while we were waiting
2117 * for assoc_mutex. Hotplug itself can't handle us
2118 * because the manager is on neither the idle nor the busy list, and
2119 * @gcwq's state and ours could have deviated.
2120 *
2121 * As hotplug is now excluded via assoc_mutex, we can
2122 * simply try to bind. It will succeed or fail depending
2123 * on @gcwq's current state. Try it and adjust
2124 * %WORKER_UNBOUND accordingly.
2125 */
2126 if (worker_maybe_bind_and_lock(worker))
2127 worker->flags &= ~WORKER_UNBOUND;
2128 else
2129 worker->flags |= WORKER_UNBOUND;
2130 1668
2131 ret = true; 1669 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
2132 }
2133
2134 pool->flags &= ~POOL_MANAGE_WORKERS;
2135 1670
2136 /* 1671 /*
2137 * Destroy and then create so that may_start_working() is true 1672 * The trustee might be waiting to take over the manager
2138 * on return. 1673 * position, tell it we're done.
2139 */ 1674 */
2140 ret |= maybe_destroy_workers(pool); 1675 if (unlikely(gcwq->trustee))
2141 ret |= maybe_create_worker(pool); 1676 wake_up_all(&gcwq->trustee_wait);
2142 1677
2143 pool->flags &= ~POOL_MANAGING_WORKERS;
2144 mutex_unlock(&pool->assoc_mutex);
2145 return ret; 1678 return ret;
2146} 1679}
2147 1680
2148/** 1681/**
1682 * move_linked_works - move linked works to a list
1683 * @work: start of series of works to be scheduled
1684 * @head: target list to append @work to
 1685 * @nextp: out parameter for nested worklist walking
1686 *
1687 * Schedule linked works starting from @work to @head. Work series to
1688 * be scheduled starts at @work and includes any consecutive work with
1689 * WORK_STRUCT_LINKED set in its predecessor.
1690 *
1691 * If @nextp is not NULL, it's updated to point to the next work of
1692 * the last scheduled work. This allows move_linked_works() to be
1693 * nested inside outer list_for_each_entry_safe().
1694 *
1695 * CONTEXT:
1696 * spin_lock_irq(gcwq->lock).
1697 */
1698static void move_linked_works(struct work_struct *work, struct list_head *head,
1699 struct work_struct **nextp)
1700{
1701 struct work_struct *n;
1702
1703 /*
1704 * Linked worklist will always end before the end of the list,
1705 * use NULL for list head.
1706 */
1707 list_for_each_entry_safe_from(work, n, NULL, entry) {
1708 list_move_tail(&work->entry, head);
1709 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1710 break;
1711 }
1712
1713 /*
1714 * If we're already inside safe list traversal and have moved
1715 * multiple works to the scheduled queue, the next position
1716 * needs to be updated.
1717 */
1718 if (nextp)
1719 *nextp = n;
1720}
1721
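move_linked_works() is built on the standard list-splicing idiom: walk from a starting entry with the _safe_from iterator and list_move_tail() each element onto the destination list. A generic sketch of the same idiom, using a hypothetical struct item in place of work_struct and an explicit linked flag in place of WORK_STRUCT_LINKED:

#include <linux/list.h>
#include <linux/types.h>

struct item {                           /* hypothetical element type */
        struct list_head entry;
        bool linked;                    /* stand-in for WORK_STRUCT_LINKED */
};

static LIST_HEAD(source);               /* hypothetical source list */
static LIST_HEAD(target);               /* hypothetical destination list */

/* Move @first and every immediately following "linked" item to target. */
static void move_run(struct item *first)
{
        struct item *n;

        list_for_each_entry_safe_from(first, n, &source, entry) {
                bool more = first->linked;

                list_move_tail(&first->entry, &target);
                if (!more)
                        break;
        }
}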
1722static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1723{
1724 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1725 struct work_struct, entry);
1726 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1727
1728 trace_workqueue_activate_work(work);
1729 move_linked_works(work, pos, NULL);
1730 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1731 cwq->nr_active++;
1732}
1733
1734/**
1735 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1736 * @cwq: cwq of interest
1737 * @color: color of work which left the queue
1738 * @delayed: for a delayed work
1739 *
 1740 * A work has either completed or been removed from the pending queue;
 1741 * decrement nr_in_flight of its cwq and handle workqueue flushing.
1742 *
1743 * CONTEXT:
1744 * spin_lock_irq(gcwq->lock).
1745 */
1746static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1747 bool delayed)
1748{
1749 /* ignore uncolored works */
1750 if (color == WORK_NO_COLOR)
1751 return;
1752
1753 cwq->nr_in_flight[color]--;
1754
1755 if (!delayed) {
1756 cwq->nr_active--;
1757 if (!list_empty(&cwq->delayed_works)) {
1758 /* one down, submit a delayed one */
1759 if (cwq->nr_active < cwq->max_active)
1760 cwq_activate_first_delayed(cwq);
1761 }
1762 }
1763
1764 /* is flush in progress and are we at the flushing tip? */
1765 if (likely(cwq->flush_color != color))
1766 return;
1767
1768 /* are there still in-flight works? */
1769 if (cwq->nr_in_flight[color])
1770 return;
1771
1772 /* this cwq is done, clear flush_color */
1773 cwq->flush_color = -1;
1774
1775 /*
1776 * If this was the last cwq, wake up the first flusher. It
1777 * will handle the rest.
1778 */
1779 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1780 complete(&cwq->wq->first_flusher->done);
1781}
1782
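The flush bookkeeping above reduces to a familiar pattern: a counter of outstanding pieces plus a completion that the last finisher signals, which is exactly what the atomic_dec_and_test()/complete() pair at the end of cwq_dec_nr_in_flight() does for nr_cwqs_to_flush. A standalone sketch of that pattern with made-up names (parts_left, all_done):

#include <linux/atomic.h>
#include <linux/completion.h>

static atomic_t parts_left;                     /* hypothetical outstanding-part count */
static DECLARE_COMPLETION(all_done);            /* hypothetical flusher's completion */

/* Called once per part when it finishes; the last one wakes the waiter. */
static void part_finished(void)
{
        if (atomic_dec_and_test(&parts_left))
                complete(&all_done);
}

/* One-shot flusher: arm the count, start the parts (elided), then wait. */
static void flush_all_parts(int nr_parts)
{
        atomic_set(&parts_left, nr_parts);
        /* ... start nr_parts workers, each ending in part_finished() ... */
        wait_for_completion(&all_done);
}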
1783/**
2149 * process_one_work - process single work 1784 * process_one_work - process single work
2150 * @worker: self 1785 * @worker: self
2151 * @work: work to process 1786 * @work: work to process
@@ -2164,8 +1799,7 @@ __releases(&gcwq->lock)
2164__acquires(&gcwq->lock) 1799__acquires(&gcwq->lock)
2165{ 1800{
2166 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1801 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
2167 struct worker_pool *pool = worker->pool; 1802 struct global_cwq *gcwq = cwq->gcwq;
2168 struct global_cwq *gcwq = pool->gcwq;
2169 struct hlist_head *bwh = busy_worker_head(gcwq, work); 1803 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2170 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 1804 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2171 work_func_t f = work->func; 1805 work_func_t f = work->func;
@@ -2179,20 +1813,9 @@ __acquires(&gcwq->lock)
2179 * lock freed" warnings as well as problems when looking into 1813 * lock freed" warnings as well as problems when looking into
2180 * work->lockdep_map, make a copy and use that here. 1814 * work->lockdep_map, make a copy and use that here.
2181 */ 1815 */
2182 struct lockdep_map lockdep_map; 1816 struct lockdep_map lockdep_map = work->lockdep_map;
2183
2184 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2185#endif 1817#endif
2186 /* 1818 /*
2187 * Ensure we're on the correct CPU. DISASSOCIATED test is
2188 * necessary to avoid spurious warnings from rescuers servicing the
2189 * unbound or a disassociated gcwq.
2190 */
2191 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2192 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2193 raw_smp_processor_id() != gcwq->cpu);
2194
2195 /*
2196 * A single work shouldn't be executed concurrently by 1819 * A single work shouldn't be executed concurrently by
2197 * multiple workers on a single cpu. Check whether anyone is 1820 * multiple workers on a single cpu. Check whether anyone is
2198 * already processing the work. If so, defer the work to the 1821 * already processing the work. If so, defer the work to the
@@ -2204,39 +1827,42 @@ __acquires(&gcwq->lock)
2204 return; 1827 return;
2205 } 1828 }
2206 1829
2207 /* claim and dequeue */ 1830 /* claim and process */
2208 debug_work_deactivate(work); 1831 debug_work_deactivate(work);
2209 hlist_add_head(&worker->hentry, bwh); 1832 hlist_add_head(&worker->hentry, bwh);
2210 worker->current_work = work; 1833 worker->current_work = work;
2211 worker->current_cwq = cwq; 1834 worker->current_cwq = cwq;
2212 work_color = get_work_color(work); 1835 work_color = get_work_color(work);
2213 1836
1837 /* record the current cpu number in the work data and dequeue */
1838 set_work_cpu(work, gcwq->cpu);
2214 list_del_init(&work->entry); 1839 list_del_init(&work->entry);
2215 1840
2216 /* 1841 /*
2217 * CPU intensive works don't participate in concurrency 1842 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
2218 * management. They're the scheduler's responsibility. 1843 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
2219 */ 1844 */
2220 if (unlikely(cpu_intensive)) 1845 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
2221 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1846 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1847 struct work_struct, entry);
2222 1848
2223 /* 1849 if (!list_empty(&gcwq->worklist) &&
2224 * Unbound gcwq isn't concurrency managed and work items should be 1850 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
2225 * executed ASAP. Wake up another worker if necessary. 1851 wake_up_worker(gcwq);
2226 */ 1852 else
2227 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 1853 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
2228 wake_up_worker(pool); 1854 }
2229 1855
2230 /* 1856 /*
2231 * Record the last CPU and clear PENDING which should be the last 1857 * CPU intensive works don't participate in concurrency
2232 * update to @work. Also, do this inside @gcwq->lock so that 1858 * management. They're the scheduler's responsibility.
2233 * PENDING and queued state changes happen together while IRQ is
2234 * disabled.
2235 */ 1859 */
2236 set_work_cpu_and_clear_pending(work, gcwq->cpu); 1860 if (unlikely(cpu_intensive))
1861 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2237 1862
2238 spin_unlock_irq(&gcwq->lock); 1863 spin_unlock_irq(&gcwq->lock);
2239 1864
1865 work_clear_pending(work);
2240 lock_map_acquire_read(&cwq->wq->lockdep_map); 1866 lock_map_acquire_read(&cwq->wq->lockdep_map);
2241 lock_map_acquire(&lockdep_map); 1867 lock_map_acquire(&lockdep_map);
2242 trace_workqueue_execute_start(work); 1868 trace_workqueue_execute_start(work);
@@ -2250,9 +1876,11 @@ __acquires(&gcwq->lock)
2250 lock_map_release(&cwq->wq->lockdep_map); 1876 lock_map_release(&cwq->wq->lockdep_map);
2251 1877
2252 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 1878 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2253 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 1879 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
2254 " last function: %pf\n", 1880 "%s/0x%08x/%d\n",
2255 current->comm, preempt_count(), task_pid_nr(current), f); 1881 current->comm, preempt_count(), task_pid_nr(current));
1882 printk(KERN_ERR " last function: ");
1883 print_symbol("%s\n", (unsigned long)f);
2256 debug_show_held_locks(current); 1884 debug_show_held_locks(current);
2257 dump_stack(); 1885 dump_stack();
2258 } 1886 }
@@ -2267,7 +1895,7 @@ __acquires(&gcwq->lock)
2267 hlist_del_init(&worker->hentry); 1895 hlist_del_init(&worker->hentry);
2268 worker->current_work = NULL; 1896 worker->current_work = NULL;
2269 worker->current_cwq = NULL; 1897 worker->current_cwq = NULL;
2270 cwq_dec_nr_in_flight(cwq, work_color); 1898 cwq_dec_nr_in_flight(cwq, work_color, false);
2271} 1899}
2272 1900
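From a driver's point of view, process_one_work() simply invokes work->func(work) in process context, so handlers conventionally recover their own state with container_of(). A minimal, hypothetical example of that convention (struct my_dev and its functions are illustrative, not from this file):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_dev {                         /* hypothetical driver context */
        int unit;
        struct work_struct work;
};

/* Runs as work->func from process_one_work(), in process context. */
static void my_dev_work_fn(struct work_struct *work)
{
        struct my_dev *dev = container_of(work, struct my_dev, work);

        /* sleeping is fine here; do the deferred part of the job */
        pr_info("deferred work for unit %d\n", dev->unit);
}

static void my_dev_init(struct my_dev *dev)
{
        INIT_WORK(&dev->work, my_dev_work_fn);
}

static void my_dev_kick(struct my_dev *dev)
{
        schedule_work(&dev->work);      /* eventually handled by process_one_work() */
}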
2273/** 1901/**
@@ -2304,37 +1932,28 @@ static void process_scheduled_works(struct worker *worker)
2304static int worker_thread(void *__worker) 1932static int worker_thread(void *__worker)
2305{ 1933{
2306 struct worker *worker = __worker; 1934 struct worker *worker = __worker;
2307 struct worker_pool *pool = worker->pool; 1935 struct global_cwq *gcwq = worker->gcwq;
2308 struct global_cwq *gcwq = pool->gcwq;
2309 1936
2310 /* tell the scheduler that this is a workqueue worker */ 1937 /* tell the scheduler that this is a workqueue worker */
2311 worker->task->flags |= PF_WQ_WORKER; 1938 worker->task->flags |= PF_WQ_WORKER;
2312woke_up: 1939woke_up:
2313 spin_lock_irq(&gcwq->lock); 1940 spin_lock_irq(&gcwq->lock);
2314 1941
2315 /* we are off idle list if destruction or rebind is requested */ 1942 /* DIE can be set only while we're idle, checking here is enough */
2316 if (unlikely(list_empty(&worker->entry))) { 1943 if (worker->flags & WORKER_DIE) {
2317 spin_unlock_irq(&gcwq->lock); 1944 spin_unlock_irq(&gcwq->lock);
2318 1945 worker->task->flags &= ~PF_WQ_WORKER;
2319 /* if DIE is set, destruction is requested */ 1946 return 0;
2320 if (worker->flags & WORKER_DIE) {
2321 worker->task->flags &= ~PF_WQ_WORKER;
2322 return 0;
2323 }
2324
2325 /* otherwise, rebind */
2326 idle_worker_rebind(worker);
2327 goto woke_up;
2328 } 1947 }
2329 1948
2330 worker_leave_idle(worker); 1949 worker_leave_idle(worker);
2331recheck: 1950recheck:
2332 /* no more worker necessary? */ 1951 /* no more worker necessary? */
2333 if (!need_more_worker(pool)) 1952 if (!need_more_worker(gcwq))
2334 goto sleep; 1953 goto sleep;
2335 1954
2336 /* do we need to manage? */ 1955 /* do we need to manage? */
2337 if (unlikely(!may_start_working(pool)) && manage_workers(worker)) 1956 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
2338 goto recheck; 1957 goto recheck;
2339 1958
2340 /* 1959 /*
@@ -2353,7 +1972,7 @@ recheck:
2353 1972
2354 do { 1973 do {
2355 struct work_struct *work = 1974 struct work_struct *work =
2356 list_first_entry(&pool->worklist, 1975 list_first_entry(&gcwq->worklist,
2357 struct work_struct, entry); 1976 struct work_struct, entry);
2358 1977
2359 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 1978 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -2365,11 +1984,11 @@ recheck:
2365 move_linked_works(work, &worker->scheduled, NULL); 1984 move_linked_works(work, &worker->scheduled, NULL);
2366 process_scheduled_works(worker); 1985 process_scheduled_works(worker);
2367 } 1986 }
2368 } while (keep_working(pool)); 1987 } while (keep_working(gcwq));
2369 1988
2370 worker_set_flags(worker, WORKER_PREP, false); 1989 worker_set_flags(worker, WORKER_PREP, false);
2371sleep: 1990sleep:
2372 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) 1991 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
2373 goto recheck; 1992 goto recheck;
2374 1993
2375 /* 1994 /*
@@ -2417,10 +2036,8 @@ static int rescuer_thread(void *__wq)
2417repeat: 2036repeat:
2418 set_current_state(TASK_INTERRUPTIBLE); 2037 set_current_state(TASK_INTERRUPTIBLE);
2419 2038
2420 if (kthread_should_stop()) { 2039 if (kthread_should_stop())
2421 __set_current_state(TASK_RUNNING);
2422 return 0; 2040 return 0;
2423 }
2424 2041
2425 /* 2042 /*
2426 * See whether any cpu is asking for help. Unbounded 2043 * See whether any cpu is asking for help. Unbounded
@@ -2429,15 +2046,14 @@ repeat:
2429 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2046 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2430 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2047 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2431 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2048 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2432 struct worker_pool *pool = cwq->pool; 2049 struct global_cwq *gcwq = cwq->gcwq;
2433 struct global_cwq *gcwq = pool->gcwq;
2434 struct work_struct *work, *n; 2050 struct work_struct *work, *n;
2435 2051
2436 __set_current_state(TASK_RUNNING); 2052 __set_current_state(TASK_RUNNING);
2437 mayday_clear_cpu(cpu, wq->mayday_mask); 2053 mayday_clear_cpu(cpu, wq->mayday_mask);
2438 2054
2439 /* migrate to the target cpu if possible */ 2055 /* migrate to the target cpu if possible */
2440 rescuer->pool = pool; 2056 rescuer->gcwq = gcwq;
2441 worker_maybe_bind_and_lock(rescuer); 2057 worker_maybe_bind_and_lock(rescuer);
2442 2058
2443 /* 2059 /*
@@ -2445,7 +2061,7 @@ repeat:
2445 * process'em. 2061 * process'em.
2446 */ 2062 */
2447 BUG_ON(!list_empty(&rescuer->scheduled)); 2063 BUG_ON(!list_empty(&rescuer->scheduled));
2448 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2064 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2449 if (get_work_cwq(work) == cwq) 2065 if (get_work_cwq(work) == cwq)
2450 move_linked_works(work, scheduled, &n); 2066 move_linked_works(work, scheduled, &n);
2451 2067
@@ -2456,8 +2072,8 @@ repeat:
2456 * regular worker; otherwise, we end up with 0 concurrency 2072 * regular worker; otherwise, we end up with 0 concurrency
2457 * and stall the execution. 2073 * and stall the execution.
2458 */ 2074 */
2459 if (keep_working(pool)) 2075 if (keep_working(gcwq))
2460 wake_up_worker(pool); 2076 wake_up_worker(gcwq);
2461 2077
2462 spin_unlock_irq(&gcwq->lock); 2078 spin_unlock_irq(&gcwq->lock);
2463 } 2079 }
@@ -2582,7 +2198,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2582 2198
2583 for_each_cwq_cpu(cpu, wq) { 2199 for_each_cwq_cpu(cpu, wq) {
2584 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2200 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2585 struct global_cwq *gcwq = cwq->pool->gcwq; 2201 struct global_cwq *gcwq = cwq->gcwq;
2586 2202
2587 spin_lock_irq(&gcwq->lock); 2203 spin_lock_irq(&gcwq->lock);
2588 2204
@@ -2798,17 +2414,17 @@ reflush:
2798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2799 bool drained; 2415 bool drained;
2800 2416
2801 spin_lock_irq(&cwq->pool->gcwq->lock); 2417 spin_lock_irq(&cwq->gcwq->lock);
2802 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2803 spin_unlock_irq(&cwq->pool->gcwq->lock); 2419 spin_unlock_irq(&cwq->gcwq->lock);
2804 2420
2805 if (drained) 2421 if (drained)
2806 continue; 2422 continue;
2807 2423
2808 if (++flush_cnt == 10 || 2424 if (++flush_cnt == 10 ||
2809 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2425 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2810 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", 2426 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2811 wq->name, flush_cnt); 2427 wq->name, flush_cnt);
2812 goto reflush; 2428 goto reflush;
2813 } 2429 }
2814 2430
@@ -2819,7 +2435,8 @@ reflush:
2819} 2435}
2820EXPORT_SYMBOL_GPL(drain_workqueue); 2436EXPORT_SYMBOL_GPL(drain_workqueue);
2821 2437
2822static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2438static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2439 bool wait_executing)
2823{ 2440{
2824 struct worker *worker = NULL; 2441 struct worker *worker = NULL;
2825 struct global_cwq *gcwq; 2442 struct global_cwq *gcwq;
@@ -2839,14 +2456,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2839 */ 2456 */
2840 smp_rmb(); 2457 smp_rmb();
2841 cwq = get_work_cwq(work); 2458 cwq = get_work_cwq(work);
2842 if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) 2459 if (unlikely(!cwq || gcwq != cwq->gcwq))
2843 goto already_gone; 2460 goto already_gone;
2844 } else { 2461 } else if (wait_executing) {
2845 worker = find_worker_executing_work(gcwq, work); 2462 worker = find_worker_executing_work(gcwq, work);
2846 if (!worker) 2463 if (!worker)
2847 goto already_gone; 2464 goto already_gone;
2848 cwq = worker->current_cwq; 2465 cwq = worker->current_cwq;
2849 } 2466 } else
2467 goto already_gone;
2850 2468
2851 insert_wq_barrier(cwq, barr, work, worker); 2469 insert_wq_barrier(cwq, barr, work, worker);
2852 spin_unlock_irq(&gcwq->lock); 2470 spin_unlock_irq(&gcwq->lock);
@@ -2873,8 +2491,15 @@ already_gone:
2873 * flush_work - wait for a work to finish executing the last queueing instance 2491 * flush_work - wait for a work to finish executing the last queueing instance
2874 * @work: the work to flush 2492 * @work: the work to flush
2875 * 2493 *
2876 * Wait until @work has finished execution. @work is guaranteed to be idle 2494 * Wait until @work has finished execution. This function considers
2877 * on return if it hasn't been requeued since flush started. 2495 * only the last queueing instance of @work. If @work has been
2496 * enqueued across different CPUs on a non-reentrant workqueue or on
2497 * multiple workqueues, @work might still be executing on return on
2498 * some of the CPUs from earlier queueing.
2499 *
2500 * If @work was queued only on a non-reentrant, ordered or unbound
2501 * workqueue, @work is guaranteed to be idle on return if it hasn't
2502 * been requeued since flush started.
2878 * 2503 *
2879 * RETURNS: 2504 * RETURNS:
2880 * %true if flush_work() waited for the work to finish execution, 2505 * %true if flush_work() waited for the work to finish execution,
@@ -2884,39 +2509,140 @@ bool flush_work(struct work_struct *work)
2884{ 2509{
2885 struct wq_barrier barr; 2510 struct wq_barrier barr;
2886 2511
2887 lock_map_acquire(&work->lockdep_map); 2512 if (start_flush_work(work, &barr, true)) {
2888 lock_map_release(&work->lockdep_map); 2513 wait_for_completion(&barr.done);
2514 destroy_work_on_stack(&barr.work);
2515 return true;
2516 } else
2517 return false;
2518}
2519EXPORT_SYMBOL_GPL(flush_work);
2520
2521static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2522{
2523 struct wq_barrier barr;
2524 struct worker *worker;
2889 2525
2890 if (start_flush_work(work, &barr)) { 2526 spin_lock_irq(&gcwq->lock);
2527
2528 worker = find_worker_executing_work(gcwq, work);
2529 if (unlikely(worker))
2530 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2531
2532 spin_unlock_irq(&gcwq->lock);
2533
2534 if (unlikely(worker)) {
2891 wait_for_completion(&barr.done); 2535 wait_for_completion(&barr.done);
2892 destroy_work_on_stack(&barr.work); 2536 destroy_work_on_stack(&barr.work);
2893 return true; 2537 return true;
2894 } else { 2538 } else
2895 return false; 2539 return false;
2540}
2541
2542static bool wait_on_work(struct work_struct *work)
2543{
2544 bool ret = false;
2545 int cpu;
2546
2547 might_sleep();
2548
2549 lock_map_acquire(&work->lockdep_map);
2550 lock_map_release(&work->lockdep_map);
2551
2552 for_each_gcwq_cpu(cpu)
2553 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2554 return ret;
2555}
2556
2557/**
2558 * flush_work_sync - wait until a work has finished execution
2559 * @work: the work to flush
2560 *
2561 * Wait until @work has finished execution. On return, it's
2562 * guaranteed that all queueing instances of @work which happened
2563 * before this function is called are finished. In other words, if
2564 * @work hasn't been requeued since this function was called, @work is
2565 * guaranteed to be idle on return.
2566 *
2567 * RETURNS:
2568 * %true if flush_work_sync() waited for the work to finish execution,
2569 * %false if it was already idle.
2570 */
2571bool flush_work_sync(struct work_struct *work)
2572{
2573 struct wq_barrier barr;
2574 bool pending, waited;
2575
2576 /* we'll wait for executions separately, queue barr only if pending */
2577 pending = start_flush_work(work, &barr, false);
2578
2579 /* wait for executions to finish */
2580 waited = wait_on_work(work);
2581
2582 /* wait for the pending one */
2583 if (pending) {
2584 wait_for_completion(&barr.done);
2585 destroy_work_on_stack(&barr.work);
2896 } 2586 }
2587
2588 return pending || waited;
2897} 2589}
2898EXPORT_SYMBOL_GPL(flush_work); 2590EXPORT_SYMBOL_GPL(flush_work_sync);
2899 2591
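In caller terms, the distinction documented above matters mostly at teardown: flush_work() waits only for the last queueing instance, flush_work_sync() also waits out executions from earlier queueings, and cancel_work_sync() unqueues and waits. A hedged sketch of how a driver might use them, with a hypothetical struct my_dev:

#include <linux/workqueue.h>

struct my_dev {                         /* hypothetical driver context */
        struct work_struct work;
};

static void my_dev_teardown(struct my_dev *dev)
{
        /*
         * New submissions are assumed to be stopped already.  Nothing may
         * be pending or running once @dev is freed; cancel_work_sync()
         * covers both the queued and the executing case.
         */
        cancel_work_sync(&dev->work);
}

static void my_dev_quiesce(struct my_dev *dev)
{
        /* let outstanding work run to completion instead of cancelling it */
        flush_work_sync(&dev->work);
}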
2900static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) 2592/*
2593 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
2594 * so this work can't be re-armed in any way.
2595 */
2596static int try_to_grab_pending(struct work_struct *work)
2901{ 2597{
2902 unsigned long flags; 2598 struct global_cwq *gcwq;
2903 int ret; 2599 int ret = -1;
2904 2600
2905 do { 2601 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
2906 ret = try_to_grab_pending(work, is_dwork, &flags); 2602 return 0;
2603
2604 /*
2605 * The queueing is in progress, or it is already queued. Try to
2606 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
2607 */
2608 gcwq = get_work_gcwq(work);
2609 if (!gcwq)
2610 return ret;
2611
2612 spin_lock_irq(&gcwq->lock);
2613 if (!list_empty(&work->entry)) {
2907 /* 2614 /*
2908 * If someone else is canceling, wait for the same event it 2615 * This work is queued, but perhaps we locked the wrong gcwq.
2909 * would be waiting for before retrying. 2616 * In that case we must see the new value after rmb(), see
2617 * insert_work()->wmb().
2910 */ 2618 */
2911 if (unlikely(ret == -ENOENT)) 2619 smp_rmb();
2912 flush_work(work); 2620 if (gcwq == get_work_gcwq(work)) {
2913 } while (unlikely(ret < 0)); 2621 debug_work_deactivate(work);
2622 list_del_init(&work->entry);
2623 cwq_dec_nr_in_flight(get_work_cwq(work),
2624 get_work_color(work),
2625 *work_data_bits(work) & WORK_STRUCT_DELAYED);
2626 ret = 1;
2627 }
2628 }
2629 spin_unlock_irq(&gcwq->lock);
2914 2630
2915 /* tell other tasks trying to grab @work to back off */ 2631 return ret;
2916 mark_work_canceling(work); 2632}
2917 local_irq_restore(flags); 2633
2634static bool __cancel_work_timer(struct work_struct *work,
2635 struct timer_list* timer)
2636{
2637 int ret;
2638
2639 do {
2640 ret = (timer && likely(del_timer(timer)));
2641 if (!ret)
2642 ret = try_to_grab_pending(work);
2643 wait_on_work(work);
2644 } while (unlikely(ret < 0));
2918 2645
2919 flush_work(work);
2920 clear_work_data(work); 2646 clear_work_data(work);
2921 return ret; 2647 return ret;
2922} 2648}
@@ -2941,7 +2667,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2941 */ 2667 */
2942bool cancel_work_sync(struct work_struct *work) 2668bool cancel_work_sync(struct work_struct *work)
2943{ 2669{
2944 return __cancel_work_timer(work, false); 2670 return __cancel_work_timer(work, NULL);
2945} 2671}
2946EXPORT_SYMBOL_GPL(cancel_work_sync); 2672EXPORT_SYMBOL_GPL(cancel_work_sync);
2947 2673
@@ -2959,44 +2685,33 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2959 */ 2685 */
2960bool flush_delayed_work(struct delayed_work *dwork) 2686bool flush_delayed_work(struct delayed_work *dwork)
2961{ 2687{
2962 local_irq_disable();
2963 if (del_timer_sync(&dwork->timer)) 2688 if (del_timer_sync(&dwork->timer))
2964 __queue_work(dwork->cpu, 2689 __queue_work(raw_smp_processor_id(),
2965 get_work_cwq(&dwork->work)->wq, &dwork->work); 2690 get_work_cwq(&dwork->work)->wq, &dwork->work);
2966 local_irq_enable();
2967 return flush_work(&dwork->work); 2691 return flush_work(&dwork->work);
2968} 2692}
2969EXPORT_SYMBOL(flush_delayed_work); 2693EXPORT_SYMBOL(flush_delayed_work);
2970 2694
2971/** 2695/**
2972 * cancel_delayed_work - cancel a delayed work 2696 * flush_delayed_work_sync - wait for a dwork to finish
2973 * @dwork: delayed_work to cancel 2697 * @dwork: the delayed work to flush
2974 * 2698 *
2975 * Kill off a pending delayed_work. Returns %true if @dwork was pending 2699 * Delayed timer is cancelled and the pending work is queued for
2976 * and canceled; %false if wasn't pending. Note that the work callback 2700 * execution immediately. Other than timer handling, its behavior
2977 * function may still be running on return, unless it returns %true and the 2701 * is identical to flush_work_sync().
2978 * work doesn't re-arm itself. Explicitly flush or use
2979 * cancel_delayed_work_sync() to wait on it.
2980 * 2702 *
2981 * This function is safe to call from any context including IRQ handler. 2703 * RETURNS:
2704 * %true if flush_work_sync() waited for the work to finish execution,
2705 * %false if it was already idle.
2982 */ 2706 */
2983bool cancel_delayed_work(struct delayed_work *dwork) 2707bool flush_delayed_work_sync(struct delayed_work *dwork)
2984{ 2708{
2985 unsigned long flags; 2709 if (del_timer_sync(&dwork->timer))
2986 int ret; 2710 __queue_work(raw_smp_processor_id(),
2987 2711 get_work_cwq(&dwork->work)->wq, &dwork->work);
2988 do { 2712 return flush_work_sync(&dwork->work);
2989 ret = try_to_grab_pending(&dwork->work, true, &flags);
2990 } while (unlikely(ret == -EAGAIN));
2991
2992 if (unlikely(ret < 0))
2993 return false;
2994
2995 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
2996 local_irq_restore(flags);
2997 return ret;
2998} 2713}
2999EXPORT_SYMBOL(cancel_delayed_work); 2714EXPORT_SYMBOL(flush_delayed_work_sync);
3000 2715
3001/** 2716/**
3002 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish 2717 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
@@ -3009,55 +2724,39 @@ EXPORT_SYMBOL(cancel_delayed_work);
3009 */ 2724 */
3010bool cancel_delayed_work_sync(struct delayed_work *dwork) 2725bool cancel_delayed_work_sync(struct delayed_work *dwork)
3011{ 2726{
3012 return __cancel_work_timer(&dwork->work, true); 2727 return __cancel_work_timer(&dwork->work, &dwork->timer);
3013} 2728}
3014EXPORT_SYMBOL(cancel_delayed_work_sync); 2729EXPORT_SYMBOL(cancel_delayed_work_sync);
3015 2730
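The delayed-work helpers are normally paired as arm, optionally re-arm from the handler, and stop with cancel_delayed_work_sync(), which also copes with a self-rearming handler. A self-contained sketch using hypothetical names (poll_dwork, POLL_PERIOD):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define POLL_PERIOD     msecs_to_jiffies(500)   /* hypothetical period */

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_dwork, poll_fn);

static void poll_fn(struct work_struct *work)
{
        /* ... poll the hardware (elided) ... */

        /* self-rearm; poll_stop() below breaks this loop */
        schedule_delayed_work(&poll_dwork, POLL_PERIOD);
}

static void poll_start(void)
{
        schedule_delayed_work(&poll_dwork, POLL_PERIOD);
}

static void poll_stop(void)
{
        /* kills the pending timer/work and waits if the handler is running */
        cancel_delayed_work_sync(&poll_dwork);
}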
3016/** 2731/**
3017 * schedule_work_on - put work task on a specific cpu
3018 * @cpu: cpu to put the work task on
3019 * @work: job to be done
3020 *
3021 * This puts a job on a specific cpu
3022 */
3023bool schedule_work_on(int cpu, struct work_struct *work)
3024{
3025 return queue_work_on(cpu, system_wq, work);
3026}
3027EXPORT_SYMBOL(schedule_work_on);
3028
3029/**
3030 * schedule_work - put work task in global workqueue 2732 * schedule_work - put work task in global workqueue
3031 * @work: job to be done 2733 * @work: job to be done
3032 * 2734 *
3033 * Returns %false if @work was already on the kernel-global workqueue and 2735 * Returns zero if @work was already on the kernel-global workqueue and
3034 * %true otherwise. 2736 * non-zero otherwise.
3035 * 2737 *
3036 * This puts a job in the kernel-global workqueue if it was not already 2738 * This puts a job in the kernel-global workqueue if it was not already
3037 * queued and leaves it in the same position on the kernel-global 2739 * queued and leaves it in the same position on the kernel-global
3038 * workqueue otherwise. 2740 * workqueue otherwise.
3039 */ 2741 */
3040bool schedule_work(struct work_struct *work) 2742int schedule_work(struct work_struct *work)
3041{ 2743{
3042 return queue_work(system_wq, work); 2744 return queue_work(system_wq, work);
3043} 2745}
3044EXPORT_SYMBOL(schedule_work); 2746EXPORT_SYMBOL(schedule_work);
3045 2747
3046/** 2748/*
3047 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 2749 * schedule_work_on - put work task on a specific cpu
3048 * @cpu: cpu to use 2750 * @cpu: cpu to put the work task on
3049 * @dwork: job to be done 2751 * @work: job to be done
3050 * @delay: number of jiffies to wait
3051 * 2752 *
3052 * After waiting for a given time this puts a job in the kernel-global 2753 * This puts a job on a specific cpu
3053 * workqueue on the specified CPU.
3054 */ 2754 */
3055bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, 2755int schedule_work_on(int cpu, struct work_struct *work)
3056 unsigned long delay)
3057{ 2756{
3058 return queue_delayed_work_on(cpu, system_wq, dwork, delay); 2757 return queue_work_on(cpu, system_wq, work);
3059} 2758}
3060EXPORT_SYMBOL(schedule_delayed_work_on); 2759EXPORT_SYMBOL(schedule_work_on);
3061 2760
3062/** 2761/**
3063 * schedule_delayed_work - put work task in global workqueue after delay 2762 * schedule_delayed_work - put work task in global workqueue after delay
@@ -3067,13 +2766,30 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
3067 * After waiting for a given time this puts a job in the kernel-global 2766 * After waiting for a given time this puts a job in the kernel-global
3068 * workqueue. 2767 * workqueue.
3069 */ 2768 */
3070bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 2769int schedule_delayed_work(struct delayed_work *dwork,
2770 unsigned long delay)
3071{ 2771{
3072 return queue_delayed_work(system_wq, dwork, delay); 2772 return queue_delayed_work(system_wq, dwork, delay);
3073} 2773}
3074EXPORT_SYMBOL(schedule_delayed_work); 2774EXPORT_SYMBOL(schedule_delayed_work);
3075 2775
3076/** 2776/**
2777 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2778 * @cpu: cpu to use
2779 * @dwork: job to be done
2780 * @delay: number of jiffies to wait
2781 *
2782 * After waiting for a given time this puts a job in the kernel-global
2783 * workqueue on the specified CPU.
2784 */
2785int schedule_delayed_work_on(int cpu,
2786 struct delayed_work *dwork, unsigned long delay)
2787{
2788 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2789}
2790EXPORT_SYMBOL(schedule_delayed_work_on);
2791
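As the wrapper bodies above show, the schedule_*() helpers simply target system_wq; queue_work() and queue_work_on() with an explicit workqueue are the general forms. A short equivalence sketch (my caller assumes the work_struct was already initialised with INIT_WORK()):

#include <linux/workqueue.h>

static void submit_on_global_wq(struct work_struct *w)
{
        schedule_work(w);               /* same as queue_work(system_wq, w) */
}

static void submit_on_cpu0(struct work_struct *w)
{
        schedule_work_on(0, w);         /* same as queue_work_on(0, system_wq, w) */
}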
2792/**
3077 * schedule_on_each_cpu - execute a function synchronously on each online CPU 2793 * schedule_on_each_cpu - execute a function synchronously on each online CPU
3078 * @func: the function to call 2794 * @func: the function to call
3079 * 2795 *
@@ -3181,8 +2897,13 @@ static int alloc_cwqs(struct workqueue_struct *wq)
3181 const size_t size = sizeof(struct cpu_workqueue_struct); 2897 const size_t size = sizeof(struct cpu_workqueue_struct);
3182 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2898 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3183 __alignof__(unsigned long long)); 2899 __alignof__(unsigned long long));
2900#ifdef CONFIG_SMP
2901 bool percpu = !(wq->flags & WQ_UNBOUND);
2902#else
2903 bool percpu = false;
2904#endif
3184 2905
3185 if (!(wq->flags & WQ_UNBOUND)) 2906 if (percpu)
3186 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2907 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
3187 else { 2908 else {
3188 void *ptr; 2909 void *ptr;
@@ -3206,7 +2927,13 @@ static int alloc_cwqs(struct workqueue_struct *wq)
3206 2927
3207static void free_cwqs(struct workqueue_struct *wq) 2928static void free_cwqs(struct workqueue_struct *wq)
3208{ 2929{
3209 if (!(wq->flags & WQ_UNBOUND)) 2930#ifdef CONFIG_SMP
2931 bool percpu = !(wq->flags & WQ_UNBOUND);
2932#else
2933 bool percpu = false;
2934#endif
2935
2936 if (percpu)
3210 free_percpu(wq->cpu_wq.pcpu); 2937 free_percpu(wq->cpu_wq.pcpu);
3211 else if (wq->cpu_wq.single) { 2938 else if (wq->cpu_wq.single) {
3212 /* the pointer to free is stored right after the cwq */ 2939 /* the pointer to free is stored right after the cwq */
@@ -3220,35 +2947,21 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
3220 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; 2947 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
3221 2948
3222 if (max_active < 1 || max_active > lim) 2949 if (max_active < 1 || max_active > lim)
3223 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", 2950 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
3224 max_active, name, 1, lim); 2951 "is out of range, clamping between %d and %d\n",
2952 max_active, name, 1, lim);
3225 2953
3226 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
3227} 2955}
3228 2956
3229struct workqueue_struct *__alloc_workqueue_key(const char *fmt, 2957struct workqueue_struct *__alloc_workqueue_key(const char *name,
3230 unsigned int flags, 2958 unsigned int flags,
3231 int max_active, 2959 int max_active,
3232 struct lock_class_key *key, 2960 struct lock_class_key *key,
3233 const char *lock_name, ...) 2961 const char *lock_name)
3234{ 2962{
3235 va_list args, args1;
3236 struct workqueue_struct *wq; 2963 struct workqueue_struct *wq;
3237 unsigned int cpu; 2964 unsigned int cpu;
3238 size_t namelen;
3239
3240 /* determine namelen, allocate wq and format name */
3241 va_start(args, lock_name);
3242 va_copy(args1, args);
3243 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3244
3245 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
3246 if (!wq)
3247 goto err;
3248
3249 vsnprintf(wq->name, namelen, fmt, args1);
3250 va_end(args);
3251 va_end(args1);
3252 2965
3253 /* 2966 /*
3254 * Workqueues which may be used during memory reclaim should 2967 * Workqueues which may be used during memory reclaim should
@@ -3257,10 +2970,20 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3257 if (flags & WQ_MEM_RECLAIM) 2970 if (flags & WQ_MEM_RECLAIM)
3258 flags |= WQ_RESCUER; 2971 flags |= WQ_RESCUER;
3259 2972
2973 /*
2974 * Unbound workqueues aren't concurrency managed and should be
2975 * dispatched to workers immediately.
2976 */
2977 if (flags & WQ_UNBOUND)
2978 flags |= WQ_HIGHPRI;
2979
3260 max_active = max_active ?: WQ_DFL_ACTIVE; 2980 max_active = max_active ?: WQ_DFL_ACTIVE;
3261 max_active = wq_clamp_max_active(max_active, flags, wq->name); 2981 max_active = wq_clamp_max_active(max_active, flags, name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
3262 2986
3263 /* init wq */
3264 wq->flags = flags; 2987 wq->flags = flags;
3265 wq->saved_max_active = max_active; 2988 wq->saved_max_active = max_active;
3266 mutex_init(&wq->flush_mutex); 2989 mutex_init(&wq->flush_mutex);
@@ -3268,6 +2991,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3268 INIT_LIST_HEAD(&wq->flusher_queue); 2991 INIT_LIST_HEAD(&wq->flusher_queue);
3269 INIT_LIST_HEAD(&wq->flusher_overflow); 2992 INIT_LIST_HEAD(&wq->flusher_overflow);
3270 2993
2994 wq->name = name;
3271 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3272 INIT_LIST_HEAD(&wq->list); 2996 INIT_LIST_HEAD(&wq->list);
3273 2997
@@ -3277,10 +3001,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3277 for_each_cwq_cpu(cpu, wq) { 3001 for_each_cwq_cpu(cpu, wq) {
3278 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3002 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3279 struct global_cwq *gcwq = get_gcwq(cpu); 3003 struct global_cwq *gcwq = get_gcwq(cpu);
3280 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3281 3004
3282 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3005 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3283 cwq->pool = &gcwq->pools[pool_idx]; 3006 cwq->gcwq = gcwq;
3284 cwq->wq = wq; 3007 cwq->wq = wq;
3285 cwq->flush_color = -1; 3008 cwq->flush_color = -1;
3286 cwq->max_active = max_active; 3009 cwq->max_active = max_active;
@@ -3297,8 +3020,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3297 if (!rescuer) 3020 if (!rescuer)
3298 goto err; 3021 goto err;
3299 3022
3300 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
3301 wq->name);
3302 if (IS_ERR(rescuer->task)) 3024 if (IS_ERR(rescuer->task))
3303 goto err; 3025 goto err;
3304 3026
@@ -3377,26 +3099,6 @@ void destroy_workqueue(struct workqueue_struct *wq)
3377EXPORT_SYMBOL_GPL(destroy_workqueue); 3099EXPORT_SYMBOL_GPL(destroy_workqueue);
3378 3100
3379/** 3101/**
3380 * cwq_set_max_active - adjust max_active of a cwq
3381 * @cwq: target cpu_workqueue_struct
3382 * @max_active: new max_active value.
3383 *
3384 * Set @cwq->max_active to @max_active and activate delayed works if
3385 * increased.
3386 *
3387 * CONTEXT:
3388 * spin_lock_irq(gcwq->lock).
3389 */
3390static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
3391{
3392 cwq->max_active = max_active;
3393
3394 while (!list_empty(&cwq->delayed_works) &&
3395 cwq->nr_active < cwq->max_active)
3396 cwq_activate_first_delayed(cwq);
3397}
3398
3399/**
3400 * workqueue_set_max_active - adjust max_active of a workqueue 3102 * workqueue_set_max_active - adjust max_active of a workqueue
3401 * @wq: target workqueue 3103 * @wq: target workqueue
3402 * @max_active: new max_active value. 3104 * @max_active: new max_active value.
@@ -3423,7 +3125,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3423 3125
3424 if (!(wq->flags & WQ_FREEZABLE) || 3126 if (!(wq->flags & WQ_FREEZABLE) ||
3425 !(gcwq->flags & GCWQ_FREEZING)) 3127 !(gcwq->flags & GCWQ_FREEZING))
3426 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3128 get_cwq(gcwq->cpu, wq)->max_active = max_active;
3427 3129
3428 spin_unlock_irq(&gcwq->lock); 3130 spin_unlock_irq(&gcwq->lock);
3429 } 3131 }
@@ -3487,7 +3189,7 @@ unsigned int work_busy(struct work_struct *work)
3487 unsigned int ret = 0; 3189 unsigned int ret = 0;
3488 3190
3489 if (!gcwq) 3191 if (!gcwq)
3490 return 0; 3192 return false;
3491 3193
3492 spin_lock_irqsave(&gcwq->lock, flags); 3194 spin_lock_irqsave(&gcwq->lock, flags);
3493 3195
@@ -3512,159 +3214,386 @@ EXPORT_SYMBOL_GPL(work_busy);
3512 * gcwqs serve mix of short, long and very long running works making 3214 * gcwqs serve mix of short, long and very long running works making
3513 * blocked draining impractical. 3215 * blocked draining impractical.
3514 * 3216 *
3515 * This is solved by allowing a gcwq to be disassociated from the CPU 3217 * This is solved by allowing a gcwq to be detached from CPU, running
3516 * running as an unbound one and allowing it to be reattached later if the 3218 * it with unbound (rogue) workers and allowing it to be reattached
3517 * cpu comes back online. 3219 * later if the cpu comes back online. A separate thread is created
3220 * to govern a gcwq in such state and is called the trustee of the
3221 * gcwq.
3222 *
3223 * Trustee states and their descriptions.
3224 *
3225 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3226 * new trustee is started with this state.
3227 *
3228 * IN_CHARGE Once started, trustee will enter this state after
3229 * assuming the manager role and making all existing
3230 * workers rogue. DOWN_PREPARE waits for trustee to
3231 * enter this state. After reaching IN_CHARGE, trustee
3232 * tries to execute the pending worklist until it's empty
3233 * and the state is set to BUTCHER, or the state is set
3234 * to RELEASE.
3235 *
3236 * BUTCHER Command state which is set by the cpu callback after
 3237 * the cpu has gone down. Once this state is set, the trustee
3238 * knows that there will be no new works on the worklist
3239 * and once the worklist is empty it can proceed to
3240 * killing idle workers.
3241 *
3242 * RELEASE Command state which is set by the cpu callback if the
3243 * cpu down has been canceled or it has come online
3244 * again. After recognizing this state, trustee stops
3245 * trying to drain or butcher and clears ROGUE, rebinds
3246 * all remaining workers back to the cpu and releases
3247 * manager role.
3248 *
3249 * DONE Trustee will enter this state after BUTCHER or RELEASE
3250 * is complete.
3251 *
3252 * trustee CPU draining
3253 * took over down complete
3254 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3255 * | | ^
3256 * | CPU is back online v return workers |
3257 * ----------------> RELEASE --------------
3518 */ 3258 */
3519 3259
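The trustee lifecycle in the comment above can be restated as a plain state machine. The sketch below is only a schematic of that diagram; the enum and transition function are illustrative and deliberately use SKETCH_* names rather than the in-tree TRUSTEE_* definitions.

#include <linux/types.h>

/* Schematic of the state diagram above; not the in-tree enum. */
enum trustee_state_sketch {
        SKETCH_START,           /* new trustee started on CPU_DOWN_PREPARE */
        SKETCH_IN_CHARGE,       /* manager role taken, existing workers made rogue */
        SKETCH_BUTCHER,         /* cpu is down: drain worklist, then kill idle workers */
        SKETCH_RELEASE,         /* cpu down cancelled or cpu back online: rebind workers */
        SKETCH_DONE,            /* reached after BUTCHER or RELEASE completes */
};

static enum trustee_state_sketch
sketch_next(enum trustee_state_sketch cur, bool cpu_back_online, bool drained)
{
        switch (cur) {
        case SKETCH_START:
                return SKETCH_IN_CHARGE;        /* trustee took over */
        case SKETCH_IN_CHARGE:
                return cpu_back_online ? SKETCH_RELEASE : SKETCH_BUTCHER;
        case SKETCH_BUTCHER:
                return drained ? SKETCH_DONE : SKETCH_BUTCHER;
        case SKETCH_RELEASE:
        default:
                return SKETCH_DONE;             /* workers returned */
        }
}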
3520/* claim manager positions of all pools */ 3260/**
3521static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) 3261 * trustee_wait_event_timeout - timed event wait for trustee
3522{ 3262 * @cond: condition to wait for
3523 struct worker_pool *pool; 3263 * @timeout: timeout in jiffies
3524 3264 *
3525 for_each_worker_pool(pool, gcwq) 3265 * wait_event_timeout() for trustee to use. Handles locking and
3526 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); 3266 * checks for RELEASE request.
3527 spin_lock_irq(&gcwq->lock); 3267 *
3528} 3268 * CONTEXT:
3529 3269 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3530/* release manager positions */ 3270 * multiple times. To be used by trustee.
3531static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) 3271 *
3532{ 3272 * RETURNS:
3533 struct worker_pool *pool; 3273 * Positive indicating left time if @cond is satisfied, 0 if timed
3274 * out, -1 if canceled.
3275 */
3276#define trustee_wait_event_timeout(cond, timeout) ({ \
3277 long __ret = (timeout); \
3278 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3279 __ret) { \
3280 spin_unlock_irq(&gcwq->lock); \
3281 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3282 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3283 __ret); \
3284 spin_lock_irq(&gcwq->lock); \
3285 } \
3286 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3287})
3534 3288
3535 spin_unlock_irq(&gcwq->lock); 3289/**
3536 for_each_worker_pool(pool, gcwq) 3290 * trustee_wait_event - event wait for trustee
3537 mutex_unlock(&pool->assoc_mutex); 3291 * @cond: condition to wait for
3538} 3292 *
3293 * wait_event() for trustee to use. Automatically handles locking and
3294 * checks for CANCEL request.
3295 *
3296 * CONTEXT:
3297 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3298 * multiple times. To be used by trustee.
3299 *
3300 * RETURNS:
3301 * 0 if @cond is satisfied, -1 if canceled.
3302 */
3303#define trustee_wait_event(cond) ({ \
3304 long __ret1; \
3305 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3306 __ret1 < 0 ? -1 : 0; \
3307})
3539 3308
3540static void gcwq_unbind_fn(struct work_struct *work) 3309static int __cpuinit trustee_thread(void *__gcwq)
3541{ 3310{
3542 struct global_cwq *gcwq = get_gcwq(smp_processor_id()); 3311 struct global_cwq *gcwq = __gcwq;
3543 struct worker_pool *pool;
3544 struct worker *worker; 3312 struct worker *worker;
3313 struct work_struct *work;
3545 struct hlist_node *pos; 3314 struct hlist_node *pos;
3315 long rc;
3546 int i; 3316 int i;
3547 3317
3548 BUG_ON(gcwq->cpu != smp_processor_id()); 3318 BUG_ON(gcwq->cpu != smp_processor_id());
3549 3319
3550 gcwq_claim_assoc_and_lock(gcwq); 3320 spin_lock_irq(&gcwq->lock);
3551
3552 /* 3321 /*
3553 * We've claimed all manager positions. Make all workers unbound 3322 * Claim the manager position and make all workers rogue.
3554 * and set DISASSOCIATED. Before this, all workers except for the 3323 * Trustee must be bound to the target cpu and can't be
-	 * ones which are still executing works from before the last CPU
-	 * down must be on the cpu.  After this, they may become diasporas.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags |= WORKER_UNBOUND;
-
-	for_each_busy_worker(worker, i, pos, gcwq)
-		worker->flags |= WORKER_UNBOUND;
-
-	gcwq->flags |= GCWQ_DISASSOCIATED;
-
-	gcwq_release_assoc_and_unlock(gcwq);
-
-	/*
-	 * Call schedule() so that we cross rq->lock and thus can guarantee
-	 * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
-	 * as scheduler callbacks may be invoked from other cpus.
-	 */
-	schedule();
-
-	/*
-	 * Sched callbacks are disabled now.  Zap nr_running.  After this,
-	 * nr_running stays zero and need_more_worker() and keep_working()
-	 * are always true as long as the worklist is not empty.  @gcwq now
-	 * behaves as unbound (in terms of concurrency management) gcwq
-	 * which is served by workers tied to the CPU.
-	 *
-	 * On return from this function, the current worker would trigger
-	 * unbound chain execution of pending work items if other workers
-	 * didn't already.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		atomic_set(get_pool_nr_running(pool), 0);
-}
+	 * cancelled.
+	 */
+	BUG_ON(gcwq->cpu != smp_processor_id());
+	rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+	BUG_ON(rc < 0);
+
+	gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+	list_for_each_entry(worker, &gcwq->idle_list, entry)
+		worker->flags |= WORKER_ROGUE;
+
+	for_each_busy_worker(worker, i, pos, gcwq)
+		worker->flags |= WORKER_ROGUE;
+
+	/*
+	 * Call schedule() so that we cross rq->lock and thus can
+	 * guarantee sched callbacks see the rogue flag.  This is
+	 * necessary as scheduler callbacks may be invoked from other
+	 * cpus.
+	 */
+	spin_unlock_irq(&gcwq->lock);
+	schedule();
+	spin_lock_irq(&gcwq->lock);
+
+	/*
+	 * Sched callbacks are disabled now.  Zap nr_running.  After
+	 * this, nr_running stays zero and need_more_worker() and
+	 * keep_working() are always true as long as the worklist is
+	 * not empty.
+	 */
+	atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+
+	spin_unlock_irq(&gcwq->lock);
+	del_timer_sync(&gcwq->idle_timer);
+	spin_lock_irq(&gcwq->lock);
+
+	/*
+	 * We're now in charge.  Notify and proceed to drain.  We need
+	 * to keep the gcwq running during the whole CPU down
+	 * procedure as other cpu hotunplug callbacks may need to
+	 * flush currently running tasks.
+	 */
+	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
+	wake_up_all(&gcwq->trustee_wait);
+
+	/*
+	 * The original cpu is in the process of dying and may go away
+	 * anytime now.  When that happens, we and all workers would
+	 * be migrated to other cpus.  Try draining any left work.  We
+	 * want to get it over with ASAP - spam rescuers, wake up as
+	 * many idlers as necessary and create new ones till the
+	 * worklist is empty.  Note that if the gcwq is frozen, there
+	 * may be frozen works in freezable cwqs.  Don't declare
+	 * completion while frozen.
+	 */
+	while (gcwq->nr_workers != gcwq->nr_idle ||
+	       gcwq->flags & GCWQ_FREEZING ||
+	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+		int nr_works = 0;
+
+		list_for_each_entry(work, &gcwq->worklist, entry) {
+			send_mayday(work);
+			nr_works++;
+		}
+
+		list_for_each_entry(worker, &gcwq->idle_list, entry) {
+			if (!nr_works--)
+				break;
+			wake_up_process(worker->task);
+		}
+
+		if (need_to_create_worker(gcwq)) {
+			spin_unlock_irq(&gcwq->lock);
+			worker = create_worker(gcwq, false);
+			spin_lock_irq(&gcwq->lock);
+			if (worker) {
+				worker->flags |= WORKER_ROGUE;
+				start_worker(worker);
+			}
+		}
+
+		/* give a breather */
+		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
+			break;
+	}
+
+	/*
+	 * Either all works have been scheduled and cpu is down, or
+	 * cpu down has already been canceled.  Wait for and butcher
+	 * all workers till we're canceled.
+	 */
+	do {
+		rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
+		while (!list_empty(&gcwq->idle_list))
+			destroy_worker(list_first_entry(&gcwq->idle_list,
+							struct worker, entry));
+	} while (gcwq->nr_workers && rc >= 0);
+
+	/*
+	 * At this point, either draining has completed and no worker
+	 * is left, or cpu down has been canceled or the cpu is being
+	 * brought back up.  There shouldn't be any idle one left.
+	 * Tell the remaining busy ones to rebind once it finishes the
+	 * currently scheduled works by scheduling the rebind_work.
+	 */
+	WARN_ON(!list_empty(&gcwq->idle_list));
+
+	for_each_busy_worker(worker, i, pos, gcwq) {
+		struct work_struct *rebind_work = &worker->rebind_work;
+
+		/*
+		 * Rebind_work may race with future cpu hotplug
+		 * operations.  Use a separate flag to mark that
+		 * rebinding is scheduled.
+		 */
+		worker->flags |= WORKER_REBIND;
+		worker->flags &= ~WORKER_ROGUE;
+
+		/* queue rebind_work, wq doesn't matter, use the default one */
+		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+				     work_data_bits(rebind_work)))
+			continue;
+
+		debug_work_activate(rebind_work);
+		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+			    worker->scheduled.next,
+			    work_color_to_flags(WORK_NO_COLOR));
+	}
+
+	/* relinquish manager role */
+	gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+	/* notify completion */
+	gcwq->trustee = NULL;
+	gcwq->trustee_state = TRUSTEE_DONE;
+	wake_up_all(&gcwq->trustee_wait);
+	spin_unlock_irq(&gcwq->lock);
+	return 0;
+}
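
The trustee restored above is an ordinary CPU-bound kernel thread: it is created ahead of the hot-unplug, pinned with kthread_bind() and only then woken. A minimal sketch of that kthread pattern, using hypothetical names (my_thread_fn, start_bound_thread) rather than the workqueue internals, could look like:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
	/* per-cpu teardown or draining work would go here */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int start_bound_thread(void *data, unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, data, "my_thread/%u", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);
	kthread_bind(tsk, cpu);		/* bind before the first wakeup */
	wake_up_process(tsk);
	return 0;
}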
-
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
-					       unsigned long action,
-					       void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct global_cwq *gcwq = get_gcwq(cpu);
-	struct worker_pool *pool;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_UP_PREPARE:
-		for_each_worker_pool(pool, gcwq) {
-			struct worker *worker;
-
-			if (pool->nr_workers)
-				continue;
-
-			worker = create_worker(pool);
-			if (!worker)
-				return NOTIFY_BAD;
-
-			spin_lock_irq(&gcwq->lock);
-			start_worker(worker);
-			spin_unlock_irq(&gcwq->lock);
-		}
-		break;
-
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		gcwq_claim_assoc_and_lock(gcwq);
-		gcwq->flags &= ~GCWQ_DISASSOCIATED;
-		rebind_workers(gcwq);
-		gcwq_release_assoc_and_unlock(gcwq);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
-						 unsigned long action,
-						 void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct work_struct unbind_work;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		/* unbinding should happen on the local CPU */
-		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
-		queue_work_on(cpu, system_highpri_wq, &unbind_work);
-		flush_work(&unbind_work);
-		break;
-	}
-	return NOTIFY_OK;
-}
+
+/**
+ * wait_trustee_state - wait for trustee to enter the specified state
+ * @gcwq: gcwq the trustee of interest belongs to
+ * @state: target state to wait for
+ *
+ * Wait for the trustee to reach @state.  DONE is already matched.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by cpu_callback.
+ */
+static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+__releases(&gcwq->lock)
+__acquires(&gcwq->lock)
+{
+	if (!(gcwq->trustee_state == state ||
+	      gcwq->trustee_state == TRUSTEE_DONE)) {
+		spin_unlock_irq(&gcwq->lock);
+		__wait_event(gcwq->trustee_wait,
+			     gcwq->trustee_state == state ||
+			     gcwq->trustee_state == TRUSTEE_DONE);
+		spin_lock_irq(&gcwq->lock);
+	}
+}
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+					    unsigned long action,
+					    void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct global_cwq *gcwq = get_gcwq(cpu);
+	struct task_struct *new_trustee = NULL;
+	struct worker *uninitialized_var(new_worker);
+	unsigned long flags;
+
+	action &= ~CPU_TASKS_FROZEN;
+
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		new_trustee = kthread_create(trustee_thread, gcwq,
+					     "workqueue_trustee/%d\n", cpu);
+		if (IS_ERR(new_trustee))
+			return notifier_from_errno(PTR_ERR(new_trustee));
+		kthread_bind(new_trustee, cpu);
+		/* fall through */
+	case CPU_UP_PREPARE:
+		BUG_ON(gcwq->first_idle);
+		new_worker = create_worker(gcwq, false);
+		if (!new_worker) {
+			if (new_trustee)
+				kthread_stop(new_trustee);
+			return NOTIFY_BAD;
+		}
+	}
+
+	/* some are called w/ irq disabled, don't disturb irq status */
+	spin_lock_irqsave(&gcwq->lock, flags);
+
+	switch (action) {
+	case CPU_DOWN_PREPARE:
+		/* initialize trustee and tell it to acquire the gcwq */
+		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
+		gcwq->trustee = new_trustee;
+		gcwq->trustee_state = TRUSTEE_START;
+		wake_up_process(gcwq->trustee);
+		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+		/* fall through */
+	case CPU_UP_PREPARE:
+		BUG_ON(gcwq->first_idle);
+		gcwq->first_idle = new_worker;
+		break;
+
+	case CPU_DYING:
+		/*
+		 * Before this, the trustee and all workers except for
+		 * the ones which are still executing works from
+		 * before the last CPU down must be on the cpu.  After
+		 * this, they'll all be diasporas.
+		 */
+		gcwq->flags |= GCWQ_DISASSOCIATED;
+		break;
+
+	case CPU_POST_DEAD:
+		gcwq->trustee_state = TRUSTEE_BUTCHER;
+		/* fall through */
+	case CPU_UP_CANCELED:
+		destroy_worker(gcwq->first_idle);
+		gcwq->first_idle = NULL;
+		break;
+
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		gcwq->flags &= ~GCWQ_DISASSOCIATED;
+		if (gcwq->trustee_state != TRUSTEE_DONE) {
+			gcwq->trustee_state = TRUSTEE_RELEASE;
+			wake_up_process(gcwq->trustee);
+			wait_trustee_state(gcwq, TRUSTEE_DONE);
+		}
+
+		/*
+		 * Trustee is done and there might be no worker left.
+		 * Put the first_idle in and request a real manager to
+		 * take a look.
+		 */
+		spin_unlock_irq(&gcwq->lock);
+		kthread_bind(gcwq->first_idle->task, cpu);
+		spin_lock_irq(&gcwq->lock);
+		gcwq->flags |= GCWQ_MANAGE_WORKERS;
+		start_worker(gcwq->first_idle);
+		gcwq->first_idle = NULL;
+		break;
+	}
+
+	spin_unlock_irqrestore(&gcwq->lock, flags);
+
+	return notifier_from_errno(0);
+}
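
Both sides of this change hang off the same pre-cpuhp-state hotplug mechanism: a notifier_block callback receives CPU_* actions, possibly with CPU_TASKS_FROZEN set. A bare-bones sketch of such a callback, with hypothetical names (my_cpu_callback, my_cpu_init) and no workqueue specifics, might be:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_cpu_callback(struct notifier_block *nfb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		pr_debug("preparing cpu %u\n", cpu);	/* allocate per-cpu state */
		break;
	case CPU_DOWN_PREPARE:
		pr_debug("quiescing cpu %u\n", cpu);	/* stop using the cpu */
		break;
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		pr_debug("cpu %u usable again\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static int __init my_cpu_init(void)
{
	hotcpu_notifier(my_cpu_callback, 0);	/* 0 = default priority */
	return 0;
}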
 
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
-	struct work_struct work;
+	struct completion completion;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static void work_for_cpu_fn(struct work_struct *work)
+static int do_work_for_cpu(void *_wfc)
 {
-	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
-
+	struct work_for_cpu *wfc = _wfc;
 	wfc->ret = wfc->fn(wfc->arg);
+	complete(&wfc->completion);
+	return 0;
 }
 
 /**
@@ -3679,11 +3608,19 @@ static void work_for_cpu_fn(struct work_struct *work)
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+	struct task_struct *sub_thread;
+	struct work_for_cpu wfc = {
+		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+		.fn = fn,
+		.arg = arg,
+	};
 
-	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
-	schedule_work_on(cpu, &wfc.work);
-	flush_work(&wfc.work);
+	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+	if (IS_ERR(sub_thread))
+		return PTR_ERR(sub_thread);
+	kthread_bind(sub_thread, cpu);
+	wake_up_process(sub_thread);
+	wait_for_completion(&wfc.completion);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
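
work_on_cpu() keeps the same interface on both sides of the change: it runs a function on the requested CPU from process context and sleeps until the long result is available (without CONFIG_SMP it simply calls the function directly). A hypothetical caller, with made-up names (square_on_cpu_fn, square_on_cpu), could look like:

#include <linux/workqueue.h>

static long square_on_cpu_fn(void *arg)
{
	long v = *(long *)arg;

	return v * v;			/* executes on the requested CPU */
}

static long square_on_cpu(unsigned int cpu, long v)
{
	/* may sleep; must not be called from atomic context */
	return work_on_cpu(cpu, square_on_cpu_fn, &v);
}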
@@ -3798,7 +3735,6 @@ void thaw_workqueues(void)
 
 	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker_pool *pool;
 		struct workqueue_struct *wq;
 
 		spin_lock_irq(&gcwq->lock);
@@ -3813,11 +3749,14 @@ void thaw_workqueues(void)
 				continue;
 
 			/* restore max_active and repopulate worklist */
-			cwq_set_max_active(cwq, wq->saved_max_active);
+			cwq->max_active = wq->saved_max_active;
+
+			while (!list_empty(&cwq->delayed_works) &&
+			       cwq->nr_active < cwq->max_active)
+				cwq_activate_first_delayed(cwq);
 		}
 
-		for_each_worker_pool(pool, gcwq)
-			wake_up_worker(pool);
+		wake_up_worker(gcwq);
 
 		spin_unlock_irq(&gcwq->lock);
 	}
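
thaw_workqueues() above re-activates work that was parked while the system was frozen; only workqueues created with WQ_FREEZABLE take part in that freeze/thaw cycle. A sketch of creating and using such a workqueue, with hypothetical names (my_wq, my_work_fn), could be:

#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/init.h>

static void my_work_fn(struct work_struct *work)
{
	/* never runs while workqueues are frozen for suspend/hibernate */
}

static DECLARE_WORK(my_work, my_work_fn);
static struct workqueue_struct *my_wq;

static int __init my_wq_init(void)
{
	my_wq = alloc_workqueue("my_freezable", WQ_FREEZABLE, 0);
	if (!my_wq)
		return -ENOMEM;
	/* held back while frozen, re-activated by thaw_workqueues() */
	queue_work(my_wq, &my_work);
	return 0;
}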
@@ -3833,69 +3772,56 @@ static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	/* make sure we have enough bits for OFFQ CPU number */
-	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
-		     WORK_CPU_LAST);
-
-	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker_pool *pool;
 
 		spin_lock_init(&gcwq->lock);
+		INIT_LIST_HEAD(&gcwq->worklist);
 		gcwq->cpu = cpu;
 		gcwq->flags |= GCWQ_DISASSOCIATED;
 
+		INIT_LIST_HEAD(&gcwq->idle_list);
 		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
 			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
 
-		for_each_worker_pool(pool, gcwq) {
-			pool->gcwq = gcwq;
-			INIT_LIST_HEAD(&pool->worklist);
-			INIT_LIST_HEAD(&pool->idle_list);
-
-			init_timer_deferrable(&pool->idle_timer);
-			pool->idle_timer.function = idle_worker_timeout;
-			pool->idle_timer.data = (unsigned long)pool;
-
-			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
-				    (unsigned long)pool);
-
-			mutex_init(&pool->assoc_mutex);
-			ida_init(&pool->worker_ida);
-		}
+		init_timer_deferrable(&gcwq->idle_timer);
+		gcwq->idle_timer.function = idle_worker_timeout;
+		gcwq->idle_timer.data = (unsigned long)gcwq;
+
+		setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+			    (unsigned long)gcwq);
+
+		ida_init(&gcwq->worker_ida);
+
+		gcwq->trustee_state = TRUSTEE_DONE;
+		init_waitqueue_head(&gcwq->trustee_wait);
 	}
 
 	/* create the initial worker */
 	for_each_online_gcwq_cpu(cpu) {
 		struct global_cwq *gcwq = get_gcwq(cpu);
-		struct worker_pool *pool;
+		struct worker *worker;
 
 		if (cpu != WORK_CPU_UNBOUND)
 			gcwq->flags &= ~GCWQ_DISASSOCIATED;
-
-		for_each_worker_pool(pool, gcwq) {
-			struct worker *worker;
-
-			worker = create_worker(pool);
-			BUG_ON(!worker);
-			spin_lock_irq(&gcwq->lock);
-			start_worker(worker);
-			spin_unlock_irq(&gcwq->lock);
-		}
+		worker = create_worker(gcwq, true);
+		BUG_ON(!worker);
+		spin_lock_irq(&gcwq->lock);
+		start_worker(worker);
+		spin_unlock_irq(&gcwq->lock);
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
-	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
+	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
 	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
 					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
-	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
 	       !system_unbound_wq || !system_freezable_wq);
 	return 0;
 }
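
init_workqueues() ends by creating the shared system workqueues that drivers queue onto. A sketch of typical use of one of them, with a hypothetical work item (heavy_work) and handler (heavy_fn):

#include <linux/workqueue.h>

static void heavy_fn(struct work_struct *work)
{
	/* long-running job: the unbound wq skips per-cpu concurrency management */
}

static DECLARE_WORK(heavy_work, heavy_fn);

static void kick_heavy_work(void)
{
	/* runs on whichever CPU has a free worker first */
	queue_work(system_unbound_wq, &heavy_work);
}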