Diffstat (limited to 'kernel')
70 files changed, 2218 insertions, 2580 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ | |||
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o groups.o lglock.o smpboot.o | 13 | async.o range.o groups.o lglock.o smpboot.o |
@@ -25,9 +25,7 @@ endif | |||
25 | obj-y += sched/ | 25 | obj-y += sched/ |
26 | obj-y += power/ | 26 | obj-y += power/ |
27 | 27 | ||
28 | ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) | 28 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
29 | obj-$(CONFIG_X86) += kcmp.o | ||
30 | endif | ||
31 | obj-$(CONFIG_FREEZER) += freezer.o | 29 | obj-$(CONFIG_FREEZER) += freezer.o |
32 | obj-$(CONFIG_PROFILING) += profile.o | 30 | obj-$(CONFIG_PROFILING) += profile.o |
33 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 31 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
@@ -127,11 +125,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE | |||
127 | 125 | ||
128 | $(obj)/time.o: $(obj)/timeconst.h | 126 | $(obj)/time.o: $(obj)/timeconst.h |
129 | 127 | ||
130 | quiet_cmd_timeconst = TIMEC $@ | 128 | quiet_cmd_hzfile = HZFILE $@ |
131 | cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ | 129 | cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ |
130 | |||
131 | targets += hz.bc | ||
132 | $(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE | ||
133 | $(call if_changed,hzfile) | ||
134 | |||
135 | quiet_cmd_bc = BC $@ | ||
136 | cmd_bc = bc -q $(filter-out FORCE,$^) > $@ | ||
137 | |||
132 | targets += timeconst.h | 138 | targets += timeconst.h |
133 | $(obj)/timeconst.h: $(src)/timeconst.pl FORCE | 139 | $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE |
134 | $(call if_changed,timeconst) | 140 | $(call if_changed,bc) |
135 | 141 | ||
136 | ifeq ($(CONFIG_MODULE_SIG),y) | 142 | ifeq ($(CONFIG_MODULE_SIG),y) |
137 | # | 143 | # |
@@ -153,23 +159,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates | |||
153 | # fail and that the kernel may be used afterwards. | 159 | # fail and that the kernel may be used afterwards. |
154 | # | 160 | # |
155 | ############################################################################### | 161 | ############################################################################### |
156 | sign_key_with_hash := | 162 | ifndef CONFIG_MODULE_SIG_HASH |
157 | ifeq ($(CONFIG_MODULE_SIG_SHA1),y) | ||
158 | sign_key_with_hash := -sha1 | ||
159 | endif | ||
160 | ifeq ($(CONFIG_MODULE_SIG_SHA224),y) | ||
161 | sign_key_with_hash := -sha224 | ||
162 | endif | ||
163 | ifeq ($(CONFIG_MODULE_SIG_SHA256),y) | ||
164 | sign_key_with_hash := -sha256 | ||
165 | endif | ||
166 | ifeq ($(CONFIG_MODULE_SIG_SHA384),y) | ||
167 | sign_key_with_hash := -sha384 | ||
168 | endif | ||
169 | ifeq ($(CONFIG_MODULE_SIG_SHA512),y) | ||
170 | sign_key_with_hash := -sha512 | ||
171 | endif | ||
172 | ifeq ($(sign_key_with_hash),) | ||
173 | $(error Could not determine digest type to use from kernel config) | 163 | $(error Could not determine digest type to use from kernel config) |
174 | endif | 164 | endif |
175 | 165 | ||
@@ -182,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey | |||
182 | @echo "### needs to be run as root, and uses a hardware random" | 172 | @echo "### needs to be run as root, and uses a hardware random" |
183 | @echo "### number generator if one is available." | 173 | @echo "### number generator if one is available." |
184 | @echo "###" | 174 | @echo "###" |
185 | openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ | 175 | openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ |
186 | -x509 -config x509.genkey \ | 176 | -batch -x509 -config x509.genkey \ |
187 | -outform DER -out signing_key.x509 \ | 177 | -outform DER -out signing_key.x509 \ |
188 | -keyout signing_key.priv | 178 | -keyout signing_key.priv |
189 | @echo "###" | 179 | @echo "###" |
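The Makefile hunk above replaces the x86-only ifeq block with a plain obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o, so kcmp is built on any architecture that enables checkpoint/restore. For context, a minimal userspace sketch of calling kcmp(2) follows; it is hypothetical illustration, not part of this commit, and assumes the libc headers expose SYS_kcmp and <linux/kcmp.h>.

/*
 * Hypothetical userspace sketch (not from this commit): exercising kcmp(2),
 * whose object file the Makefile hunk above now builds for all architectures
 * with CONFIG_CHECKPOINT_RESTORE=y.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/kcmp.h>

static long kcmp(pid_t pid1, pid_t pid2, int type,
		 unsigned long idx1, unsigned long idx2)
{
	/* no glibc wrapper at the time, so go through syscall(2) */
	return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
}

int main(void)
{
	pid_t self = getpid();

	/* compare fd 0 and fd 1 of the current process */
	long ret = kcmp(self, self, KCMP_FILE, 0, 1);

	/* 0 means both fds share the same struct file */
	printf("kcmp(KCMP_FILE, 0, 1) = %ld\n", ret);
	return 0;
}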
diff --git a/kernel/acct.c b/kernel/acct.c
index e8b1627ab9c7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname) | |||
205 | if (IS_ERR(file)) | 205 | if (IS_ERR(file)) |
206 | return PTR_ERR(file); | 206 | return PTR_ERR(file); |
207 | 207 | ||
208 | if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { | 208 | if (!S_ISREG(file_inode(file)->i_mode)) { |
209 | filp_close(file, NULL); | 209 | filp_close(file, NULL); |
210 | return -EACCES; | 210 | return -EACCES; |
211 | } | 211 | } |
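The acct.c hunk above, and several cgroup.c hunks below, switch from open-coded file->f_path.dentry->d_inode chains to the file_inode() helper. A minimal sketch of the same idiom follows, using a hypothetical helper name that is not taken from this commit.

/*
 * Minimal sketch (hypothetical helper, not from this commit) of the
 * file_inode() idiom the hunk above converts to.
 */
#include <linux/fs.h>

static bool example_is_regular(struct file *file)
{
	/* file_inode() returns the inode cached in the struct file at open
	 * time, avoiding the manual dentry dereference. */
	return S_ISREG(file_inode(file)->i_mode);
}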
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@ | |||
52 | #include <linux/module.h> | 52 | #include <linux/module.h> |
53 | #include <linux/delayacct.h> | 53 | #include <linux/delayacct.h> |
54 | #include <linux/cgroupstats.h> | 54 | #include <linux/cgroupstats.h> |
55 | #include <linux/hash.h> | 55 | #include <linux/hashtable.h> |
56 | #include <linux/namei.h> | 56 | #include <linux/namei.h> |
57 | #include <linux/pid_namespace.h> | 57 | #include <linux/pid_namespace.h> |
58 | #include <linux/idr.h> | 58 | #include <linux/idr.h> |
@@ -376,22 +376,18 @@ static int css_set_count; | |||
376 | * account cgroups in empty hierarchies. | 376 | * account cgroups in empty hierarchies. |
377 | */ | 377 | */ |
378 | #define CSS_SET_HASH_BITS 7 | 378 | #define CSS_SET_HASH_BITS 7 |
379 | #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) | 379 | static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); |
380 | static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; | ||
381 | 380 | ||
382 | static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | 381 | static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) |
383 | { | 382 | { |
384 | int i; | 383 | int i; |
385 | int index; | 384 | unsigned long key = 0UL; |
386 | unsigned long tmp = 0UL; | ||
387 | 385 | ||
388 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) | 386 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) |
389 | tmp += (unsigned long)css[i]; | 387 | key += (unsigned long)css[i]; |
390 | tmp = (tmp >> 16) ^ tmp; | 388 | key = (key >> 16) ^ key; |
391 | 389 | ||
392 | index = hash_long(tmp, CSS_SET_HASH_BITS); | 390 | return key; |
393 | |||
394 | return &css_set_table[index]; | ||
395 | } | 391 | } |
396 | 392 | ||
397 | /* We don't maintain the lists running through each css_set to its | 393 | /* We don't maintain the lists running through each css_set to its |
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
418 | } | 414 | } |
419 | 415 | ||
420 | /* This css_set is dead. unlink it and release cgroup refcounts */ | 416 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
421 | hlist_del(&cg->hlist); | 417 | hash_del(&cg->hlist); |
422 | css_set_count--; | 418 | css_set_count--; |
423 | 419 | ||
424 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | 420 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, |
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
426 | struct cgroup *cgrp = link->cgrp; | 422 | struct cgroup *cgrp = link->cgrp; |
427 | list_del(&link->cg_link_list); | 423 | list_del(&link->cg_link_list); |
428 | list_del(&link->cgrp_link_list); | 424 | list_del(&link->cgrp_link_list); |
425 | |||
426 | /* | ||
427 | * We may not be holding cgroup_mutex, and if cgrp->count is | ||
428 | * dropped to 0 the cgroup can be destroyed at any time, hence | ||
429 | * rcu_read_lock is used to keep it alive. | ||
430 | */ | ||
431 | rcu_read_lock(); | ||
429 | if (atomic_dec_and_test(&cgrp->count) && | 432 | if (atomic_dec_and_test(&cgrp->count) && |
430 | notify_on_release(cgrp)) { | 433 | notify_on_release(cgrp)) { |
431 | if (taskexit) | 434 | if (taskexit) |
432 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 435 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
433 | check_for_release(cgrp); | 436 | check_for_release(cgrp); |
434 | } | 437 | } |
438 | rcu_read_unlock(); | ||
435 | 439 | ||
436 | kfree(link); | 440 | kfree(link); |
437 | } | 441 | } |
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set( | |||
550 | { | 554 | { |
551 | int i; | 555 | int i; |
552 | struct cgroupfs_root *root = cgrp->root; | 556 | struct cgroupfs_root *root = cgrp->root; |
553 | struct hlist_head *hhead; | ||
554 | struct hlist_node *node; | ||
555 | struct css_set *cg; | 557 | struct css_set *cg; |
558 | unsigned long key; | ||
556 | 559 | ||
557 | /* | 560 | /* |
558 | * Build the set of subsystem state objects that we want to see in the | 561 | * Build the set of subsystem state objects that we want to see in the |
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set( | |||
572 | } | 575 | } |
573 | } | 576 | } |
574 | 577 | ||
575 | hhead = css_set_hash(template); | 578 | key = css_set_hash(template); |
576 | hlist_for_each_entry(cg, node, hhead, hlist) { | 579 | hash_for_each_possible(css_set_table, cg, hlist, key) { |
577 | if (!compare_css_sets(cg, oldcg, cgrp, template)) | 580 | if (!compare_css_sets(cg, oldcg, cgrp, template)) |
578 | continue; | 581 | continue; |
579 | 582 | ||
@@ -657,8 +660,8 @@ static struct css_set *find_css_set( | |||
657 | 660 | ||
658 | struct list_head tmp_cg_links; | 661 | struct list_head tmp_cg_links; |
659 | 662 | ||
660 | struct hlist_head *hhead; | ||
661 | struct cg_cgroup_link *link; | 663 | struct cg_cgroup_link *link; |
664 | unsigned long key; | ||
662 | 665 | ||
663 | /* First see if we already have a cgroup group that matches | 666 | /* First see if we already have a cgroup group that matches |
664 | * the desired set */ | 667 | * the desired set */ |
@@ -704,8 +707,8 @@ static struct css_set *find_css_set( | |||
704 | css_set_count++; | 707 | css_set_count++; |
705 | 708 | ||
706 | /* Add this cgroup group to the hash table */ | 709 | /* Add this cgroup group to the hash table */ |
707 | hhead = css_set_hash(res->subsys); | 710 | key = css_set_hash(res->subsys); |
708 | hlist_add_head(&res->hlist, hhead); | 711 | hash_add(css_set_table, &res->hlist, key); |
709 | 712 | ||
710 | write_unlock(&css_set_lock); | 713 | write_unlock(&css_set_lock); |
711 | 714 | ||
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
856 | return inode; | 859 | return inode; |
857 | } | 860 | } |
858 | 861 | ||
859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 862 | static void cgroup_free_fn(struct work_struct *work) |
860 | { | 863 | { |
861 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 864 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); |
862 | if (S_ISDIR(inode->i_mode)) { | 865 | struct cgroup_subsys *ss; |
863 | struct cgroup *cgrp = dentry->d_fsdata; | ||
864 | struct cgroup_subsys *ss; | ||
865 | BUG_ON(!(cgroup_is_removed(cgrp))); | ||
866 | /* It's possible for external users to be holding css | ||
867 | * reference counts on a cgroup; css_put() needs to | ||
868 | * be able to access the cgroup after decrementing | ||
869 | * the reference count in order to know if it needs to | ||
870 | * queue the cgroup to be handled by the release | ||
871 | * agent */ | ||
872 | synchronize_rcu(); | ||
873 | 866 | ||
874 | mutex_lock(&cgroup_mutex); | 867 | mutex_lock(&cgroup_mutex); |
875 | /* | 868 | /* |
876 | * Release the subsystem state objects. | 869 | * Release the subsystem state objects. |
877 | */ | 870 | */ |
878 | for_each_subsys(cgrp->root, ss) | 871 | for_each_subsys(cgrp->root, ss) |
879 | ss->css_free(cgrp); | 872 | ss->css_free(cgrp); |
880 | 873 | ||
881 | cgrp->root->number_of_cgroups--; | 874 | cgrp->root->number_of_cgroups--; |
882 | mutex_unlock(&cgroup_mutex); | 875 | mutex_unlock(&cgroup_mutex); |
883 | 876 | ||
884 | /* | 877 | /* |
885 | * Drop the active superblock reference that we took when we | 878 | * Drop the active superblock reference that we took when we |
886 | * created the cgroup | 879 | * created the cgroup |
887 | */ | 880 | */ |
888 | deactivate_super(cgrp->root->sb); | 881 | deactivate_super(cgrp->root->sb); |
889 | 882 | ||
890 | /* | 883 | /* |
891 | * if we're getting rid of the cgroup, refcount should ensure | 884 | * if we're getting rid of the cgroup, refcount should ensure |
892 | * that there are no pidlists left. | 885 | * that there are no pidlists left. |
893 | */ | 886 | */ |
894 | BUG_ON(!list_empty(&cgrp->pidlists)); | 887 | BUG_ON(!list_empty(&cgrp->pidlists)); |
888 | |||
889 | simple_xattrs_free(&cgrp->xattrs); | ||
890 | |||
891 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
892 | kfree(cgrp); | ||
893 | } | ||
895 | 894 | ||
896 | simple_xattrs_free(&cgrp->xattrs); | 895 | static void cgroup_free_rcu(struct rcu_head *head) |
896 | { | ||
897 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | ||
898 | |||
899 | schedule_work(&cgrp->free_work); | ||
900 | } | ||
901 | |||
902 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | ||
903 | { | ||
904 | /* is dentry a directory ? if so, kfree() associated cgroup */ | ||
905 | if (S_ISDIR(inode->i_mode)) { | ||
906 | struct cgroup *cgrp = dentry->d_fsdata; | ||
897 | 907 | ||
898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | 908 | BUG_ON(!(cgroup_is_removed(cgrp))); |
899 | kfree_rcu(cgrp, rcu_head); | 909 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); |
900 | } else { | 910 | } else { |
901 | struct cfent *cfe = __d_cfe(dentry); | 911 | struct cfent *cfe = __d_cfe(dentry); |
902 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | 912 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; |
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d) | |||
925 | dput(parent); | 935 | dput(parent); |
926 | } | 936 | } |
927 | 937 | ||
928 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | 938 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
929 | { | 939 | { |
930 | struct cfent *cfe; | 940 | struct cfent *cfe; |
931 | 941 | ||
932 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | 942 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
933 | lockdep_assert_held(&cgroup_mutex); | 943 | lockdep_assert_held(&cgroup_mutex); |
934 | 944 | ||
945 | /* | ||
946 | * If we're doing cleanup due to failure of cgroup_create(), | ||
947 | * the corresponding @cfe may not exist. | ||
948 | */ | ||
935 | list_for_each_entry(cfe, &cgrp->files, node) { | 949 | list_for_each_entry(cfe, &cgrp->files, node) { |
936 | struct dentry *d = cfe->dentry; | 950 | struct dentry *d = cfe->dentry; |
937 | 951 | ||
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
944 | list_del_init(&cfe->node); | 958 | list_del_init(&cfe->node); |
945 | dput(d); | 959 | dput(d); |
946 | 960 | ||
947 | return 0; | 961 | break; |
948 | } | 962 | } |
949 | return -ENOENT; | ||
950 | } | 963 | } |
951 | 964 | ||
952 | /** | 965 | /** |
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1083 | } | 1096 | } |
1084 | } | 1097 | } |
1085 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; | 1098 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; |
1086 | synchronize_rcu(); | ||
1087 | 1099 | ||
1088 | return 0; | 1100 | return 0; |
1089 | } | 1101 | } |
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1393 | INIT_LIST_HEAD(&cgrp->allcg_node); | 1405 | INIT_LIST_HEAD(&cgrp->allcg_node); |
1394 | INIT_LIST_HEAD(&cgrp->release_list); | 1406 | INIT_LIST_HEAD(&cgrp->release_list); |
1395 | INIT_LIST_HEAD(&cgrp->pidlists); | 1407 | INIT_LIST_HEAD(&cgrp->pidlists); |
1408 | INIT_WORK(&cgrp->free_work, cgroup_free_fn); | ||
1396 | mutex_init(&cgrp->pidlist_mutex); | 1409 | mutex_init(&cgrp->pidlist_mutex); |
1397 | INIT_LIST_HEAD(&cgrp->event_list); | 1410 | INIT_LIST_HEAD(&cgrp->event_list); |
1398 | spin_lock_init(&cgrp->event_list_lock); | 1411 | spin_lock_init(&cgrp->event_list_lock); |
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1597 | struct cgroupfs_root *existing_root; | 1610 | struct cgroupfs_root *existing_root; |
1598 | const struct cred *cred; | 1611 | const struct cred *cred; |
1599 | int i; | 1612 | int i; |
1613 | struct css_set *cg; | ||
1600 | 1614 | ||
1601 | BUG_ON(sb->s_root != NULL); | 1615 | BUG_ON(sb->s_root != NULL); |
1602 | 1616 | ||
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1650 | /* Link the top cgroup in this hierarchy into all | 1664 | /* Link the top cgroup in this hierarchy into all |
1651 | * the css_set objects */ | 1665 | * the css_set objects */ |
1652 | write_lock(&css_set_lock); | 1666 | write_lock(&css_set_lock); |
1653 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 1667 | hash_for_each(css_set_table, i, cg, hlist) |
1654 | struct hlist_head *hhead = &css_set_table[i]; | 1668 | link_css_set(&tmp_cg_links, cg, root_cgrp); |
1655 | struct hlist_node *node; | ||
1656 | struct css_set *cg; | ||
1657 | |||
1658 | hlist_for_each_entry(cg, node, hhead, hlist) | ||
1659 | link_css_set(&tmp_cg_links, cg, root_cgrp); | ||
1660 | } | ||
1661 | write_unlock(&css_set_lock); | 1669 | write_unlock(&css_set_lock); |
1662 | 1670 | ||
1663 | free_cg_links(&tmp_cg_links); | 1671 | free_cg_links(&tmp_cg_links); |
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1773 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), | 1781 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
1774 | "cgroup_path() called without proper locking"); | 1782 | "cgroup_path() called without proper locking"); |
1775 | 1783 | ||
1776 | if (!dentry || cgrp == dummytop) { | 1784 | if (cgrp == dummytop) { |
1777 | /* | 1785 | /* |
1778 | * Inactive subsystems have no dentry for their root | 1786 | * Inactive subsystems have no dentry for their root |
1779 | * cgroup | 1787 | * cgroup |
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1982 | ss->attach(cgrp, &tset); | 1990 | ss->attach(cgrp, &tset); |
1983 | } | 1991 | } |
1984 | 1992 | ||
1985 | synchronize_rcu(); | ||
1986 | out: | 1993 | out: |
1987 | if (retval) { | 1994 | if (retval) { |
1988 | for_each_subsys(root, ss) { | 1995 | for_each_subsys(root, ss) { |
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2151 | /* | 2158 | /* |
2152 | * step 5: success! and cleanup | 2159 | * step 5: success! and cleanup |
2153 | */ | 2160 | */ |
2154 | synchronize_rcu(); | ||
2155 | retval = 0; | 2161 | retval = 0; |
2156 | out_put_css_set_refs: | 2162 | out_put_css_set_refs: |
2157 | if (retval) { | 2163 | if (retval) { |
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un | |||
2637 | */ | 2643 | */ |
2638 | static inline struct cftype *__file_cft(struct file *file) | 2644 | static inline struct cftype *__file_cft(struct file *file) |
2639 | { | 2645 | { |
2640 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) | 2646 | if (file_inode(file)->i_fop != &cgroup_file_operations) |
2641 | return ERR_PTR(-EINVAL); | 2647 | return ERR_PTR(-EINVAL); |
2642 | return __d_cft(file->f_dentry); | 2648 | return __d_cft(file->f_dentry); |
2643 | } | 2649 | } |
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2769 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2775 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) |
2770 | continue; | 2776 | continue; |
2771 | 2777 | ||
2772 | if (is_add) | 2778 | if (is_add) { |
2773 | err = cgroup_add_file(cgrp, subsys, cft); | 2779 | err = cgroup_add_file(cgrp, subsys, cft); |
2774 | else | 2780 | if (err) |
2775 | err = cgroup_rm_file(cgrp, cft); | 2781 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
2776 | if (err) { | 2782 | cft->name, err); |
2777 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2778 | is_add ? "add" : "remove", cft->name, err); | ||
2779 | ret = err; | 2783 | ret = err; |
2784 | } else { | ||
2785 | cgroup_rm_file(cgrp, cft); | ||
2780 | } | 2786 | } |
2781 | } | 2787 | } |
2782 | return ret; | 2788 | return ret; |
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
3017 | } | 3023 | } |
3018 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3024 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); |
3019 | 3025 | ||
3026 | /** | ||
3027 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | ||
3028 | * @pos: cgroup of interest | ||
3029 | * | ||
3030 | * Return the rightmost descendant of @pos. If there's no descendant, | ||
3031 | * @pos is returned. This can be used during pre-order traversal to skip | ||
3032 | * subtree of @pos. | ||
3033 | */ | ||
3034 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | ||
3035 | { | ||
3036 | struct cgroup *last, *tmp; | ||
3037 | |||
3038 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3039 | |||
3040 | do { | ||
3041 | last = pos; | ||
3042 | /* ->prev isn't RCU safe, walk ->next till the end */ | ||
3043 | pos = NULL; | ||
3044 | list_for_each_entry_rcu(tmp, &last->children, sibling) | ||
3045 | pos = tmp; | ||
3046 | } while (pos); | ||
3047 | |||
3048 | return last; | ||
3049 | } | ||
3050 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | ||
3051 | |||
3020 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3052 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) |
3021 | { | 3053 | { |
3022 | struct cgroup *last; | 3054 | struct cgroup *last; |
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3752 | remove); | 3784 | remove); |
3753 | struct cgroup *cgrp = event->cgrp; | 3785 | struct cgroup *cgrp = event->cgrp; |
3754 | 3786 | ||
3787 | remove_wait_queue(event->wqh, &event->wait); | ||
3788 | |||
3755 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3789 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); |
3756 | 3790 | ||
3791 | /* Notify userspace the event is going away. */ | ||
3792 | eventfd_signal(event->eventfd, 1); | ||
3793 | |||
3757 | eventfd_ctx_put(event->eventfd); | 3794 | eventfd_ctx_put(event->eventfd); |
3758 | kfree(event); | 3795 | kfree(event); |
3759 | dput(cgrp->dentry); | 3796 | dput(cgrp->dentry); |
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3773 | unsigned long flags = (unsigned long)key; | 3810 | unsigned long flags = (unsigned long)key; |
3774 | 3811 | ||
3775 | if (flags & POLLHUP) { | 3812 | if (flags & POLLHUP) { |
3776 | __remove_wait_queue(event->wqh, &event->wait); | ||
3777 | spin_lock(&cgrp->event_list_lock); | ||
3778 | list_del_init(&event->list); | ||
3779 | spin_unlock(&cgrp->event_list_lock); | ||
3780 | /* | 3813 | /* |
3781 | * We are in atomic context, but cgroup_event_remove() may | 3814 | * If the event has been detached at cgroup removal, we |
3782 | * sleep, so we have to call it in workqueue. | 3815 | * can simply return knowing the other side will cleanup |
3816 | * for us. | ||
3817 | * | ||
3818 | * We can't race against event freeing since the other | ||
3819 | * side will require wqh->lock via remove_wait_queue(), | ||
3820 | * which we hold. | ||
3783 | */ | 3821 | */ |
3784 | schedule_work(&event->remove); | 3822 | spin_lock(&cgrp->event_list_lock); |
3823 | if (!list_empty(&event->list)) { | ||
3824 | list_del_init(&event->list); | ||
3825 | /* | ||
3826 | * We are in atomic context, but cgroup_event_remove() | ||
3827 | * may sleep, so we have to call it in workqueue. | ||
3828 | */ | ||
3829 | schedule_work(&event->remove); | ||
3830 | } | ||
3831 | spin_unlock(&cgrp->event_list_lock); | ||
3785 | } | 3832 | } |
3786 | 3833 | ||
3787 | return 0; | 3834 | return 0; |
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3807 | const char *buffer) | 3854 | const char *buffer) |
3808 | { | 3855 | { |
3809 | struct cgroup_event *event = NULL; | 3856 | struct cgroup_event *event = NULL; |
3857 | struct cgroup *cgrp_cfile; | ||
3810 | unsigned int efd, cfd; | 3858 | unsigned int efd, cfd; |
3811 | struct file *efile = NULL; | 3859 | struct file *efile = NULL; |
3812 | struct file *cfile = NULL; | 3860 | struct file *cfile = NULL; |
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3852 | 3900 | ||
3853 | /* the process need read permission on control file */ | 3901 | /* the process need read permission on control file */ |
3854 | /* AV: shouldn't we check that it's been opened for read instead? */ | 3902 | /* AV: shouldn't we check that it's been opened for read instead? */ |
3855 | ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); | 3903 | ret = inode_permission(file_inode(cfile), MAY_READ); |
3856 | if (ret < 0) | 3904 | if (ret < 0) |
3857 | goto fail; | 3905 | goto fail; |
3858 | 3906 | ||
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3862 | goto fail; | 3910 | goto fail; |
3863 | } | 3911 | } |
3864 | 3912 | ||
3913 | /* | ||
3914 | * The file to be monitored must be in the same cgroup as | ||
3915 | * cgroup.event_control is. | ||
3916 | */ | ||
3917 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | ||
3918 | if (cgrp_cfile != cgrp) { | ||
3919 | ret = -EINVAL; | ||
3920 | goto fail; | ||
3921 | } | ||
3922 | |||
3865 | if (!event->cft->register_event || !event->cft->unregister_event) { | 3923 | if (!event->cft->register_event || !event->cft->unregister_event) { |
3866 | ret = -EINVAL; | 3924 | ret = -EINVAL; |
3867 | goto fail; | 3925 | goto fail; |
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4135 | 4193 | ||
4136 | init_cgroup_housekeeping(cgrp); | 4194 | init_cgroup_housekeeping(cgrp); |
4137 | 4195 | ||
4196 | dentry->d_fsdata = cgrp; | ||
4197 | cgrp->dentry = dentry; | ||
4198 | |||
4138 | cgrp->parent = parent; | 4199 | cgrp->parent = parent; |
4139 | cgrp->root = parent->root; | 4200 | cgrp->root = parent->root; |
4140 | cgrp->top_cgroup = parent->top_cgroup; | 4201 | cgrp->top_cgroup = parent->top_cgroup; |
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4172 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4233 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
4173 | 4234 | ||
4174 | /* allocation complete, commit to creation */ | 4235 | /* allocation complete, commit to creation */ |
4175 | dentry->d_fsdata = cgrp; | ||
4176 | cgrp->dentry = dentry; | ||
4177 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 4236 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); |
4178 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4237 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4179 | root->number_of_cgroups++; | 4238 | root->number_of_cgroups++; |
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4340 | /* | 4399 | /* |
4341 | * Unregister events and notify userspace. | 4400 | * Unregister events and notify userspace. |
4342 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4401 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4343 | * directory to avoid race between userspace and kernelspace. Use | 4402 | * directory to avoid race between userspace and kernelspace. |
4344 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4345 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4346 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4347 | */ | 4403 | */ |
4348 | spin_lock(&cgrp->event_list_lock); | 4404 | spin_lock(&cgrp->event_list_lock); |
4349 | list_splice_init(&cgrp->event_list, &tmp_list); | 4405 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { |
4350 | spin_unlock(&cgrp->event_list_lock); | ||
4351 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4352 | list_del_init(&event->list); | 4406 | list_del_init(&event->list); |
4353 | remove_wait_queue(event->wqh, &event->wait); | ||
4354 | eventfd_signal(event->eventfd, 1); | ||
4355 | schedule_work(&event->remove); | 4407 | schedule_work(&event->remove); |
4356 | } | 4408 | } |
4409 | spin_unlock(&cgrp->event_list_lock); | ||
4357 | 4410 | ||
4358 | return 0; | 4411 | return 0; |
4359 | } | 4412 | } |
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4438 | { | 4491 | { |
4439 | struct cgroup_subsys_state *css; | 4492 | struct cgroup_subsys_state *css; |
4440 | int i, ret; | 4493 | int i, ret; |
4494 | struct hlist_node *tmp; | ||
4495 | struct css_set *cg; | ||
4496 | unsigned long key; | ||
4441 | 4497 | ||
4442 | /* check name and function validity */ | 4498 | /* check name and function validity */ |
4443 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4499 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4503 | * this is all done under the css_set_lock. | 4559 | * this is all done under the css_set_lock. |
4504 | */ | 4560 | */ |
4505 | write_lock(&css_set_lock); | 4561 | write_lock(&css_set_lock); |
4506 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 4562 | hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { |
4507 | struct css_set *cg; | 4563 | /* skip entries that we already rehashed */ |
4508 | struct hlist_node *node, *tmp; | 4564 | if (cg->subsys[ss->subsys_id]) |
4509 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; | 4565 | continue; |
4510 | 4566 | /* remove existing entry */ | |
4511 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { | 4567 | hash_del(&cg->hlist); |
4512 | /* skip entries that we already rehashed */ | 4568 | /* set new value */ |
4513 | if (cg->subsys[ss->subsys_id]) | 4569 | cg->subsys[ss->subsys_id] = css; |
4514 | continue; | 4570 | /* recompute hash and restore entry */ |
4515 | /* remove existing entry */ | 4571 | key = css_set_hash(cg->subsys); |
4516 | hlist_del(&cg->hlist); | 4572 | hash_add(css_set_table, &cg->hlist, key); |
4517 | /* set new value */ | ||
4518 | cg->subsys[ss->subsys_id] = css; | ||
4519 | /* recompute hash and restore entry */ | ||
4520 | new_bucket = css_set_hash(cg->subsys); | ||
4521 | hlist_add_head(&cg->hlist, new_bucket); | ||
4522 | } | ||
4523 | } | 4573 | } |
4524 | write_unlock(&css_set_lock); | 4574 | write_unlock(&css_set_lock); |
4525 | 4575 | ||
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
4551 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4601 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4552 | { | 4602 | { |
4553 | struct cg_cgroup_link *link; | 4603 | struct cg_cgroup_link *link; |
4554 | struct hlist_head *hhead; | ||
4555 | 4604 | ||
4556 | BUG_ON(ss->module == NULL); | 4605 | BUG_ON(ss->module == NULL); |
4557 | 4606 | ||
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4567 | offline_css(ss, dummytop); | 4616 | offline_css(ss, dummytop); |
4568 | ss->active = 0; | 4617 | ss->active = 0; |
4569 | 4618 | ||
4570 | if (ss->use_id) { | 4619 | if (ss->use_id) |
4571 | idr_remove_all(&ss->idr); | ||
4572 | idr_destroy(&ss->idr); | 4620 | idr_destroy(&ss->idr); |
4573 | } | ||
4574 | 4621 | ||
4575 | /* deassign the subsys_id */ | 4622 | /* deassign the subsys_id */ |
4576 | subsys[ss->subsys_id] = NULL; | 4623 | subsys[ss->subsys_id] = NULL; |
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4585 | write_lock(&css_set_lock); | 4632 | write_lock(&css_set_lock); |
4586 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | 4633 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { |
4587 | struct css_set *cg = link->cg; | 4634 | struct css_set *cg = link->cg; |
4635 | unsigned long key; | ||
4588 | 4636 | ||
4589 | hlist_del(&cg->hlist); | 4637 | hash_del(&cg->hlist); |
4590 | cg->subsys[ss->subsys_id] = NULL; | 4638 | cg->subsys[ss->subsys_id] = NULL; |
4591 | hhead = css_set_hash(cg->subsys); | 4639 | key = css_set_hash(cg->subsys); |
4592 | hlist_add_head(&cg->hlist, hhead); | 4640 | hash_add(css_set_table, &cg->hlist, key); |
4593 | } | 4641 | } |
4594 | write_unlock(&css_set_lock); | 4642 | write_unlock(&css_set_lock); |
4595 | 4643 | ||
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void) | |||
4631 | list_add(&init_css_set_link.cg_link_list, | 4679 | list_add(&init_css_set_link.cg_link_list, |
4632 | &init_css_set.cg_links); | 4680 | &init_css_set.cg_links); |
4633 | 4681 | ||
4634 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | ||
4635 | INIT_HLIST_HEAD(&css_set_table[i]); | ||
4636 | |||
4637 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4682 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
4638 | struct cgroup_subsys *ss = subsys[i]; | 4683 | struct cgroup_subsys *ss = subsys[i]; |
4639 | 4684 | ||
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void) | |||
4667 | { | 4712 | { |
4668 | int err; | 4713 | int err; |
4669 | int i; | 4714 | int i; |
4670 | struct hlist_head *hhead; | 4715 | unsigned long key; |
4671 | 4716 | ||
4672 | err = bdi_init(&cgroup_backing_dev_info); | 4717 | err = bdi_init(&cgroup_backing_dev_info); |
4673 | if (err) | 4718 | if (err) |
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void) | |||
4686 | } | 4731 | } |
4687 | 4732 | ||
4688 | /* Add init_css_set to the hash table */ | 4733 | /* Add init_css_set to the hash table */ |
4689 | hhead = css_set_hash(init_css_set.subsys); | 4734 | key = css_set_hash(init_css_set.subsys); |
4690 | hlist_add_head(&init_css_set.hlist, hhead); | 4735 | hash_add(css_set_table, &init_css_set.hlist, key); |
4691 | BUG_ON(!init_root_id(&rootnode)); | 4736 | BUG_ON(!init_root_id(&rootnode)); |
4692 | 4737 | ||
4693 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4738 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4982 | } | 5027 | } |
4983 | task_unlock(tsk); | 5028 | task_unlock(tsk); |
4984 | 5029 | ||
4985 | if (cg) | 5030 | put_css_set_taskexit(cg); |
4986 | put_css_set_taskexit(cg); | ||
4987 | } | 5031 | } |
4988 | 5032 | ||
4989 | /** | 5033 | /** |
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id); | |||
5274 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | 5318 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) |
5275 | { | 5319 | { |
5276 | struct css_id *newid; | 5320 | struct css_id *newid; |
5277 | int myid, error, size; | 5321 | int ret, size; |
5278 | 5322 | ||
5279 | BUG_ON(!ss->use_id); | 5323 | BUG_ON(!ss->use_id); |
5280 | 5324 | ||
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
5282 | newid = kzalloc(size, GFP_KERNEL); | 5326 | newid = kzalloc(size, GFP_KERNEL); |
5283 | if (!newid) | 5327 | if (!newid) |
5284 | return ERR_PTR(-ENOMEM); | 5328 | return ERR_PTR(-ENOMEM); |
5285 | /* get id */ | 5329 | |
5286 | if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { | 5330 | idr_preload(GFP_KERNEL); |
5287 | error = -ENOMEM; | ||
5288 | goto err_out; | ||
5289 | } | ||
5290 | spin_lock(&ss->id_lock); | 5331 | spin_lock(&ss->id_lock); |
5291 | /* Don't use 0. allocates an ID of 1-65535 */ | 5332 | /* Don't use 0. allocates an ID of 1-65535 */ |
5292 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 5333 | ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); |
5293 | spin_unlock(&ss->id_lock); | 5334 | spin_unlock(&ss->id_lock); |
5335 | idr_preload_end(); | ||
5294 | 5336 | ||
5295 | /* Returns error when there are no free spaces for new ID.*/ | 5337 | /* Returns error when there are no free spaces for new ID.*/ |
5296 | if (error) { | 5338 | if (ret < 0) |
5297 | error = -ENOSPC; | ||
5298 | goto err_out; | 5339 | goto err_out; |
5299 | } | ||
5300 | if (myid > CSS_ID_MAX) | ||
5301 | goto remove_idr; | ||
5302 | 5340 | ||
5303 | newid->id = myid; | 5341 | newid->id = ret; |
5304 | newid->depth = depth; | 5342 | newid->depth = depth; |
5305 | return newid; | 5343 | return newid; |
5306 | remove_idr: | ||
5307 | error = -ENOSPC; | ||
5308 | spin_lock(&ss->id_lock); | ||
5309 | idr_remove(&ss->idr, myid); | ||
5310 | spin_unlock(&ss->id_lock); | ||
5311 | err_out: | 5344 | err_out: |
5312 | kfree(newid); | 5345 | kfree(newid); |
5313 | return ERR_PTR(error); | 5346 | return ERR_PTR(ret); |
5314 | 5347 | ||
5315 | } | 5348 | } |
5316 | 5349 | ||
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5441 | struct inode *inode; | 5474 | struct inode *inode; |
5442 | struct cgroup_subsys_state *css; | 5475 | struct cgroup_subsys_state *css; |
5443 | 5476 | ||
5444 | inode = f->f_dentry->d_inode; | 5477 | inode = file_inode(f); |
5445 | /* check in cgroup filesystem dir */ | 5478 | /* check in cgroup filesystem dir */ |
5446 | if (inode->i_op != &cgroup_dir_inode_operations) | 5479 | if (inode->i_op != &cgroup_dir_inode_operations) |
5447 | return ERR_PTR(-EBADF); | 5480 | return ERR_PTR(-EBADF); |
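Much of the cgroup.c diff above converts the open-coded css_set_table hlist array to the generic <linux/hashtable.h> API (DEFINE_HASHTABLE, hash_add, hash_for_each_possible, hash_del). A self-contained sketch of that API follows, with hypothetical names; it is illustration only, not code from this commit.

/*
 * Self-contained sketch (hypothetical example, not from this commit) of the
 * <linux/hashtable.h> API that css_set_table is converted to above.
 */
#include <linux/hashtable.h>

struct example_obj {
	unsigned long key;
	struct hlist_node node;
};

/* 2^7 = 128 buckets, mirroring CSS_SET_HASH_BITS */
static DEFINE_HASHTABLE(example_table, 7);

static void example_insert(struct example_obj *obj)
{
	/* hash_add() hashes the key internally and links into a bucket */
	hash_add(example_table, &obj->node, obj->key);
}

static struct example_obj *example_find(unsigned long key)
{
	struct example_obj *obj;

	/* walks only the bucket that @key hashes to */
	hash_for_each_possible(example_table, obj, node, key)
		if (obj->key == key)
			return obj;
	return NULL;
}

static void example_remove(struct example_obj *obj)
{
	hash_del(&obj->node);
}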
diff --git a/kernel/compat.c b/kernel/compat.c
index 36700e9e2be9..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, | |||
290 | __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); | 290 | __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); |
291 | } | 291 | } |
292 | 292 | ||
293 | asmlinkage long compat_sys_getitimer(int which, | 293 | COMPAT_SYSCALL_DEFINE2(getitimer, int, which, |
294 | struct compat_itimerval __user *it) | 294 | struct compat_itimerval __user *, it) |
295 | { | 295 | { |
296 | struct itimerval kit; | 296 | struct itimerval kit; |
297 | int error; | 297 | int error; |
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which, | |||
302 | return error; | 302 | return error; |
303 | } | 303 | } |
304 | 304 | ||
305 | asmlinkage long compat_sys_setitimer(int which, | 305 | COMPAT_SYSCALL_DEFINE3(setitimer, int, which, |
306 | struct compat_itimerval __user *in, | 306 | struct compat_itimerval __user *, in, |
307 | struct compat_itimerval __user *out) | 307 | struct compat_itimerval __user *, out) |
308 | { | 308 | { |
309 | struct itimerval kin, kout; | 309 | struct itimerval kin, kout; |
310 | int error; | 310 | int error; |
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) | |||
381 | memcpy(blocked->sig, &set, sizeof(set)); | 381 | memcpy(blocked->sig, &set, sizeof(set)); |
382 | } | 382 | } |
383 | 383 | ||
384 | asmlinkage long compat_sys_sigprocmask(int how, | 384 | COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, |
385 | compat_old_sigset_t __user *nset, | 385 | compat_old_sigset_t __user *, nset, |
386 | compat_old_sigset_t __user *oset) | 386 | compat_old_sigset_t __user *, oset) |
387 | { | 387 | { |
388 | old_sigset_t old_set, new_set; | 388 | old_sigset_t old_set, new_set; |
389 | sigset_t new_blocked; | 389 | sigset_t new_blocked; |
@@ -593,7 +593,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, | |||
593 | else | 593 | else |
594 | ret = put_compat_rusage(&ru, uru); | 594 | ret = put_compat_rusage(&ru, uru); |
595 | if (ret) | 595 | if (ret) |
596 | return ret; | 596 | return -EFAULT; |
597 | } | 597 | } |
598 | 598 | ||
599 | BUG_ON(info.si_code & __SI_MASK); | 599 | BUG_ON(info.si_code & __SI_MASK); |
@@ -971,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, | |||
971 | } | 971 | } |
972 | 972 | ||
973 | void | 973 | void |
974 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | 974 | sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) |
975 | { | 975 | { |
976 | switch (_NSIG_WORDS) { | 976 | switch (_NSIG_WORDS) { |
977 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); | 977 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); |
@@ -982,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | |||
982 | } | 982 | } |
983 | EXPORT_SYMBOL_GPL(sigset_from_compat); | 983 | EXPORT_SYMBOL_GPL(sigset_from_compat); |
984 | 984 | ||
985 | asmlinkage long | 985 | void |
986 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | 986 | sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) |
987 | struct compat_siginfo __user *uinfo, | 987 | { |
988 | struct compat_timespec __user *uts, compat_size_t sigsetsize) | 988 | switch (_NSIG_WORDS) { |
989 | case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; | ||
990 | case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; | ||
991 | case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; | ||
992 | case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, | ||
997 | struct compat_siginfo __user *, uinfo, | ||
998 | struct compat_timespec __user *, uts, compat_size_t, sigsetsize) | ||
989 | { | 999 | { |
990 | compat_sigset_t s32; | 1000 | compat_sigset_t s32; |
991 | sigset_t s; | 1001 | sigset_t s; |
@@ -1013,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
1013 | } | 1023 | } |
1014 | 1024 | ||
1015 | return ret; | 1025 | return ret; |
1016 | |||
1017 | } | ||
1018 | |||
1019 | asmlinkage long | ||
1020 | compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, | ||
1021 | struct compat_siginfo __user *uinfo) | ||
1022 | { | ||
1023 | siginfo_t info; | ||
1024 | |||
1025 | if (copy_siginfo_from_user32(&info, uinfo)) | ||
1026 | return -EFAULT; | ||
1027 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); | ||
1028 | } | 1026 | } |
1029 | 1027 | ||
1030 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME | 1028 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME |
@@ -1067,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) | |||
1067 | 1065 | ||
1068 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ | 1066 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ |
1069 | 1067 | ||
1070 | #ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND | ||
1071 | asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) | ||
1072 | { | ||
1073 | sigset_t newset; | ||
1074 | compat_sigset_t newset32; | ||
1075 | |||
1076 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
1077 | if (sigsetsize != sizeof(sigset_t)) | ||
1078 | return -EINVAL; | ||
1079 | |||
1080 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | ||
1081 | return -EFAULT; | ||
1082 | sigset_from_compat(&newset, &newset32); | ||
1083 | return sigsuspend(&newset); | ||
1084 | } | ||
1085 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | ||
1086 | |||
1087 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | 1068 | asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) |
1088 | { | 1069 | { |
1089 | struct timex txc; | 1070 | struct timex txc; |
@@ -1222,9 +1203,9 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
1222 | return 0; | 1203 | return 0; |
1223 | } | 1204 | } |
1224 | 1205 | ||
1225 | #ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL | 1206 | COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, |
1226 | asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | 1207 | compat_pid_t, pid, |
1227 | struct compat_timespec __user *interval) | 1208 | struct compat_timespec __user *, interval) |
1228 | { | 1209 | { |
1229 | struct timespec t; | 1210 | struct timespec t; |
1230 | int ret; | 1211 | int ret; |
@@ -1237,7 +1218,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, | |||
1237 | return -EFAULT; | 1218 | return -EFAULT; |
1238 | return ret; | 1219 | return ret; |
1239 | } | 1220 | } |
1240 | #endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ | ||
1241 | 1221 | ||
1242 | /* | 1222 | /* |
1243 | * Allocate user-space memory for the duration of a single system call, | 1223 | * Allocate user-space memory for the duration of a single system call, |
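The compat.c hunks above convert open-coded asmlinkage compat_sys_*() definitions to the COMPAT_SYSCALL_DEFINEn() wrappers. A sketch of the pattern with a hypothetical syscall follows; it is not part of this commit. The macro generates the compat_sys_<name>() entry point and, on architectures that need it (e.g. s390), sanitises 32-bit pointer arguments before they reach the body.

/*
 * Illustrative sketch (hypothetical syscall, not from this commit) of the
 * COMPAT_SYSCALL_DEFINEn() wrappers the hunks above convert to.
 */
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

COMPAT_SYSCALL_DEFINE2(example_gettimeval, int, which,
		       struct compat_timeval __user *, tv)
{
	struct compat_timeval ktv = { .tv_sec = 0, .tv_usec = 0 };

	if (which != 0)
		return -EINVAL;

	/* copy the 32-bit layout back to userspace */
	if (copy_to_user(tv, &ktv, sizeof(ktv)))
		return -EFAULT;
	return 0;
}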
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@ | |||
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Workqueue for cpuset related tasks. | ||
65 | * | ||
66 | * Using kevent workqueue may cause deadlock when memory_migrate | ||
67 | * is set. So we create a separate workqueue thread for cpuset. | ||
68 | */ | ||
69 | static struct workqueue_struct *cpuset_wq; | ||
70 | |||
71 | /* | ||
72 | * Tracks how many cpusets are currently defined in system. | 64 | * Tracks how many cpusets are currently defined in system. |
73 | * When there is only one cpuset (the root cpuset) we can | 65 | * When there is only one cpuset (the root cpuset) we can |
74 | * short circuit some hooks. | 66 | * short circuit some hooks. |
@@ -95,18 +87,21 @@ struct cpuset { | |||
95 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 87 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
96 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
97 | 89 | ||
98 | struct cpuset *parent; /* my parent */ | ||
99 | |||
100 | struct fmeter fmeter; /* memory_pressure filter */ | 90 | struct fmeter fmeter; /* memory_pressure filter */ |
101 | 91 | ||
92 | /* | ||
93 | * Tasks are being attached to this cpuset. Used to prevent | ||
94 | * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). | ||
95 | */ | ||
96 | int attach_in_progress; | ||
97 | |||
102 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
103 | int pn; | 99 | int pn; |
104 | 100 | ||
105 | /* for custom sched domain */ | 101 | /* for custom sched domain */ |
106 | int relax_domain_level; | 102 | int relax_domain_level; |
107 | 103 | ||
108 | /* used for walking a cpuset hierarchy */ | 104 | struct work_struct hotplug_work; |
109 | struct list_head stack_list; | ||
110 | }; | 105 | }; |
111 | 106 | ||
112 | /* Retrieve the cpuset for a cgroup */ | 107 | /* Retrieve the cpuset for a cgroup */ |
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
123 | struct cpuset, css); | 118 | struct cpuset, css); |
124 | } | 119 | } |
125 | 120 | ||
121 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | ||
122 | { | ||
123 | struct cgroup *pcgrp = cs->css.cgroup->parent; | ||
124 | |||
125 | if (pcgrp) | ||
126 | return cgroup_cs(pcgrp); | ||
127 | return NULL; | ||
128 | } | ||
129 | |||
126 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
127 | static inline bool task_has_mempolicy(struct task_struct *task) | 131 | static inline bool task_has_mempolicy(struct task_struct *task) |
128 | { | 132 | { |
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) | |||
138 | 142 | ||
139 | /* bits in struct cpuset flags field */ | 143 | /* bits in struct cpuset flags field */ |
140 | typedef enum { | 144 | typedef enum { |
145 | CS_ONLINE, | ||
141 | CS_CPU_EXCLUSIVE, | 146 | CS_CPU_EXCLUSIVE, |
142 | CS_MEM_EXCLUSIVE, | 147 | CS_MEM_EXCLUSIVE, |
143 | CS_MEM_HARDWALL, | 148 | CS_MEM_HARDWALL, |
@@ -147,13 +152,12 @@ typedef enum { | |||
147 | CS_SPREAD_SLAB, | 152 | CS_SPREAD_SLAB, |
148 | } cpuset_flagbits_t; | 153 | } cpuset_flagbits_t; |
149 | 154 | ||
150 | /* the type of hotplug event */ | ||
151 | enum hotplug_event { | ||
152 | CPUSET_CPU_OFFLINE, | ||
153 | CPUSET_MEM_OFFLINE, | ||
154 | }; | ||
155 | |||
156 | /* convenient tests for these bits */ | 155 | /* convenient tests for these bits */ |
156 | static inline bool is_cpuset_online(const struct cpuset *cs) | ||
157 | { | ||
158 | return test_bit(CS_ONLINE, &cs->flags); | ||
159 | } | ||
160 | |||
157 | static inline int is_cpu_exclusive(const struct cpuset *cs) | 161 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
158 | { | 162 | { |
159 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); | 163 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); |
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
190 | } | 194 | } |
191 | 195 | ||
192 | static struct cpuset top_cpuset = { | 196 | static struct cpuset top_cpuset = { |
193 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 197 | .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | |
198 | (1 << CS_MEM_EXCLUSIVE)), | ||
194 | }; | 199 | }; |
195 | 200 | ||
201 | /** | ||
202 | * cpuset_for_each_child - traverse online children of a cpuset | ||
203 | * @child_cs: loop cursor pointing to the current child | ||
204 | * @pos_cgrp: used for iteration | ||
205 | * @parent_cs: target cpuset to walk children of | ||
206 | * | ||
207 | * Walk @child_cs through the online children of @parent_cs. Must be used | ||
208 | * with RCU read locked. | ||
209 | */ | ||
210 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | ||
211 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | ||
212 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | ||
213 | |||
214 | /** | ||
215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | ||
216 | * @des_cs: loop cursor pointing to the current descendant | ||
217 | * @pos_cgrp: used for iteration | ||
218 | * @root_cs: target cpuset to walk ancestor of | ||
219 | * | ||
220 | * Walk @des_cs through the online descendants of @root_cs. Must be used | ||
221 | * with RCU read locked. The caller may modify @pos_cgrp by calling | ||
222 | * cgroup_rightmost_descendant() to skip subtree. | ||
223 | */ | ||
224 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | ||
225 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | ||
226 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | ||
227 | |||
196 | /* | 228 | /* |
197 | * There are two global mutexes guarding cpuset structures. The first | 229 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
198 | * is the main control groups cgroup_mutex, accessed via | 230 | * and callback_mutex. The latter may nest inside the former. We also |
199 | * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific | 231 | * require taking task_lock() when dereferencing a task's cpuset pointer. |
200 | * callback_mutex, below. They can nest. It is ok to first take | 232 | * See "The task_lock() exception", at the end of this comment. |
201 | * cgroup_mutex, then nest callback_mutex. We also require taking | 233 | * |
202 | * task_lock() when dereferencing a task's cpuset pointer. See "The | 234 | * A task must hold both mutexes to modify cpusets. If a task holds |
203 | * task_lock() exception", at the end of this comment. | 235 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it |
204 | * | 236 | * is the only task able to also acquire callback_mutex and be able to |
205 | * A task must hold both mutexes to modify cpusets. If a task | 237 | * modify cpusets. It can perform various checks on the cpuset structure |
206 | * holds cgroup_mutex, then it blocks others wanting that mutex, | 238 | * first, knowing nothing will change. It can also allocate memory while |
207 | * ensuring that it is the only task able to also acquire callback_mutex | 239 | * just holding cpuset_mutex. While it is performing these checks, various |
208 | * and be able to modify cpusets. It can perform various checks on | 240 | * callback routines can briefly acquire callback_mutex to query cpusets. |
209 | * the cpuset structure first, knowing nothing will change. It can | 241 | * Once it is ready to make the changes, it takes callback_mutex, blocking |
210 | * also allocate memory while just holding cgroup_mutex. While it is | 242 | * everyone else. |
211 | * performing these checks, various callback routines can briefly | ||
212 | * acquire callback_mutex to query cpusets. Once it is ready to make | ||
213 | * the changes, it takes callback_mutex, blocking everyone else. | ||
214 | * | 243 | * |
215 | * Calls to the kernel memory allocator can not be made while holding | 244 | * Calls to the kernel memory allocator can not be made while holding |
216 | * callback_mutex, as that would risk double tripping on callback_mutex | 245 | * callback_mutex, as that would risk double tripping on callback_mutex |
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = { | |||
232 | * guidelines for accessing subsystem state in kernel/cgroup.c | 261 | * guidelines for accessing subsystem state in kernel/cgroup.c |
233 | */ | 262 | */ |
234 | 263 | ||
264 | static DEFINE_MUTEX(cpuset_mutex); | ||
235 | static DEFINE_MUTEX(callback_mutex); | 265 | static DEFINE_MUTEX(callback_mutex); |
236 | 266 | ||
237 | /* | 267 | /* |
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | |||
246 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | 276 | static DEFINE_SPINLOCK(cpuset_buffer_lock); |
247 | 277 | ||
248 | /* | 278 | /* |
279 | * CPU / memory hotplug is handled asynchronously. | ||
280 | */ | ||
281 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; | ||
282 | |||
283 | static void cpuset_hotplug_workfn(struct work_struct *work); | ||
284 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work); | ||
285 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); | ||
286 | |||
287 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); | ||
288 | |||
289 | /* | ||
249 | * This is ugly, but preserves the userspace API for existing cpuset | 290 | * This is ugly, but preserves the userspace API for existing cpuset |
250 | * users. If someone tries to mount the "cpuset" filesystem, we | 291 | * users. If someone tries to mount the "cpuset" filesystem, we |
251 | * silently switch it to mount "cgroup" instead | 292 | * silently switch it to mount "cgroup" instead |
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
289 | struct cpumask *pmask) | 330 | struct cpumask *pmask) |
290 | { | 331 | { |
291 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 332 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
292 | cs = cs->parent; | 333 | cs = parent_cs(cs); |
293 | if (cs) | 334 | if (cs) |
294 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); | 335 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); |
295 | else | 336 | else |
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
314 | { | 355 | { |
315 | while (cs && !nodes_intersects(cs->mems_allowed, | 356 | while (cs && !nodes_intersects(cs->mems_allowed, |
316 | node_states[N_MEMORY])) | 357 | node_states[N_MEMORY])) |
317 | cs = cs->parent; | 358 | cs = parent_cs(cs); |
318 | if (cs) | 359 | if (cs) |
319 | nodes_and(*pmask, cs->mems_allowed, | 360 | nodes_and(*pmask, cs->mems_allowed, |
320 | node_states[N_MEMORY]); | 361 | node_states[N_MEMORY]); |
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
326 | /* | 367 | /* |
327 | * update task's spread flag if cpuset's page/slab spread flag is set | 368 | * update task's spread flag if cpuset's page/slab spread flag is set |
328 | * | 369 | * |
329 | * Called with callback_mutex/cgroup_mutex held | 370 | * Called with callback_mutex/cpuset_mutex held |
330 | */ | 371 | */ |
331 | static void cpuset_update_task_spread_flag(struct cpuset *cs, | 372 | static void cpuset_update_task_spread_flag(struct cpuset *cs, |
332 | struct task_struct *tsk) | 373 | struct task_struct *tsk) |
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, | |||
346 | * | 387 | * |
347 | * One cpuset is a subset of another if all its allowed CPUs and | 388 | * One cpuset is a subset of another if all its allowed CPUs and |
348 | * Memory Nodes are a subset of the other, and its exclusive flags | 389 | * Memory Nodes are a subset of the other, and its exclusive flags |
349 | * are only set if the other's are set. Call holding cgroup_mutex. | 390 | * are only set if the other's are set. Call holding cpuset_mutex. |
350 | */ | 391 | */ |
351 | 392 | ||
352 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 393 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
395 | * If we replaced the flag and mask values of the current cpuset | 436 | * If we replaced the flag and mask values of the current cpuset |
396 | * (cur) with those values in the trial cpuset (trial), would | 437 | * (cur) with those values in the trial cpuset (trial), would |
397 | * our various subset and exclusive rules still be valid? Presumes | 438 | * our various subset and exclusive rules still be valid? Presumes |
398 | * cgroup_mutex held. | 439 | * cpuset_mutex held. |
399 | * | 440 | * |
400 | * 'cur' is the address of an actual, in-use cpuset. Operations | 441 | * 'cur' is the address of an actual, in-use cpuset. Operations |
401 | * such as list traversal that depend on the actual address of the | 442 | * such as list traversal that depend on the actual address of the |
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
412 | { | 453 | { |
413 | struct cgroup *cont; | 454 | struct cgroup *cont; |
414 | struct cpuset *c, *par; | 455 | struct cpuset *c, *par; |
456 | int ret; | ||
457 | |||
458 | rcu_read_lock(); | ||
415 | 459 | ||
416 | /* Each of our child cpusets must be a subset of us */ | 460 | /* Each of our child cpusets must be a subset of us */ |
417 | list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { | 461 | ret = -EBUSY; |
418 | if (!is_cpuset_subset(cgroup_cs(cont), trial)) | 462 | cpuset_for_each_child(c, cont, cur) |
419 | return -EBUSY; | 463 | if (!is_cpuset_subset(c, trial)) |
420 | } | 464 | goto out; |
421 | 465 | ||
422 | /* Remaining checks don't apply to root cpuset */ | 466 | /* Remaining checks don't apply to root cpuset */ |
467 | ret = 0; | ||
423 | if (cur == &top_cpuset) | 468 | if (cur == &top_cpuset) |
424 | return 0; | 469 | goto out; |
425 | 470 | ||
426 | par = cur->parent; | 471 | par = parent_cs(cur); |
427 | 472 | ||
428 | /* We must be a subset of our parent cpuset */ | 473 | /* We must be a subset of our parent cpuset */ |
474 | ret = -EACCES; | ||
429 | if (!is_cpuset_subset(trial, par)) | 475 | if (!is_cpuset_subset(trial, par)) |
430 | return -EACCES; | 476 | goto out; |
431 | 477 | ||
432 | /* | 478 | /* |
433 | * If either I or some sibling (!= me) is exclusive, we can't | 479 | * If either I or some sibling (!= me) is exclusive, we can't |
434 | * overlap | 480 | * overlap |
435 | */ | 481 | */ |
436 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { | 482 | ret = -EINVAL; |
437 | c = cgroup_cs(cont); | 483 | cpuset_for_each_child(c, cont, par) { |
438 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 484 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
439 | c != cur && | 485 | c != cur && |
440 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 486 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
441 | return -EINVAL; | 487 | goto out; |
442 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && | 488 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && |
443 | c != cur && | 489 | c != cur && |
444 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) | 490 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) |
445 | return -EINVAL; | 491 | goto out; |
446 | } | 492 | } |
447 | 493 | ||
448 | /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ | 494 | /* |
449 | if (cgroup_task_count(cur->css.cgroup)) { | 495 | * Cpusets with tasks - existing or newly being attached - can't |
450 | if (cpumask_empty(trial->cpus_allowed) || | 496 | * have empty cpus_allowed or mems_allowed. |
451 | nodes_empty(trial->mems_allowed)) { | 497 | */ |
452 | return -ENOSPC; | 498 | ret = -ENOSPC; |
453 | } | 499 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && |
454 | } | 500 | (cpumask_empty(trial->cpus_allowed) || |
501 | nodes_empty(trial->mems_allowed))) | ||
502 | goto out; | ||
455 | 503 | ||
456 | return 0; | 504 | ret = 0; |
505 | out: | ||
506 | rcu_read_unlock(); | ||
507 | return ret; | ||
457 | } | 508 | } |
458 | 509 | ||
459 | #ifdef CONFIG_SMP | 510 | #ifdef CONFIG_SMP |
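validate_change() now iterates children through the cpuset_for_each_child() helper under rcu_read_lock() and funnels every failure through a single unlock-and-return path. A sketch of that shape under the same assumptions (example_validate() is hypothetical; only the first subset check is shown):

        static int example_validate(struct cpuset *cur, struct cpuset *trial)
        {
                struct cpuset *c;
                struct cgroup *pos_cgrp;
                int ret = -EBUSY;

                rcu_read_lock();                        /* the child list is RCU protected */
                cpuset_for_each_child(c, pos_cgrp, cur)
                        if (!is_cpuset_subset(c, trial))
                                goto out;               /* ret still carries -EBUSY */
                ret = 0;
        out:
                rcu_read_unlock();
                return ret;
        }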
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
474 | return; | 525 | return; |
475 | } | 526 | } |
476 | 527 | ||
477 | static void | 528 | static void update_domain_attr_tree(struct sched_domain_attr *dattr, |
478 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | 529 | struct cpuset *root_cs) |
479 | { | 530 | { |
480 | LIST_HEAD(q); | 531 | struct cpuset *cp; |
481 | 532 | struct cgroup *pos_cgrp; | |
482 | list_add(&c->stack_list, &q); | ||
483 | while (!list_empty(&q)) { | ||
484 | struct cpuset *cp; | ||
485 | struct cgroup *cont; | ||
486 | struct cpuset *child; | ||
487 | |||
488 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
489 | list_del(q.next); | ||
490 | 533 | ||
491 | if (cpumask_empty(cp->cpus_allowed)) | 534 | rcu_read_lock(); |
535 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
536 | /* skip the whole subtree if @cp doesn't have any CPU */ | ||
537 | if (cpumask_empty(cp->cpus_allowed)) { | ||
538 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
492 | continue; | 539 | continue; |
540 | } | ||
493 | 541 | ||
494 | if (is_sched_load_balance(cp)) | 542 | if (is_sched_load_balance(cp)) |
495 | update_domain_attr(dattr, cp); | 543 | update_domain_attr(dattr, cp); |
496 | |||
497 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
498 | child = cgroup_cs(cont); | ||
499 | list_add_tail(&child->stack_list, &q); | ||
500 | } | ||
501 | } | 544 | } |
545 | rcu_read_unlock(); | ||
502 | } | 546 | } |
503 | 547 | ||
504 | /* | 548 | /* |
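The open-coded queue walk is replaced by a pre-order descendant iteration, and whole subtrees are skipped by advancing the cursor to cgroup_rightmost_descendant(). A sketch of that idiom using the helpers this series relies on (example_count_balanced() is hypothetical):

        static int example_count_balanced(struct cpuset *root_cs)
        {
                struct cpuset *cp;
                struct cgroup *pos_cgrp;
                int n = 0;

                rcu_read_lock();
                cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
                        if (cpumask_empty(cp->cpus_allowed)) {
                                /* children only shrink the mask, so the whole subtree is empty too */
                                pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
                                continue;
                        }
                        if (is_sched_load_balance(cp))
                                n++;
                }
                rcu_read_unlock();
                return n;
        }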
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
520 | * domains when operating in the severe memory shortage situations | 564 | * domains when operating in the severe memory shortage situations |
521 | * that could cause allocation failures below. | 565 | * that could cause allocation failures below. |
522 | * | 566 | * |
523 | * Must be called with cgroup_lock held. | 567 | * Must be called with cpuset_mutex held. |
524 | * | 568 | * |
525 | * The three key local variables below are: | 569 | * The three key local variables below are: |
526 | * q - a linked-list queue of cpuset pointers, used to implement a | 570 | * q - a linked-list queue of cpuset pointers, used to implement a |
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
558 | static int generate_sched_domains(cpumask_var_t **domains, | 602 | static int generate_sched_domains(cpumask_var_t **domains, |
559 | struct sched_domain_attr **attributes) | 603 | struct sched_domain_attr **attributes) |
560 | { | 604 | { |
561 | LIST_HEAD(q); /* queue of cpusets to be scanned */ | ||
562 | struct cpuset *cp; /* scans q */ | 605 | struct cpuset *cp; /* scans q */ |
563 | struct cpuset **csa; /* array of all cpuset ptrs */ | 606 | struct cpuset **csa; /* array of all cpuset ptrs */ |
564 | int csn; /* how many cpuset ptrs in csa so far */ | 607 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
567 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 610 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
568 | int ndoms = 0; /* number of sched domains in result */ | 611 | int ndoms = 0; /* number of sched domains in result */ |
569 | int nslot; /* next empty doms[] struct cpumask slot */ | 612 | int nslot; /* next empty doms[] struct cpumask slot */ |
613 | struct cgroup *pos_cgrp; | ||
570 | 614 | ||
571 | doms = NULL; | 615 | doms = NULL; |
572 | dattr = NULL; | 616 | dattr = NULL; |
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
594 | goto done; | 638 | goto done; |
595 | csn = 0; | 639 | csn = 0; |
596 | 640 | ||
597 | list_add(&top_cpuset.stack_list, &q); | 641 | rcu_read_lock(); |
598 | while (!list_empty(&q)) { | 642 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { |
599 | struct cgroup *cont; | ||
600 | struct cpuset *child; /* scans child cpusets of cp */ | ||
601 | |||
602 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
603 | list_del(q.next); | ||
604 | |||
605 | if (cpumask_empty(cp->cpus_allowed)) | ||
606 | continue; | ||
607 | |||
608 | /* | 643 | /* |
609 | * All child cpusets contain a subset of the parent's cpus, so | 644 | * Continue traversing beyond @cp iff @cp has some CPUs and |
610 | * just skip them, and then we call update_domain_attr_tree() | 645 | * isn't load balancing. The former is obvious. The |
611 | * to calc relax_domain_level of the corresponding sched | 646 | * latter: All child cpusets contain a subset of the |
612 | * domain. | 647 | * parent's cpus, so just skip them, and then we call |
648 | * update_domain_attr_tree() to calc relax_domain_level of | ||
649 | * the corresponding sched domain. | ||
613 | */ | 650 | */ |
614 | if (is_sched_load_balance(cp)) { | 651 | if (!cpumask_empty(cp->cpus_allowed) && |
615 | csa[csn++] = cp; | 652 | !is_sched_load_balance(cp)) |
616 | continue; | 653 | continue; |
617 | } | ||
618 | 654 | ||
619 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 655 | if (is_sched_load_balance(cp)) |
620 | child = cgroup_cs(cont); | 656 | csa[csn++] = cp; |
621 | list_add_tail(&child->stack_list, &q); | 657 | |
622 | } | 658 | /* skip @cp's subtree */ |
623 | } | 659 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); |
660 | } | ||
661 | rcu_read_unlock(); | ||
624 | 662 | ||
625 | for (i = 0; i < csn; i++) | 663 | for (i = 0; i < csn; i++) |
626 | csa[i]->pn = i; | 664 | csa[i]->pn = i; |
@@ -725,25 +763,25 @@ done: | |||
725 | /* | 763 | /* |
726 | * Rebuild scheduler domains. | 764 | * Rebuild scheduler domains. |
727 | * | 765 | * |
728 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | 766 | * If the flag 'sched_load_balance' of any cpuset with non-empty |
729 | * Takes both cgroup_mutex and get_online_cpus(). | 767 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset |
768 | * which has that flag enabled, or if any cpuset with a non-empty | ||
769 | * 'cpus' is removed, then call this routine to rebuild the | ||
770 | * scheduler's dynamic sched domains. | ||
730 | * | 771 | * |
731 | * Cannot be directly called from cpuset code handling changes | 772 | * Call with cpuset_mutex held. Takes get_online_cpus(). |
732 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
733 | * from code that already holds cgroup_mutex. | ||
734 | */ | 773 | */ |
735 | static void do_rebuild_sched_domains(struct work_struct *unused) | 774 | static void rebuild_sched_domains_locked(void) |
736 | { | 775 | { |
737 | struct sched_domain_attr *attr; | 776 | struct sched_domain_attr *attr; |
738 | cpumask_var_t *doms; | 777 | cpumask_var_t *doms; |
739 | int ndoms; | 778 | int ndoms; |
740 | 779 | ||
780 | lockdep_assert_held(&cpuset_mutex); | ||
741 | get_online_cpus(); | 781 | get_online_cpus(); |
742 | 782 | ||
743 | /* Generate domain masks and attrs */ | 783 | /* Generate domain masks and attrs */ |
744 | cgroup_lock(); | ||
745 | ndoms = generate_sched_domains(&doms, &attr); | 784 | ndoms = generate_sched_domains(&doms, &attr); |
746 | cgroup_unlock(); | ||
747 | 785 | ||
748 | /* Have scheduler rebuild the domains */ | 786 | /* Have scheduler rebuild the domains */ |
749 | partition_sched_domains(ndoms, doms, attr); | 787 | partition_sched_domains(ndoms, doms, attr); |
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) | |||
751 | put_online_cpus(); | 789 | put_online_cpus(); |
752 | } | 790 | } |
753 | #else /* !CONFIG_SMP */ | 791 | #else /* !CONFIG_SMP */ |
754 | static void do_rebuild_sched_domains(struct work_struct *unused) | 792 | static void rebuild_sched_domains_locked(void) |
755 | { | 793 | { |
756 | } | 794 | } |
757 | 795 | ||
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
763 | } | 801 | } |
764 | #endif /* CONFIG_SMP */ | 802 | #endif /* CONFIG_SMP */ |
765 | 803 | ||
766 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); | ||
767 | |||
768 | /* | ||
769 | * Rebuild scheduler domains, asynchronously via workqueue. | ||
770 | * | ||
771 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
772 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
773 | * which has that flag enabled, or if any cpuset with a non-empty | ||
774 | * 'cpus' is removed, then call this routine to rebuild the | ||
775 | * scheduler's dynamic sched domains. | ||
776 | * | ||
777 | * The rebuild_sched_domains() and partition_sched_domains() | ||
778 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
779 | * but such cpuset changes as these must nest that locking the | ||
780 | * other way, holding cgroup_lock() for much of the code. | ||
781 | * | ||
782 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
783 | * these user changes delegates the actual sched domain rebuilding | ||
784 | * to a separate workqueue thread, which ends up processing the | ||
785 | * above do_rebuild_sched_domains() function. | ||
786 | */ | ||
787 | static void async_rebuild_sched_domains(void) | ||
788 | { | ||
789 | queue_work(cpuset_wq, &rebuild_sched_domains_work); | ||
790 | } | ||
791 | |||
792 | /* | ||
793 | * Accomplishes the same scheduler domain rebuild as the above | ||
794 | * async_rebuild_sched_domains(), however it directly calls the | ||
795 | * rebuild routine synchronously rather than calling it via an | ||
796 | * asynchronous work thread. | ||
797 | * | ||
798 | * This can only be called from code that is not holding | ||
799 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
800 | */ | ||
801 | void rebuild_sched_domains(void) | 804 | void rebuild_sched_domains(void) |
802 | { | 805 | { |
803 | do_rebuild_sched_domains(NULL); | 806 | mutex_lock(&cpuset_mutex); |
807 | rebuild_sched_domains_locked(); | ||
808 | mutex_unlock(&cpuset_mutex); | ||
804 | } | 809 | } |
805 | 810 | ||
806 | /** | 811 | /** |
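With the workqueue indirection gone there are two entry points: rebuild_sched_domains_locked() for paths that already hold cpuset_mutex (it asserts this via lockdep), and rebuild_sched_domains() for callers outside the cpuset code, which simply wraps the locked variant. A rough sketch of the intended usage (example_flag_write() and example_external_rebuild() are hypothetical):

        /* inside a cpuset write handler: cpuset_mutex is already held */
        static void example_flag_write(struct cpuset *cs)
        {
                lockdep_assert_held(&cpuset_mutex);
                if (is_sched_load_balance(cs))
                        rebuild_sched_domains_locked(); /* no extra locking needed */
        }

        /* from code that does not hold cpuset_mutex */
        static void example_external_rebuild(void)
        {
                rebuild_sched_domains();                /* takes cpuset_mutex internally */
        }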
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void) | |||
808 | * @tsk: task to test | 813 | * @tsk: task to test |
809 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | 814 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner |
810 | * | 815 | * |
811 | * Call with cgroup_mutex held. May take callback_mutex during call. | 816 | * Call with cpuset_mutex held. May take callback_mutex during call. |
812 | * Called for each task in a cgroup by cgroup_scan_tasks(). | 817 | * Called for each task in a cgroup by cgroup_scan_tasks(). |
813 | * Return nonzero if this task's cpus_allowed mask should be changed (in other | 818 | * Return nonzero if this task's cpus_allowed mask should be changed (in other |

814 | * words, if its mask is not equal to its cpuset's mask). | 819 | * words, if its mask is not equal to its cpuset's mask). |
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, | |||
829 | * cpus_allowed mask needs to be changed. | 834 | * cpus_allowed mask needs to be changed. |
830 | * | 835 | * |
831 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 836 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
832 | * holding cgroup_lock() at this point. | 837 | * holding cpuset_mutex at this point. |
833 | */ | 838 | */ |
834 | static void cpuset_change_cpumask(struct task_struct *tsk, | 839 | static void cpuset_change_cpumask(struct task_struct *tsk, |
835 | struct cgroup_scanner *scan) | 840 | struct cgroup_scanner *scan) |
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
842 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 847 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
843 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 848 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
844 | * | 849 | * |
845 | * Called with cgroup_mutex held | 850 | * Called with cpuset_mutex held |
846 | * | 851 | * |
847 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 852 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
848 | * calling callback functions for each. | 853 | * calling callback functions for each. |
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
920 | heap_free(&heap); | 925 | heap_free(&heap); |
921 | 926 | ||
922 | if (is_load_balanced) | 927 | if (is_load_balanced) |
923 | async_rebuild_sched_domains(); | 928 | rebuild_sched_domains_locked(); |
924 | return 0; | 929 | return 0; |
925 | } | 930 | } |
926 | 931 | ||
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
932 | * Temporarily set tasks' mems_allowed to target nodes of migration, | 937 | * Temporarily set tasks' mems_allowed to target nodes of migration, |
933 | * so that the migration code can allocate pages on these nodes. | 938 | * so that the migration code can allocate pages on these nodes. |
934 | * | 939 | * |
935 | * Call holding cgroup_mutex, so current's cpuset won't change | 940 | * Call holding cpuset_mutex, so current's cpuset won't change |
936 | * during this call, as manage_mutex holds off any cpuset_attach() | 941 | * during this call, as manage_mutex holds off any cpuset_attach() |
937 | * calls. Therefore we don't need to take task_lock around the | 942 | * calls. Therefore we don't need to take task_lock around the |
938 | * call to guarantee_online_mems(), as we know no one is changing | 943 | * call to guarantee_online_mems(), as we know no one is changing |
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1007 | /* | 1012 | /* |
1008 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1013 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
1009 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1014 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
1010 | * memory_migrate flag is set. Called with cgroup_mutex held. | 1015 | * memory_migrate flag is set. Called with cpuset_mutex held. |
1011 | */ | 1016 | */ |
1012 | static void cpuset_change_nodemask(struct task_struct *p, | 1017 | static void cpuset_change_nodemask(struct task_struct *p, |
1013 | struct cgroup_scanner *scan) | 1018 | struct cgroup_scanner *scan) |
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1016 | struct cpuset *cs; | 1021 | struct cpuset *cs; |
1017 | int migrate; | 1022 | int migrate; |
1018 | const nodemask_t *oldmem = scan->data; | 1023 | const nodemask_t *oldmem = scan->data; |
1019 | static nodemask_t newmems; /* protected by cgroup_mutex */ | 1024 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
1020 | 1025 | ||
1021 | cs = cgroup_cs(scan->cg); | 1026 | cs = cgroup_cs(scan->cg); |
1022 | guarantee_online_mems(cs, &newmems); | 1027 | guarantee_online_mems(cs, &newmems); |
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound; | |||
1043 | * @oldmem: old mems_allowed of cpuset cs | 1048 | * @oldmem: old mems_allowed of cpuset cs |
1044 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1049 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
1045 | * | 1050 | * |
1046 | * Called with cgroup_mutex held | 1051 | * Called with cpuset_mutex held |
1047 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1052 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
1048 | * if @heap != NULL. | 1053 | * if @heap != NULL. |
1049 | */ | 1054 | */ |
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1065 | * take while holding tasklist_lock. Forks can happen - the | 1070 | * take while holding tasklist_lock. Forks can happen - the |
1066 | * mpol_dup() cpuset_being_rebound check will catch such forks, | 1071 | * mpol_dup() cpuset_being_rebound check will catch such forks, |
1067 | * and rebind their vma mempolicies too. Because we still hold | 1072 | * and rebind their vma mempolicies too. Because we still hold |
1068 | * the global cgroup_mutex, we know that no other rebind effort | 1073 | * the global cpuset_mutex, we know that no other rebind effort |
1069 | * will be contending for the global variable cpuset_being_rebound. | 1074 | * will be contending for the global variable cpuset_being_rebound. |
1070 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1075 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1071 | * is idempotent. Also migrate pages in each mm to new nodes. | 1076 | * is idempotent. Also migrate pages in each mm to new nodes. |
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1084 | * mempolicies and if the cpuset is marked 'memory_migrate', | 1089 | * mempolicies and if the cpuset is marked 'memory_migrate', |
1085 | * migrate the tasks pages to the new memory. | 1090 | * migrate the tasks pages to the new memory. |
1086 | * | 1091 | * |
1087 | * Call with cgroup_mutex held. May take callback_mutex during call. | 1092 | * Call with cpuset_mutex held. May take callback_mutex during call. |
1088 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1093 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
1089 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 1094 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
1090 | * their mempolicies to the cpusets new mems_allowed. | 1095 | * their mempolicies to the cpusets new mems_allowed. |
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1168 | cs->relax_domain_level = val; | 1173 | cs->relax_domain_level = val; |
1169 | if (!cpumask_empty(cs->cpus_allowed) && | 1174 | if (!cpumask_empty(cs->cpus_allowed) && |
1170 | is_sched_load_balance(cs)) | 1175 | is_sched_load_balance(cs)) |
1171 | async_rebuild_sched_domains(); | 1176 | rebuild_sched_domains_locked(); |
1172 | } | 1177 | } |
1173 | 1178 | ||
1174 | return 0; | 1179 | return 0; |
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1182 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1187 | * Called by cgroup_scan_tasks() for each task in a cgroup. |
1183 | * | 1188 | * |
1184 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1189 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
1185 | * holding cgroup_lock() at this point. | 1190 | * holding cpuset_mutex at this point. |
1186 | */ | 1191 | */ |
1187 | static void cpuset_change_flag(struct task_struct *tsk, | 1192 | static void cpuset_change_flag(struct task_struct *tsk, |
1188 | struct cgroup_scanner *scan) | 1193 | struct cgroup_scanner *scan) |
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk, | |||
1195 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1200 | * @cs: the cpuset in which each task's spread flags needs to be changed |
1196 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1201 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
1197 | * | 1202 | * |
1198 | * Called with cgroup_mutex held | 1203 | * Called with cpuset_mutex held |
1199 | * | 1204 | * |
1200 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1205 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
1201 | * calling callback functions for each. | 1206 | * calling callback functions for each. |
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | |||
1220 | * cs: the cpuset to update | 1225 | * cs: the cpuset to update |
1221 | * turning_on: whether the flag is being set or cleared | 1226 | * turning_on: whether the flag is being set or cleared |
1222 | * | 1227 | * |
1223 | * Call with cgroup_mutex held. | 1228 | * Call with cpuset_mutex held. |
1224 | */ | 1229 | */ |
1225 | 1230 | ||
1226 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | 1231 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1260 | mutex_unlock(&callback_mutex); | 1265 | mutex_unlock(&callback_mutex); |
1261 | 1266 | ||
1262 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | 1267 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
1263 | async_rebuild_sched_domains(); | 1268 | rebuild_sched_domains_locked(); |
1264 | 1269 | ||
1265 | if (spread_flag_changed) | 1270 | if (spread_flag_changed) |
1266 | update_tasks_flags(cs, &heap); | 1271 | update_tasks_flags(cs, &heap); |
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1368 | return val; | 1373 | return val; |
1369 | } | 1374 | } |
1370 | 1375 | ||
1371 | /* | 1376 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
1372 | * Protected by cgroup_lock. The nodemasks must be stored globally because | ||
1373 | * dynamically allocating them is not allowed in can_attach, and they must | ||
1374 | * persist until attach. | ||
1375 | */ | ||
1376 | static cpumask_var_t cpus_attach; | ||
1377 | static nodemask_t cpuset_attach_nodemask_from; | ||
1378 | static nodemask_t cpuset_attach_nodemask_to; | ||
1379 | |||
1380 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
1381 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1377 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1382 | { | 1378 | { |
1383 | struct cpuset *cs = cgroup_cs(cgrp); | 1379 | struct cpuset *cs = cgroup_cs(cgrp); |
1384 | struct task_struct *task; | 1380 | struct task_struct *task; |
1385 | int ret; | 1381 | int ret; |
1386 | 1382 | ||
1383 | mutex_lock(&cpuset_mutex); | ||
1384 | |||
1385 | ret = -ENOSPC; | ||
1387 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1386 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
1388 | return -ENOSPC; | 1387 | goto out_unlock; |
1389 | 1388 | ||
1390 | cgroup_taskset_for_each(task, cgrp, tset) { | 1389 | cgroup_taskset_for_each(task, cgrp, tset) { |
1391 | /* | 1390 | /* |
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1397 | * set_cpus_allowed_ptr() on all attached tasks before | 1396 | * set_cpus_allowed_ptr() on all attached tasks before |
1398 | * cpus_allowed may be changed. | 1397 | * cpus_allowed may be changed. |
1399 | */ | 1398 | */ |
1399 | ret = -EINVAL; | ||
1400 | if (task->flags & PF_THREAD_BOUND) | 1400 | if (task->flags & PF_THREAD_BOUND) |
1401 | return -EINVAL; | 1401 | goto out_unlock; |
1402 | if ((ret = security_task_setscheduler(task))) | 1402 | ret = security_task_setscheduler(task); |
1403 | return ret; | 1403 | if (ret) |
1404 | goto out_unlock; | ||
1404 | } | 1405 | } |
1405 | 1406 | ||
1406 | /* prepare for attach */ | 1407 | /* |
1407 | if (cs == &top_cpuset) | 1408 | * Mark attach is in progress. This makes validate_change() fail |
1408 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1409 | * changes which zero cpus/mems_allowed. |
1409 | else | 1410 | */ |
1410 | guarantee_online_cpus(cs, cpus_attach); | 1411 | cs->attach_in_progress++; |
1411 | 1412 | ret = 0; | |
1412 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1413 | out_unlock: |
1414 | mutex_unlock(&cpuset_mutex); | ||
1415 | return ret; | ||
1416 | } | ||
1413 | 1417 | ||
1414 | return 0; | 1418 | static void cpuset_cancel_attach(struct cgroup *cgrp, |
1419 | struct cgroup_taskset *tset) | ||
1420 | { | ||
1421 | mutex_lock(&cpuset_mutex); | ||
1422 | cgroup_cs(cgrp)->attach_in_progress--; | ||
1423 | mutex_unlock(&cpuset_mutex); | ||
1415 | } | 1424 | } |
1416 | 1425 | ||
1426 | /* | ||
1427 | * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() | ||
1428 | * but we can't allocate it dynamically there. Define it global and | ||
1429 | * allocate from cpuset_init(). | ||
1430 | */ | ||
1431 | static cpumask_var_t cpus_attach; | ||
1432 | |||
1417 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1433 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1418 | { | 1434 | { |
1435 | /* static bufs protected by cpuset_mutex */ | ||
1436 | static nodemask_t cpuset_attach_nodemask_from; | ||
1437 | static nodemask_t cpuset_attach_nodemask_to; | ||
1419 | struct mm_struct *mm; | 1438 | struct mm_struct *mm; |
1420 | struct task_struct *task; | 1439 | struct task_struct *task; |
1421 | struct task_struct *leader = cgroup_taskset_first(tset); | 1440 | struct task_struct *leader = cgroup_taskset_first(tset); |
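The ->attach_in_progress counter added here closes a race: cpuset_can_attach() bumps it under cpuset_mutex so that validate_change() will refuse to empty cpus_allowed/mems_allowed while an attach is pending, and either cpuset_cancel_attach() or cpuset_attach() drops it again. A sketch of the protocol, condensed rather than quoted from the patch (example_may_empty_masks() is hypothetical):

        /* cpuset_can_attach():    lock cpuset_mutex; run checks; cs->attach_in_progress++; unlock */
        /* cpuset_cancel_attach(): lock cpuset_mutex; cs->attach_in_progress--; unlock             */
        /* cpuset_attach():        lock cpuset_mutex; move tasks; cs->attach_in_progress--;        */
        /*                         schedule hotplug propagation if @cs went empty; unlock          */

        static bool example_may_empty_masks(struct cpuset *cs)
        {
                lockdep_assert_held(&cpuset_mutex);
                /* mirrors the new check in validate_change() */
                return !cgroup_task_count(cs->css.cgroup) && !cs->attach_in_progress;
        }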
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1423 | struct cpuset *cs = cgroup_cs(cgrp); | 1442 | struct cpuset *cs = cgroup_cs(cgrp); |
1424 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1443 | struct cpuset *oldcs = cgroup_cs(oldcgrp); |
1425 | 1444 | ||
1445 | mutex_lock(&cpuset_mutex); | ||
1446 | |||
1447 | /* prepare for attach */ | ||
1448 | if (cs == &top_cpuset) | ||
1449 | cpumask_copy(cpus_attach, cpu_possible_mask); | ||
1450 | else | ||
1451 | guarantee_online_cpus(cs, cpus_attach); | ||
1452 | |||
1453 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | ||
1454 | |||
1426 | cgroup_taskset_for_each(task, cgrp, tset) { | 1455 | cgroup_taskset_for_each(task, cgrp, tset) { |
1427 | /* | 1456 | /* |
1428 | * can_attach beforehand should guarantee that this doesn't | 1457 | * can_attach beforehand should guarantee that this doesn't |
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1448 | &cpuset_attach_nodemask_to); | 1477 | &cpuset_attach_nodemask_to); |
1449 | mmput(mm); | 1478 | mmput(mm); |
1450 | } | 1479 | } |
1480 | |||
1481 | cs->attach_in_progress--; | ||
1482 | |||
1483 | /* | ||
1484 | * We may have raced with CPU/memory hotunplug. Trigger hotplug | ||
1485 | * propagation if @cs doesn't have any CPU or memory. It will move | ||
1486 | * the newly added tasks to the nearest parent which can execute. | ||
1487 | */ | ||
1488 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1489 | schedule_cpuset_propagate_hotplug(cs); | ||
1490 | |||
1491 | mutex_unlock(&cpuset_mutex); | ||
1451 | } | 1492 | } |
1452 | 1493 | ||
1453 | /* The various types of files and directories in a cpuset file system */ | 1494 | /* The various types of files and directories in a cpuset file system */ |
@@ -1469,12 +1510,13 @@ typedef enum { | |||
1469 | 1510 | ||
1470 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1511 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) |
1471 | { | 1512 | { |
1472 | int retval = 0; | ||
1473 | struct cpuset *cs = cgroup_cs(cgrp); | 1513 | struct cpuset *cs = cgroup_cs(cgrp); |
1474 | cpuset_filetype_t type = cft->private; | 1514 | cpuset_filetype_t type = cft->private; |
1515 | int retval = -ENODEV; | ||
1475 | 1516 | ||
1476 | if (!cgroup_lock_live_group(cgrp)) | 1517 | mutex_lock(&cpuset_mutex); |
1477 | return -ENODEV; | 1518 | if (!is_cpuset_online(cs)) |
1519 | goto out_unlock; | ||
1478 | 1520 | ||
1479 | switch (type) { | 1521 | switch (type) { |
1480 | case FILE_CPU_EXCLUSIVE: | 1522 | case FILE_CPU_EXCLUSIVE: |
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | |||
1508 | retval = -EINVAL; | 1550 | retval = -EINVAL; |
1509 | break; | 1551 | break; |
1510 | } | 1552 | } |
1511 | cgroup_unlock(); | 1553 | out_unlock: |
1554 | mutex_unlock(&cpuset_mutex); | ||
1512 | return retval; | 1555 | return retval; |
1513 | } | 1556 | } |
1514 | 1557 | ||
1515 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1558 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) |
1516 | { | 1559 | { |
1517 | int retval = 0; | ||
1518 | struct cpuset *cs = cgroup_cs(cgrp); | 1560 | struct cpuset *cs = cgroup_cs(cgrp); |
1519 | cpuset_filetype_t type = cft->private; | 1561 | cpuset_filetype_t type = cft->private; |
1562 | int retval = -ENODEV; | ||
1520 | 1563 | ||
1521 | if (!cgroup_lock_live_group(cgrp)) | 1564 | mutex_lock(&cpuset_mutex); |
1522 | return -ENODEV; | 1565 | if (!is_cpuset_online(cs)) |
1566 | goto out_unlock; | ||
1523 | 1567 | ||
1524 | switch (type) { | 1568 | switch (type) { |
1525 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1569 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1529 | retval = -EINVAL; | 1573 | retval = -EINVAL; |
1530 | break; | 1574 | break; |
1531 | } | 1575 | } |
1532 | cgroup_unlock(); | 1576 | out_unlock: |
1577 | mutex_unlock(&cpuset_mutex); | ||
1533 | return retval; | 1578 | return retval; |
1534 | } | 1579 | } |
1535 | 1580 | ||
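All three write handlers now follow the same guard: take cpuset_mutex, bail out with -ENODEV if the cpuset has already gone offline, and leave through a single out_unlock label. A condensed sketch of that pattern (example_write_u64() is hypothetical):

        static int example_write_u64(struct cgroup *cgrp, u64 val)
        {
                struct cpuset *cs = cgroup_cs(cgrp);
                int retval = -ENODEV;

                mutex_lock(&cpuset_mutex);
                if (!is_cpuset_online(cs))      /* replaces cgroup_lock_live_group() */
                        goto out_unlock;

                retval = 0;                     /* ... apply @val to @cs here ... */
        out_unlock:
                mutex_unlock(&cpuset_mutex);
                return retval;
        }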
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1539 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1584 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, |
1540 | const char *buf) | 1585 | const char *buf) |
1541 | { | 1586 | { |
1542 | int retval = 0; | ||
1543 | struct cpuset *cs = cgroup_cs(cgrp); | 1587 | struct cpuset *cs = cgroup_cs(cgrp); |
1544 | struct cpuset *trialcs; | 1588 | struct cpuset *trialcs; |
1589 | int retval = -ENODEV; | ||
1590 | |||
1591 | /* | ||
1592 | * CPU or memory hotunplug may leave @cs w/o any execution | ||
1593 | * resources, in which case the hotplug code asynchronously updates | ||
1594 | * configuration and transfers all tasks to the nearest ancestor | ||
1595 | * which can execute. | ||
1596 | * | ||
1597 | * As writes to "cpus" or "mems" may restore @cs's execution | ||
1598 | * resources, wait for the previously scheduled operations before | ||
1599 | * proceeding, so that we don't end up repeatedly removing tasks added | ||
1600 | * after execution capability is restored. | ||
1601 | * | ||
1602 | * Flushing cpuset_hotplug_work is enough to synchronize against | ||
1603 | * hotplug handling; however, cpuset_attach() may schedule | ||
1604 | * propagation work directly. Flush the workqueue too. | ||
1605 | */ | ||
1606 | flush_work(&cpuset_hotplug_work); | ||
1607 | flush_workqueue(cpuset_propagate_hotplug_wq); | ||
1545 | 1608 | ||
1546 | if (!cgroup_lock_live_group(cgrp)) | 1609 | mutex_lock(&cpuset_mutex); |
1547 | return -ENODEV; | 1610 | if (!is_cpuset_online(cs)) |
1611 | goto out_unlock; | ||
1548 | 1612 | ||
1549 | trialcs = alloc_trial_cpuset(cs); | 1613 | trialcs = alloc_trial_cpuset(cs); |
1550 | if (!trialcs) { | 1614 | if (!trialcs) { |
1551 | retval = -ENOMEM; | 1615 | retval = -ENOMEM; |
1552 | goto out; | 1616 | goto out_unlock; |
1553 | } | 1617 | } |
1554 | 1618 | ||
1555 | switch (cft->private) { | 1619 | switch (cft->private) { |
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1565 | } | 1629 | } |
1566 | 1630 | ||
1567 | free_trial_cpuset(trialcs); | 1631 | free_trial_cpuset(trialcs); |
1568 | out: | 1632 | out_unlock: |
1569 | cgroup_unlock(); | 1633 | mutex_unlock(&cpuset_mutex); |
1570 | return retval; | 1634 | return retval; |
1571 | } | 1635 | } |
1572 | 1636 | ||
@@ -1790,15 +1854,12 @@ static struct cftype files[] = { | |||
1790 | 1854 | ||
1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | 1855 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
1792 | { | 1856 | { |
1793 | struct cgroup *parent_cg = cont->parent; | 1857 | struct cpuset *cs; |
1794 | struct cgroup *tmp_cg; | ||
1795 | struct cpuset *parent, *cs; | ||
1796 | 1858 | ||
1797 | if (!parent_cg) | 1859 | if (!cont->parent) |
1798 | return &top_cpuset.css; | 1860 | return &top_cpuset.css; |
1799 | parent = cgroup_cs(parent_cg); | ||
1800 | 1861 | ||
1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1862 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
1802 | if (!cs) | 1863 | if (!cs) |
1803 | return ERR_PTR(-ENOMEM); | 1864 | return ERR_PTR(-ENOMEM); |
1804 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { | 1865 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { |
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
1806 | return ERR_PTR(-ENOMEM); | 1867 | return ERR_PTR(-ENOMEM); |
1807 | } | 1868 | } |
1808 | 1869 | ||
1809 | cs->flags = 0; | ||
1810 | if (is_spread_page(parent)) | ||
1811 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
1812 | if (is_spread_slab(parent)) | ||
1813 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
1814 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1870 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1815 | cpumask_clear(cs->cpus_allowed); | 1871 | cpumask_clear(cs->cpus_allowed); |
1816 | nodes_clear(cs->mems_allowed); | 1872 | nodes_clear(cs->mems_allowed); |
1817 | fmeter_init(&cs->fmeter); | 1873 | fmeter_init(&cs->fmeter); |
1874 | INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); | ||
1818 | cs->relax_domain_level = -1; | 1875 | cs->relax_domain_level = -1; |
1819 | 1876 | ||
1820 | cs->parent = parent; | 1877 | return &cs->css; |
1878 | } | ||
1879 | |||
1880 | static int cpuset_css_online(struct cgroup *cgrp) | ||
1881 | { | ||
1882 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1883 | struct cpuset *parent = parent_cs(cs); | ||
1884 | struct cpuset *tmp_cs; | ||
1885 | struct cgroup *pos_cg; | ||
1886 | |||
1887 | if (!parent) | ||
1888 | return 0; | ||
1889 | |||
1890 | mutex_lock(&cpuset_mutex); | ||
1891 | |||
1892 | set_bit(CS_ONLINE, &cs->flags); | ||
1893 | if (is_spread_page(parent)) | ||
1894 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
1895 | if (is_spread_slab(parent)) | ||
1896 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
1897 | |||
1821 | number_of_cpusets++; | 1898 | number_of_cpusets++; |
1822 | 1899 | ||
1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | 1900 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) |
1824 | goto skip_clone; | 1901 | goto out_unlock; |
1825 | 1902 | ||
1826 | /* | 1903 | /* |
1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | 1904 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is |
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | 1913 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive |
1837 | * (and likewise for mems) to the new cgroup. | 1914 | * (and likewise for mems) to the new cgroup. |
1838 | */ | 1915 | */ |
1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | 1916 | rcu_read_lock(); |
1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | 1917 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { |
1841 | 1918 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | |
1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | 1919 | rcu_read_unlock(); |
1843 | goto skip_clone; | 1920 | goto out_unlock; |
1921 | } | ||
1844 | } | 1922 | } |
1923 | rcu_read_unlock(); | ||
1845 | 1924 | ||
1846 | mutex_lock(&callback_mutex); | 1925 | mutex_lock(&callback_mutex); |
1847 | cs->mems_allowed = parent->mems_allowed; | 1926 | cs->mems_allowed = parent->mems_allowed; |
1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | 1927 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
1849 | mutex_unlock(&callback_mutex); | 1928 | mutex_unlock(&callback_mutex); |
1850 | skip_clone: | 1929 | out_unlock: |
1851 | return &cs->css; | 1930 | mutex_unlock(&cpuset_mutex); |
1931 | return 0; | ||
1932 | } | ||
1933 | |||
1934 | static void cpuset_css_offline(struct cgroup *cgrp) | ||
1935 | { | ||
1936 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1937 | |||
1938 | mutex_lock(&cpuset_mutex); | ||
1939 | |||
1940 | if (is_sched_load_balance(cs)) | ||
1941 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | ||
1942 | |||
1943 | number_of_cpusets--; | ||
1944 | clear_bit(CS_ONLINE, &cs->flags); | ||
1945 | |||
1946 | mutex_unlock(&cpuset_mutex); | ||
1852 | } | 1947 | } |
1853 | 1948 | ||
1854 | /* | 1949 | /* |
1855 | * If the cpuset being removed has its flag 'sched_load_balance' | 1950 | * If the cpuset being removed has its flag 'sched_load_balance' |
1856 | * enabled, then simulate turning sched_load_balance off, which | 1951 | * enabled, then simulate turning sched_load_balance off, which |
1857 | * will call async_rebuild_sched_domains(). | 1952 | * will call rebuild_sched_domains_locked(). |
1858 | */ | 1953 | */ |
1859 | 1954 | ||
1860 | static void cpuset_css_free(struct cgroup *cont) | 1955 | static void cpuset_css_free(struct cgroup *cont) |
1861 | { | 1956 | { |
1862 | struct cpuset *cs = cgroup_cs(cont); | 1957 | struct cpuset *cs = cgroup_cs(cont); |
1863 | 1958 | ||
1864 | if (is_sched_load_balance(cs)) | ||
1865 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | ||
1866 | |||
1867 | number_of_cpusets--; | ||
1868 | free_cpumask_var(cs->cpus_allowed); | 1959 | free_cpumask_var(cs->cpus_allowed); |
1869 | kfree(cs); | 1960 | kfree(cs); |
1870 | } | 1961 | } |
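Parent-dependent initialization (spread flags, the clone-children copy, the number_of_cpusets count) moves from ->css_alloc to the new ->css_online, and teardown that touches shared state moves from ->css_free to ->css_offline. A heavily condensed sketch of the split (the spread/clone handling and the load-balance shutdown are omitted here; see the callbacks above for the full versions):

        static int example_css_online(struct cgroup *cgrp)
        {
                struct cpuset *cs = cgroup_cs(cgrp);

                mutex_lock(&cpuset_mutex);
                set_bit(CS_ONLINE, &cs->flags);         /* now visible to the iterators */
                number_of_cpusets++;
                mutex_unlock(&cpuset_mutex);
                return 0;
        }

        static void example_css_offline(struct cgroup *cgrp)
        {
                struct cpuset *cs = cgroup_cs(cgrp);

                mutex_lock(&cpuset_mutex);
                number_of_cpusets--;
                clear_bit(CS_ONLINE, &cs->flags);       /* iterators stop seeing @cs */
                mutex_unlock(&cpuset_mutex);
        }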
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont) | |||
1872 | struct cgroup_subsys cpuset_subsys = { | 1963 | struct cgroup_subsys cpuset_subsys = { |
1873 | .name = "cpuset", | 1964 | .name = "cpuset", |
1874 | .css_alloc = cpuset_css_alloc, | 1965 | .css_alloc = cpuset_css_alloc, |
1966 | .css_online = cpuset_css_online, | ||
1967 | .css_offline = cpuset_css_offline, | ||
1875 | .css_free = cpuset_css_free, | 1968 | .css_free = cpuset_css_free, |
1876 | .can_attach = cpuset_can_attach, | 1969 | .can_attach = cpuset_can_attach, |
1970 | .cancel_attach = cpuset_cancel_attach, | ||
1877 | .attach = cpuset_attach, | 1971 | .attach = cpuset_attach, |
1878 | .subsys_id = cpuset_subsys_id, | 1972 | .subsys_id = cpuset_subsys_id, |
1879 | .base_cftypes = files, | 1973 | .base_cftypes = files, |
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
1924 | { | 2018 | { |
1925 | struct cgroup *new_cgroup = scan->data; | 2019 | struct cgroup *new_cgroup = scan->data; |
1926 | 2020 | ||
2021 | cgroup_lock(); | ||
1927 | cgroup_attach_task(new_cgroup, tsk); | 2022 | cgroup_attach_task(new_cgroup, tsk); |
2023 | cgroup_unlock(); | ||
1928 | } | 2024 | } |
1929 | 2025 | ||
1930 | /** | 2026 | /** |
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
1932 | * @from: cpuset in which the tasks currently reside | 2028 | * @from: cpuset in which the tasks currently reside |
1933 | * @to: cpuset to which the tasks will be moved | 2029 | * @to: cpuset to which the tasks will be moved |
1934 | * | 2030 | * |
1935 | * Called with cgroup_mutex held | 2031 | * Called with cpuset_mutex held |
1936 | * callback_mutex must not be held, as cpuset_attach() will take it. | 2032 | * callback_mutex must not be held, as cpuset_attach() will take it. |
1937 | * | 2033 | * |
1938 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 2034 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1959 | * removing that CPU or node from all cpusets. If this removes the | 2055 | * removing that CPU or node from all cpusets. If this removes the |
1960 | * last CPU or node from a cpuset, then move the tasks in the empty | 2056 | * last CPU or node from a cpuset, then move the tasks in the empty |
1961 | * cpuset to its next-highest non-empty parent. | 2057 | * cpuset to its next-highest non-empty parent. |
1962 | * | ||
1963 | * Called with cgroup_mutex held | ||
1964 | * callback_mutex must not be held, as cpuset_attach() will take it. | ||
1965 | */ | 2058 | */ |
1966 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | 2059 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) |
1967 | { | 2060 | { |
1968 | struct cpuset *parent; | 2061 | struct cpuset *parent; |
1969 | 2062 | ||
1970 | /* | 2063 | /* |
1971 | * The cgroup's css_sets list is in use if there are tasks | ||
1972 | * in the cpuset; the list is empty if there are none; | ||
1973 | * the cs->css.refcnt seems always 0. | ||
1974 | */ | ||
1975 | if (list_empty(&cs->css.cgroup->css_sets)) | ||
1976 | return; | ||
1977 | |||
1978 | /* | ||
1979 | * Find its next-highest non-empty parent, (top cpuset | 2064 | * Find its next-highest non-empty parent, (top cpuset |
1980 | * has online cpus, so can't be empty). | 2065 | * has online cpus, so can't be empty). |
1981 | */ | 2066 | */ |
1982 | parent = cs->parent; | 2067 | parent = parent_cs(cs); |
1983 | while (cpumask_empty(parent->cpus_allowed) || | 2068 | while (cpumask_empty(parent->cpus_allowed) || |
1984 | nodes_empty(parent->mems_allowed)) | 2069 | nodes_empty(parent->mems_allowed)) |
1985 | parent = parent->parent; | 2070 | parent = parent_cs(parent); |
1986 | 2071 | ||
1987 | move_member_tasks_to_cpuset(cs, parent); | 2072 | move_member_tasks_to_cpuset(cs, parent); |
1988 | } | 2073 | } |
1989 | 2074 | ||
1990 | /* | 2075 | /** |
1991 | * Helper function to traverse cpusets. | 2076 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset |
1992 | * It can be used to walk the cpuset tree from top to bottom, completing | 2077 | * @cs: cpuset in interest |
1993 | * one layer before dropping down to the next (thus always processing a | 2078 | * |
1994 | * node before any of its children). | 2079 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
2080 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, | ||
2081 | * all its tasks are moved to the nearest ancestor with both resources. | ||
1995 | */ | 2082 | */ |
1996 | static struct cpuset *cpuset_next(struct list_head *queue) | 2083 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work) |
1997 | { | 2084 | { |
1998 | struct cpuset *cp; | 2085 | static cpumask_t off_cpus; |
1999 | struct cpuset *child; /* scans child cpusets of cp */ | 2086 | static nodemask_t off_mems, tmp_mems; |
2000 | struct cgroup *cont; | 2087 | struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); |
2088 | bool is_empty; | ||
2001 | 2089 | ||
2002 | if (list_empty(queue)) | 2090 | mutex_lock(&cpuset_mutex); |
2003 | return NULL; | 2091 | |
2092 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | ||
2093 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | ||
2004 | 2094 | ||
2005 | cp = list_first_entry(queue, struct cpuset, stack_list); | 2095 | /* remove offline cpus from @cs */ |
2006 | list_del(queue->next); | 2096 | if (!cpumask_empty(&off_cpus)) { |
2007 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 2097 | mutex_lock(&callback_mutex); |
2008 | child = cgroup_cs(cont); | 2098 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); |
2009 | list_add_tail(&child->stack_list, queue); | 2099 | mutex_unlock(&callback_mutex); |
2100 | update_tasks_cpumask(cs, NULL); | ||
2101 | } | ||
2102 | |||
2103 | /* remove offline mems from @cs */ | ||
2104 | if (!nodes_empty(off_mems)) { | ||
2105 | tmp_mems = cs->mems_allowed; | ||
2106 | mutex_lock(&callback_mutex); | ||
2107 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | ||
2108 | mutex_unlock(&callback_mutex); | ||
2109 | update_tasks_nodemask(cs, &tmp_mems, NULL); | ||
2010 | } | 2110 | } |
2011 | 2111 | ||
2012 | return cp; | 2112 | is_empty = cpumask_empty(cs->cpus_allowed) || |
2113 | nodes_empty(cs->mems_allowed); | ||
2114 | |||
2115 | mutex_unlock(&cpuset_mutex); | ||
2116 | |||
2117 | /* | ||
2118 | * If @cs became empty, move tasks to the nearest ancestor with | ||
2119 | * execution resources. This is full cgroup operation which will | ||
2120 | * also call back into cpuset. Should be done outside any lock. | ||
2121 | */ | ||
2122 | if (is_empty) | ||
2123 | remove_tasks_in_empty_cpuset(cs); | ||
2124 | |||
2125 | /* the following may free @cs, should be the last operation */ | ||
2126 | css_put(&cs->css); | ||
2013 | } | 2127 | } |
2014 | 2128 | ||
2129 | /** | ||
2130 | * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset | ||
2131 | * @cs: cpuset of interest | ||
2132 | * | ||
2133 | * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and | ||
2134 | * memory masks according to top_cpuset. | ||
2135 | */ | ||
2136 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | ||
2137 | { | ||
2138 | /* | ||
2139 | * Pin @cs. The refcnt will be released when the work item | ||
2140 | * finishes executing. | ||
2141 | */ | ||
2142 | if (!css_tryget(&cs->css)) | ||
2143 | return; | ||
2015 | 2144 | ||
2016 | /* | 2145 | /* |
2017 | * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory | 2146 | * Queue @cs->hotplug_work. If already pending, lose the css ref. |
2018 | * online/offline) and update the cpusets accordingly. | 2147 | * cpuset_propagate_hotplug_wq is ordered and propagation will |
2019 | * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such | 2148 | * happen in the order this function is called. |
2020 | * cpuset must be moved to a parent cpuset. | 2149 | */ |
2150 | if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) | ||
2151 | css_put(&cs->css); | ||
2152 | } | ||
2153 | |||
2154 | /** | ||
2155 | * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset | ||
2021 | * | 2156 | * |
2022 | * Called with cgroup_mutex held. We take callback_mutex to modify | 2157 | * This function is called after either CPU or memory configuration has |
2023 | * cpus_allowed and mems_allowed. | 2158 | * changed and updates cpuset accordingly. The top_cpuset is always |
2159 | * synchronized to cpu_active_mask and N_MEMORY, which is necessary in | ||
2160 | * order to make cpusets transparent (of no effect) on systems that are | ||
2161 | * actively using CPU hotplug but making no active use of cpusets. | ||
2024 | * | 2162 | * |
2025 | * This walk processes the tree from top to bottom, completing one layer | 2163 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
2026 | * before dropping down to the next. It always processes a node before | 2164 | * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all |
2027 | * any of its children. | 2165 | * descendants. |
2028 | * | 2166 | * |
2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY | 2167 | * Note that CPU offlining during suspend is ignored. We don't modify |
2030 | * if all present pages from a node are offlined. | 2168 | * cpusets across suspend/resume cycles at all. |
2031 | */ | 2169 | */ |
2032 | static void | 2170 | static void cpuset_hotplug_workfn(struct work_struct *work) |
2033 | scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | ||
2034 | { | 2171 | { |
2035 | LIST_HEAD(queue); | 2172 | static cpumask_t new_cpus, tmp_cpus; |
2036 | struct cpuset *cp; /* scans cpusets being updated */ | 2173 | static nodemask_t new_mems, tmp_mems; |
2037 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2174 | bool cpus_updated, mems_updated; |
2175 | bool cpus_offlined, mems_offlined; | ||
2038 | 2176 | ||
2039 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2177 | mutex_lock(&cpuset_mutex); |
2040 | 2178 | ||
2041 | switch (event) { | 2179 | /* fetch the available cpus/mems and find out which changed how */ |
2042 | case CPUSET_CPU_OFFLINE: | 2180 | cpumask_copy(&new_cpus, cpu_active_mask); |
2043 | while ((cp = cpuset_next(&queue)) != NULL) { | 2181 | new_mems = node_states[N_MEMORY]; |
2044 | 2182 | ||
2045 | /* Continue past cpusets with all cpus online */ | 2183 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); |
2046 | if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) | 2184 | cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, |
2047 | continue; | 2185 | &new_cpus); |
2048 | 2186 | ||
2049 | /* Remove offline cpus from this cpuset. */ | 2187 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); |
2050 | mutex_lock(&callback_mutex); | 2188 | nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); |
2051 | cpumask_and(cp->cpus_allowed, cp->cpus_allowed, | 2189 | mems_offlined = !nodes_empty(tmp_mems); |
2052 | cpu_active_mask); | ||
2053 | mutex_unlock(&callback_mutex); | ||
2054 | 2190 | ||
2055 | /* Move tasks from the empty cpuset to a parent */ | 2191 | /* synchronize cpus_allowed to cpu_active_mask */ |
2056 | if (cpumask_empty(cp->cpus_allowed)) | 2192 | if (cpus_updated) { |
2057 | remove_tasks_in_empty_cpuset(cp); | 2193 | mutex_lock(&callback_mutex); |
2058 | else | 2194 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); |
2059 | update_tasks_cpumask(cp, NULL); | 2195 | mutex_unlock(&callback_mutex); |
2060 | } | 2196 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
2061 | break; | 2197 | } |
2062 | 2198 | ||
2063 | case CPUSET_MEM_OFFLINE: | 2199 | /* synchronize mems_allowed to N_MEMORY */ |
2064 | while ((cp = cpuset_next(&queue)) != NULL) { | 2200 | if (mems_updated) { |
2201 | tmp_mems = top_cpuset.mems_allowed; | ||
2202 | mutex_lock(&callback_mutex); | ||
2203 | top_cpuset.mems_allowed = new_mems; | ||
2204 | mutex_unlock(&callback_mutex); | ||
2205 | update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); | ||
2206 | } | ||
2065 | 2207 | ||
2066 | /* Continue past cpusets with all mems online */ | 2208 | /* if cpus or mems went down, we need to propagate to descendants */ |
2067 | if (nodes_subset(cp->mems_allowed, | 2209 | if (cpus_offlined || mems_offlined) { |
2068 | node_states[N_MEMORY])) | 2210 | struct cpuset *cs; |
2069 | continue; | 2211 | struct cgroup *pos_cgrp; |
2070 | 2212 | ||
2071 | oldmems = cp->mems_allowed; | 2213 | rcu_read_lock(); |
2214 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) | ||
2215 | schedule_cpuset_propagate_hotplug(cs); | ||
2216 | rcu_read_unlock(); | ||
2217 | } | ||
2072 | 2218 | ||
2073 | /* Remove offline mems from this cpuset. */ | 2219 | mutex_unlock(&cpuset_mutex); |
2074 | mutex_lock(&callback_mutex); | ||
2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, | ||
2076 | node_states[N_MEMORY]); | ||
2077 | mutex_unlock(&callback_mutex); | ||
2078 | 2220 | ||
2079 | /* Move tasks from the empty cpuset to a parent */ | 2221 | /* wait for propagations to finish */ |
2080 | if (nodes_empty(cp->mems_allowed)) | 2222 | flush_workqueue(cpuset_propagate_hotplug_wq); |
2081 | remove_tasks_in_empty_cpuset(cp); | 2223 | |
2082 | else | 2224 | /* rebuild sched domains if cpus_allowed has changed */ |
2083 | update_tasks_nodemask(cp, &oldmems, NULL); | 2225 | if (cpus_updated) { |
2084 | } | 2226 | struct sched_domain_attr *attr; |
2227 | cpumask_var_t *doms; | ||
2228 | int ndoms; | ||
2229 | |||
2230 | mutex_lock(&cpuset_mutex); | ||
2231 | ndoms = generate_sched_domains(&doms, &attr); | ||
2232 | mutex_unlock(&cpuset_mutex); | ||
2233 | |||
2234 | partition_sched_domains(ndoms, doms, attr); | ||
2085 | } | 2235 | } |
2086 | } | 2236 | } |
2087 | 2237 | ||
2088 | /* | ||
2089 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | ||
2090 | * period. This is necessary in order to make cpusets transparent | ||
2091 | * (of no effect) on systems that are actively using CPU hotplug | ||
2092 | * but making no active use of cpusets. | ||
2093 | * | ||
2094 | * The only exception to this is suspend/resume, where we don't | ||
2095 | * modify cpusets at all. | ||
2096 | * | ||
2097 | * This routine ensures that top_cpuset.cpus_allowed tracks | ||
2098 | * cpu_active_mask on each CPU hotplug (cpuhp) event. | ||
2099 | * | ||
2100 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
2101 | * before calling generate_sched_domains(). | ||
2102 | * | ||
2103 | * @cpu_online: Indicates whether this is a CPU online event (true) or | ||
2104 | * a CPU offline event (false). | ||
2105 | */ | ||
2106 | void cpuset_update_active_cpus(bool cpu_online) | 2238 | void cpuset_update_active_cpus(bool cpu_online) |
2107 | { | 2239 | { |
2108 | struct sched_domain_attr *attr; | 2240 | /* |
2109 | cpumask_var_t *doms; | 2241 | * We're inside cpu hotplug critical region which usually nests |
2110 | int ndoms; | 2242 | * inside cgroup synchronization. Bounce actual hotplug processing |
2111 | 2243 | * to a work item to avoid reverse locking order. | |
2112 | cgroup_lock(); | 2244 | * |
2113 | mutex_lock(&callback_mutex); | 2245 | * We still need to do partition_sched_domains() synchronously; |
2114 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2246 | * otherwise, the scheduler will get confused and put tasks to the |
2115 | mutex_unlock(&callback_mutex); | 2247 | * dead CPU. Fall back to the default single domain. |
2116 | 2248 | * cpuset_hotplug_workfn() will rebuild it as necessary. | |
2117 | if (!cpu_online) | 2249 | */ |
2118 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); | 2250 | partition_sched_domains(1, NULL, NULL); |
2119 | 2251 | schedule_work(&cpuset_hotplug_work); | |
2120 | ndoms = generate_sched_domains(&doms, &attr); | ||
2121 | cgroup_unlock(); | ||
2122 | |||
2123 | /* Have scheduler rebuild the domains */ | ||
2124 | partition_sched_domains(ndoms, doms, attr); | ||
2125 | } | 2252 | } |
2126 | 2253 | ||
2127 | #ifdef CONFIG_MEMORY_HOTPLUG | 2254 | #ifdef CONFIG_MEMORY_HOTPLUG |
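The comment added to cpuset_update_active_cpus() above is the heart of this cpuset rework: the hotplug callback runs inside the CPU-hotplug critical section, so it cannot take the cgroup/cpuset locks directly and instead bounces the heavy lifting to a work item. A minimal sketch of that defer-to-workqueue pattern, with illustrative names rather than the real cpuset symbols:

    #include <linux/workqueue.h>

    static void hotplug_workfn(struct work_struct *work)
    {
            /* process context, outside the hotplug critical section:
             * safe to take the subsystem mutex and do the slow work */
    }
    static DECLARE_WORK(hotplug_work, hotplug_workfn);

    static void hotplug_callback(void)
    {
            /* called with the hotplug lock held: do only what must be
             * synchronous here, then defer the rest */
            schedule_work(&hotplug_work);
    }

Callers that must wait for the deferred part can then use flush_work()/flush_workqueue(), which is exactly what the new cpuset code does before rebuilding sched domains.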
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2133 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2260 | static int cpuset_track_online_nodes(struct notifier_block *self, |
2134 | unsigned long action, void *arg) | 2261 | unsigned long action, void *arg) |
2135 | { | 2262 | { |
2136 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2263 | schedule_work(&cpuset_hotplug_work); |
2137 | |||
2138 | cgroup_lock(); | ||
2139 | switch (action) { | ||
2140 | case MEM_ONLINE: | ||
2141 | oldmems = top_cpuset.mems_allowed; | ||
2142 | mutex_lock(&callback_mutex); | ||
2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | ||
2144 | mutex_unlock(&callback_mutex); | ||
2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | ||
2146 | break; | ||
2147 | case MEM_OFFLINE: | ||
2148 | /* | ||
2149 | * needn't update top_cpuset.mems_allowed explicitly because | ||
2150 | * scan_cpusets_upon_hotplug() will update it. | ||
2151 | */ | ||
2152 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); | ||
2153 | break; | ||
2154 | default: | ||
2155 | break; | ||
2156 | } | ||
2157 | cgroup_unlock(); | ||
2158 | |||
2159 | return NOTIFY_OK; | 2264 | return NOTIFY_OK; |
2160 | } | 2265 | } |
2161 | #endif | 2266 | #endif |
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void) | |||
2173 | 2278 | ||
2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2279 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2175 | 2280 | ||
2176 | cpuset_wq = create_singlethread_workqueue("cpuset"); | 2281 | cpuset_propagate_hotplug_wq = |
2177 | BUG_ON(!cpuset_wq); | 2282 | alloc_ordered_workqueue("cpuset_hotplug", 0); |
2283 | BUG_ON(!cpuset_propagate_hotplug_wq); | ||
2178 | } | 2284 | } |
2179 | 2285 | ||
2180 | /** | 2286 | /** |
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
2273 | */ | 2379 | */ |
2274 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2380 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) |
2275 | { | 2381 | { |
2276 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) | 2382 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
2277 | cs = cs->parent; | 2383 | cs = parent_cs(cs); |
2278 | return cs; | 2384 | return cs; |
2279 | } | 2385 | } |
2280 | 2386 | ||
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | |||
2412 | } | 2518 | } |
2413 | 2519 | ||
2414 | /** | 2520 | /** |
2415 | * cpuset_unlock - release lock on cpuset changes | ||
2416 | * | ||
2417 | * Undo the lock taken in a previous cpuset_lock() call. | ||
2418 | */ | ||
2419 | |||
2420 | void cpuset_unlock(void) | ||
2421 | { | ||
2422 | mutex_unlock(&callback_mutex); | ||
2423 | } | ||
2424 | |||
2425 | /** | ||
2426 | * cpuset_mem_spread_node() - On which node to begin search for a file page | 2521 | * cpuset_mem_spread_node() - On which node to begin search for a file page |
2427 | * cpuset_slab_spread_node() - On which node to begin search for a slab page | 2522 | * cpuset_slab_spread_node() - On which node to begin search for a slab page |
2428 | * | 2523 | * |
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) | |||
2511 | 2606 | ||
2512 | dentry = task_cs(tsk)->css.cgroup->dentry; | 2607 | dentry = task_cs(tsk)->css.cgroup->dentry; |
2513 | spin_lock(&cpuset_buffer_lock); | 2608 | spin_lock(&cpuset_buffer_lock); |
2514 | snprintf(cpuset_name, CPUSET_NAME_LEN, | 2609 | |
2515 | dentry ? (const char *)dentry->d_name.name : "/"); | 2610 | if (!dentry) { |
2611 | strcpy(cpuset_name, "/"); | ||
2612 | } else { | ||
2613 | spin_lock(&dentry->d_lock); | ||
2614 | strlcpy(cpuset_name, (const char *)dentry->d_name.name, | ||
2615 | CPUSET_NAME_LEN); | ||
2616 | spin_unlock(&dentry->d_lock); | ||
2617 | } | ||
2618 | |||
2516 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | 2619 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, |
2517 | tsk->mems_allowed); | 2620 | tsk->mems_allowed); |
2518 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", | 2621 | printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", |
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void) | |||
2560 | * - Used for /proc/<pid>/cpuset. | 2663 | * - Used for /proc/<pid>/cpuset. |
2561 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2664 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
2562 | * doesn't really matter if tsk->cpuset changes after we read it, | 2665 | * doesn't really matter if tsk->cpuset changes after we read it, |
2563 | * and we take cgroup_mutex, keeping cpuset_attach() from changing it | 2666 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
2564 | * anyway. | 2667 | * anyway. |
2565 | */ | 2668 | */ |
2566 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2669 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) |
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
2582 | if (!tsk) | 2685 | if (!tsk) |
2583 | goto out_free; | 2686 | goto out_free; |
2584 | 2687 | ||
2585 | retval = -EINVAL; | 2688 | rcu_read_lock(); |
2586 | cgroup_lock(); | ||
2587 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2689 | css = task_subsys_state(tsk, cpuset_subsys_id); |
2588 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2690 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
2691 | rcu_read_unlock(); | ||
2589 | if (retval < 0) | 2692 | if (retval < 0) |
2590 | goto out_unlock; | 2693 | goto out_put_task; |
2591 | seq_puts(m, buf); | 2694 | seq_puts(m, buf); |
2592 | seq_putc(m, '\n'); | 2695 | seq_putc(m, '\n'); |
2593 | out_unlock: | 2696 | out_put_task: |
2594 | cgroup_unlock(); | ||
2595 | put_task_struct(tsk); | 2697 | put_task_struct(tsk); |
2596 | out_free: | 2698 | out_free: |
2597 | kfree(buf); | 2699 | kfree(buf); |
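One hunk above, in cpuset_print_task_mems_allowed(), is a classic format-string fix: the old snprintf() used the cpuset's dentry name itself as the format, so a name containing '%' conversions would be interpreted; the new code copies the name under d_lock with strlcpy() instead. A small userspace illustration of the same pitfall (the buffer and name here are made up):

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            const char *name = "cpuset-%s-%x";   /* hostile-looking name */

            /* wrong: the untrusted name becomes the format string
             *   snprintf(buf, sizeof(buf), name);
             * right: always supply an explicit format */
            snprintf(buf, sizeof(buf), "%s", name);
            puts(buf);
            return 0;
    }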
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc8..c26278fd4851 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -29,6 +29,7 @@ | |||
29 | */ | 29 | */ |
30 | #include <linux/pid_namespace.h> | 30 | #include <linux/pid_namespace.h> |
31 | #include <linux/clocksource.h> | 31 | #include <linux/clocksource.h> |
32 | #include <linux/serial_core.h> | ||
32 | #include <linux/interrupt.h> | 33 | #include <linux/interrupt.h> |
33 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
34 | #include <linux/console.h> | 35 | #include <linux/console.h> |
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 3494c28a7e7a..2235967e78b0 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h | |||
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode; | |||
72 | #ifdef CONFIG_KGDB_KDB | 72 | #ifdef CONFIG_KGDB_KDB |
73 | extern int kdb_stub(struct kgdb_state *ks); | 73 | extern int kdb_stub(struct kgdb_state *ks); |
74 | extern int kdb_parse(const char *cmdstr); | 74 | extern int kdb_parse(const char *cmdstr); |
75 | extern int kdb_common_init_state(struct kgdb_state *ks); | ||
76 | extern int kdb_common_deinit_state(void); | ||
75 | #else /* ! CONFIG_KGDB_KDB */ | 77 | #else /* ! CONFIG_KGDB_KDB */ |
76 | static inline int kdb_stub(struct kgdb_state *ks) | 78 | static inline int kdb_stub(struct kgdb_state *ks) |
77 | { | 79 | { |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..19d9a578c753 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/kernel.h> | 31 | #include <linux/kernel.h> |
32 | #include <linux/kgdb.h> | 32 | #include <linux/kgdb.h> |
33 | #include <linux/kdb.h> | 33 | #include <linux/kdb.h> |
34 | #include <linux/serial_core.h> | ||
34 | #include <linux/reboot.h> | 35 | #include <linux/reboot.h> |
35 | #include <linux/uaccess.h> | 36 | #include <linux/uaccess.h> |
36 | #include <asm/cacheflush.h> | 37 | #include <asm/cacheflush.h> |
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
782 | len = len / 2; | 783 | len = len / 2; |
783 | remcom_out_buffer[len++] = 0; | 784 | remcom_out_buffer[len++] = 0; |
784 | 785 | ||
786 | kdb_common_init_state(ks); | ||
785 | kdb_parse(remcom_out_buffer); | 787 | kdb_parse(remcom_out_buffer); |
788 | kdb_common_deinit_state(); | ||
789 | |||
786 | strcpy(remcom_out_buffer, "OK"); | 790 | strcpy(remcom_out_buffer, "OK"); |
787 | } | 791 | } |
788 | break; | 792 | break; |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 8418c2f8ec5d..70a504601dc3 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv) | |||
486 | /* | 486 | /* |
487 | * kdb_ss | 487 | * kdb_ss |
488 | * | 488 | * |
489 | * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) | 489 | * Process the 'ss' (Single Step) command. |
490 | * commands. | ||
491 | * | 490 | * |
492 | * ss | 491 | * ss |
493 | * ssb | ||
494 | * | 492 | * |
495 | * Parameters: | 493 | * Parameters: |
496 | * argc Argument count | 494 | * argc Argument count |
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv) | |||
498 | * Outputs: | 496 | * Outputs: |
499 | * None. | 497 | * None. |
500 | * Returns: | 498 | * Returns: |
501 | * KDB_CMD_SS[B] for success, a kdb error if failure. | 499 | * KDB_CMD_SS for success, a kdb error if failure. |
502 | * Locking: | 500 | * Locking: |
503 | * None. | 501 | * None. |
504 | * Remarks: | 502 | * Remarks: |
505 | * | 503 | * |
506 | * Set the arch specific option to trigger a debug trap after the next | 504 | * Set the arch specific option to trigger a debug trap after the next |
507 | * instruction. | 505 | * instruction. |
508 | * | ||
509 | * For 'ssb', set the trace flag in the debug trap handler | ||
510 | * after printing the current insn and return directly without | ||
511 | * invoking the kdb command processor, until a branch instruction | ||
512 | * is encountered. | ||
513 | */ | 506 | */ |
514 | 507 | ||
515 | static int kdb_ss(int argc, const char **argv) | 508 | static int kdb_ss(int argc, const char **argv) |
516 | { | 509 | { |
517 | int ssb = 0; | ||
518 | |||
519 | ssb = (strcmp(argv[0], "ssb") == 0); | ||
520 | if (argc != 0) | 510 | if (argc != 0) |
521 | return KDB_ARGCOUNT; | 511 | return KDB_ARGCOUNT; |
522 | /* | 512 | /* |
523 | * Set trace flag and go. | 513 | * Set trace flag and go. |
524 | */ | 514 | */ |
525 | KDB_STATE_SET(DOING_SS); | 515 | KDB_STATE_SET(DOING_SS); |
526 | if (ssb) { | ||
527 | KDB_STATE_SET(DOING_SSB); | ||
528 | return KDB_CMD_SSB; | ||
529 | } | ||
530 | return KDB_CMD_SS; | 516 | return KDB_CMD_SS; |
531 | } | 517 | } |
532 | 518 | ||
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void) | |||
561 | 547 | ||
562 | kdb_register_repeat("ss", kdb_ss, "", | 548 | kdb_register_repeat("ss", kdb_ss, "", |
563 | "Single Step", 1, KDB_REPEAT_NO_ARGS); | 549 | "Single Step", 1, KDB_REPEAT_NO_ARGS); |
564 | kdb_register_repeat("ssb", kdb_ss, "", | ||
565 | "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); | ||
566 | /* | 550 | /* |
567 | * Architecture dependent initialization. | 551 | * Architecture dependent initialization. |
568 | */ | 552 | */ |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index be7b33b73d30..328d18ef31e4 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx); | |||
34 | 34 | ||
35 | static struct kgdb_state *kdb_ks; | 35 | static struct kgdb_state *kdb_ks; |
36 | 36 | ||
37 | int kdb_common_init_state(struct kgdb_state *ks) | ||
38 | { | ||
39 | kdb_initial_cpu = atomic_read(&kgdb_active); | ||
40 | kdb_current_task = kgdb_info[ks->cpu].task; | ||
41 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; | ||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | int kdb_common_deinit_state(void) | ||
46 | { | ||
47 | kdb_initial_cpu = -1; | ||
48 | kdb_current_task = NULL; | ||
49 | kdb_current_regs = NULL; | ||
50 | return 0; | ||
51 | } | ||
52 | |||
37 | int kdb_stub(struct kgdb_state *ks) | 53 | int kdb_stub(struct kgdb_state *ks) |
38 | { | 54 | { |
39 | int error = 0; | 55 | int error = 0; |
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks) | |||
94 | } | 110 | } |
95 | /* Set initial kdb state variables */ | 111 | /* Set initial kdb state variables */ |
96 | KDB_STATE_CLEAR(KGDB_TRANS); | 112 | KDB_STATE_CLEAR(KGDB_TRANS); |
97 | kdb_initial_cpu = atomic_read(&kgdb_active); | 113 | kdb_common_init_state(ks); |
98 | kdb_current_task = kgdb_info[ks->cpu].task; | ||
99 | kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; | ||
100 | /* Remove any breakpoints as needed by kdb and clear single step */ | 114 | /* Remove any breakpoints as needed by kdb and clear single step */ |
101 | kdb_bp_remove(); | 115 | kdb_bp_remove(); |
102 | KDB_STATE_CLEAR(DOING_SS); | 116 | KDB_STATE_CLEAR(DOING_SS); |
103 | KDB_STATE_CLEAR(DOING_SSB); | ||
104 | KDB_STATE_SET(PAGER); | 117 | KDB_STATE_SET(PAGER); |
105 | /* zero out any offline cpu data */ | 118 | /* zero out any offline cpu data */ |
106 | for_each_present_cpu(i) { | 119 | for_each_present_cpu(i) { |
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
125 | * Upon exit from the kdb main loop set up breakpoints and restart | 138 | * Upon exit from the kdb main loop set up breakpoints and restart |
126 | * the system based on the requested continue state | 139 | * the system based on the requested continue state |
127 | */ | 140 | */ |
128 | kdb_initial_cpu = -1; | 141 | kdb_common_deinit_state(); |
129 | kdb_current_task = NULL; | ||
130 | kdb_current_regs = NULL; | ||
131 | KDB_STATE_CLEAR(PAGER); | 142 | KDB_STATE_CLEAR(PAGER); |
132 | kdbnearsym_cleanup(); | 143 | kdbnearsym_cleanup(); |
133 | if (error == KDB_CMD_KGDB) { | 144 | if (error == KDB_CMD_KGDB) { |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 8875254120b6..00eb8f7fbf41 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = { | |||
124 | }; | 124 | }; |
125 | #undef KDBMSG | 125 | #undef KDBMSG |
126 | 126 | ||
127 | static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); | 127 | static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); |
128 | 128 | ||
129 | 129 | ||
130 | /* | 130 | /* |
@@ -175,7 +175,7 @@ static char *__env[] = { | |||
175 | (char *)0, | 175 | (char *)0, |
176 | }; | 176 | }; |
177 | 177 | ||
178 | static const int __nenv = (sizeof(__env) / sizeof(char *)); | 178 | static const int __nenv = ARRAY_SIZE(__env); |
179 | 179 | ||
180 | struct task_struct *kdb_curr_task(int cpu) | 180 | struct task_struct *kdb_curr_task(int cpu) |
181 | { | 181 | { |
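The two hunks above replace open-coded sizeof arithmetic with ARRAY_SIZE(), which reads better and, in the kernel's definition, refuses to compile when handed a pointer instead of a true array. A freestanding C equivalent (without that extra type check), for illustration:

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const char *msgs[] = { "ok", "bad args", "not found" };

    int main(void)
    {
            printf("%zu entries\n", ARRAY_SIZE(msgs));   /* prints 3 */
            return 0;
    }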
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv) | |||
681 | } | 681 | } |
682 | if (argc != 3) | 682 | if (argc != 3) |
683 | return KDB_ARGCOUNT; | 683 | return KDB_ARGCOUNT; |
684 | defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), | 684 | if (in_dbg_master()) { |
685 | GFP_KDB); | 685 | kdb_printf("Command only available during kdb_init()\n"); |
686 | if (!defcmd_set) { | ||
687 | kdb_printf("Could not allocate new defcmd_set entry for %s\n", | ||
688 | argv[1]); | ||
689 | defcmd_set = save_defcmd_set; | ||
690 | return KDB_NOTIMP; | 686 | return KDB_NOTIMP; |
691 | } | 687 | } |
688 | defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), | ||
689 | GFP_KDB); | ||
690 | if (!defcmd_set) | ||
691 | goto fail_defcmd; | ||
692 | memcpy(defcmd_set, save_defcmd_set, | 692 | memcpy(defcmd_set, save_defcmd_set, |
693 | defcmd_set_count * sizeof(*defcmd_set)); | 693 | defcmd_set_count * sizeof(*defcmd_set)); |
694 | kfree(save_defcmd_set); | ||
695 | s = defcmd_set + defcmd_set_count; | 694 | s = defcmd_set + defcmd_set_count; |
696 | memset(s, 0, sizeof(*s)); | 695 | memset(s, 0, sizeof(*s)); |
697 | s->usable = 1; | 696 | s->usable = 1; |
698 | s->name = kdb_strdup(argv[1], GFP_KDB); | 697 | s->name = kdb_strdup(argv[1], GFP_KDB); |
698 | if (!s->name) | ||
699 | goto fail_name; | ||
699 | s->usage = kdb_strdup(argv[2], GFP_KDB); | 700 | s->usage = kdb_strdup(argv[2], GFP_KDB); |
701 | if (!s->usage) | ||
702 | goto fail_usage; | ||
700 | s->help = kdb_strdup(argv[3], GFP_KDB); | 703 | s->help = kdb_strdup(argv[3], GFP_KDB); |
704 | if (!s->help) | ||
705 | goto fail_help; | ||
701 | if (s->usage[0] == '"') { | 706 | if (s->usage[0] == '"') { |
702 | strcpy(s->usage, s->usage+1); | 707 | strcpy(s->usage, argv[2]+1); |
703 | s->usage[strlen(s->usage)-1] = '\0'; | 708 | s->usage[strlen(s->usage)-1] = '\0'; |
704 | } | 709 | } |
705 | if (s->help[0] == '"') { | 710 | if (s->help[0] == '"') { |
706 | strcpy(s->help, s->help+1); | 711 | strcpy(s->help, argv[3]+1); |
707 | s->help[strlen(s->help)-1] = '\0'; | 712 | s->help[strlen(s->help)-1] = '\0'; |
708 | } | 713 | } |
709 | ++defcmd_set_count; | 714 | ++defcmd_set_count; |
710 | defcmd_in_progress = 1; | 715 | defcmd_in_progress = 1; |
716 | kfree(save_defcmd_set); | ||
711 | return 0; | 717 | return 0; |
718 | fail_help: | ||
719 | kfree(s->usage); | ||
720 | fail_usage: | ||
721 | kfree(s->name); | ||
722 | fail_name: | ||
723 | kfree(defcmd_set); | ||
724 | fail_defcmd: | ||
725 | kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]); | ||
726 | defcmd_set = save_defcmd_set; | ||
727 | return KDB_NOTIMP; | ||
712 | } | 728 | } |
713 | 729 | ||
714 | /* | 730 | /* |
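The kdb_defcmd() rework above converts ad-hoc cleanup into the usual kernel goto-unwind ladder: each allocation gets a label named after the step that failed, and a failure jumps to the label that frees everything allocated so far, in reverse order. A compact standalone example of the shape (names are generic, not kdb's):

    #include <stdlib.h>
    #include <string.h>

    struct cmd { char *name; char *usage; char *help; };

    static struct cmd *cmd_create(const char *n, const char *u, const char *h)
    {
            struct cmd *c = malloc(sizeof(*c));

            if (!c)
                    goto fail_cmd;
            c->name = strdup(n);
            if (!c->name)
                    goto fail_name;
            c->usage = strdup(u);
            if (!c->usage)
                    goto fail_usage;
            c->help = strdup(h);
            if (!c->help)
                    goto fail_help;
            return c;

    fail_help:
            free(c->usage);    /* fall through frees the earlier pieces */
    fail_usage:
            free(c->name);
    fail_name:
            free(c);
    fail_cmd:
            return NULL;
    }

    int main(void)
    {
            struct cmd *c = cmd_create("md", "<vaddr>", "Display Memory Contents");

            if (c) {
                    free(c->help);
                    free(c->usage);
                    free(c->name);
                    free(c);
            }
            return 0;
    }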
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p) | |||
1112 | * KDB_CMD_GO User typed 'go'. | 1128 | * KDB_CMD_GO User typed 'go'. |
1113 | * KDB_CMD_CPU User switched to another cpu. | 1129 | * KDB_CMD_CPU User switched to another cpu. |
1114 | * KDB_CMD_SS Single step. | 1130 | * KDB_CMD_SS Single step. |
1115 | * KDB_CMD_SSB Single step until branch. | ||
1116 | */ | 1131 | */ |
1117 | static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | 1132 | static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, |
1118 | kdb_dbtrap_t db_result) | 1133 | kdb_dbtrap_t db_result) |
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
1151 | kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", | 1166 | kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", |
1152 | instruction_pointer(regs)); | 1167 | instruction_pointer(regs)); |
1153 | break; | 1168 | break; |
1154 | case KDB_DB_SSB: | ||
1155 | /* | ||
1156 | * In the midst of ssb command. Just return. | ||
1157 | */ | ||
1158 | KDB_DEBUG_STATE("kdb_local 3", reason); | ||
1159 | return KDB_CMD_SSB; /* Continue with SSB command */ | ||
1160 | |||
1161 | break; | ||
1162 | case KDB_DB_SS: | 1169 | case KDB_DB_SS: |
1163 | break; | 1170 | break; |
1164 | case KDB_DB_SSBPT: | 1171 | case KDB_DB_SSBPT: |
@@ -1281,7 +1288,6 @@ do_full_getstr: | |||
1281 | if (diag == KDB_CMD_GO | 1288 | if (diag == KDB_CMD_GO |
1282 | || diag == KDB_CMD_CPU | 1289 | || diag == KDB_CMD_CPU |
1283 | || diag == KDB_CMD_SS | 1290 | || diag == KDB_CMD_SS |
1284 | || diag == KDB_CMD_SSB | ||
1285 | || diag == KDB_CMD_KGDB) | 1291 | || diag == KDB_CMD_KGDB) |
1286 | break; | 1292 | break; |
1287 | 1293 | ||
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, | |||
1368 | break; | 1374 | break; |
1369 | } | 1375 | } |
1370 | 1376 | ||
1371 | if (result == KDB_CMD_SSB) { | ||
1372 | KDB_STATE_SET(DOING_SS); | ||
1373 | KDB_STATE_SET(DOING_SSB); | ||
1374 | break; | ||
1375 | } | ||
1376 | |||
1377 | if (result == KDB_CMD_KGDB) { | 1377 | if (result == KDB_CMD_KGDB) { |
1378 | if (!KDB_STATE(DOING_KGDB)) | 1378 | if (!KDB_STATE(DOING_KGDB)) |
1379 | kdb_printf("Entering please attach debugger " | 1379 | kdb_printf("Entering please attach debugger " |
@@ -2350,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv) | |||
2350 | return 0; | 2350 | return 0; |
2351 | } | 2351 | } |
2352 | 2352 | ||
2353 | /* | ||
2354 | * kdb_ll - This function implements the 'll' command which follows a | ||
2355 | * linked list and executes an arbitrary command for each | ||
2356 | * element. | ||
2357 | */ | ||
2358 | static int kdb_ll(int argc, const char **argv) | ||
2359 | { | ||
2360 | int diag = 0; | ||
2361 | unsigned long addr; | ||
2362 | long offset = 0; | ||
2363 | unsigned long va; | ||
2364 | unsigned long linkoffset; | ||
2365 | int nextarg; | ||
2366 | const char *command; | ||
2367 | |||
2368 | if (argc != 3) | ||
2369 | return KDB_ARGCOUNT; | ||
2370 | |||
2371 | nextarg = 1; | ||
2372 | diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); | ||
2373 | if (diag) | ||
2374 | return diag; | ||
2375 | |||
2376 | diag = kdbgetularg(argv[2], &linkoffset); | ||
2377 | if (diag) | ||
2378 | return diag; | ||
2379 | |||
2380 | /* | ||
2381 | * Using the starting address as | ||
2382 | * the first element in the list, and assuming that | ||
2383 | * the list ends with a null pointer. | ||
2384 | */ | ||
2385 | |||
2386 | va = addr; | ||
2387 | command = kdb_strdup(argv[3], GFP_KDB); | ||
2388 | if (!command) { | ||
2389 | kdb_printf("%s: cannot duplicate command\n", __func__); | ||
2390 | return 0; | ||
2391 | } | ||
2392 | /* Recursive use of kdb_parse, do not use argv after this point */ | ||
2393 | argv = NULL; | ||
2394 | |||
2395 | while (va) { | ||
2396 | char buf[80]; | ||
2397 | |||
2398 | if (KDB_FLAG(CMD_INTERRUPT)) | ||
2399 | goto out; | ||
2400 | |||
2401 | sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); | ||
2402 | diag = kdb_parse(buf); | ||
2403 | if (diag) | ||
2404 | goto out; | ||
2405 | |||
2406 | addr = va + linkoffset; | ||
2407 | if (kdb_getword(&va, addr, sizeof(va))) | ||
2408 | goto out; | ||
2409 | } | ||
2410 | |||
2411 | out: | ||
2412 | kfree(command); | ||
2413 | return diag; | ||
2414 | } | ||
2415 | |||
2416 | static int kdb_kgdb(int argc, const char **argv) | 2353 | static int kdb_kgdb(int argc, const char **argv) |
2417 | { | 2354 | { |
2418 | return KDB_CMD_KGDB; | 2355 | return KDB_CMD_KGDB; |
@@ -2430,11 +2367,15 @@ static int kdb_help(int argc, const char **argv) | |||
2430 | kdb_printf("-----------------------------" | 2367 | kdb_printf("-----------------------------" |
2431 | "-----------------------------\n"); | 2368 | "-----------------------------\n"); |
2432 | for_each_kdbcmd(kt, i) { | 2369 | for_each_kdbcmd(kt, i) { |
2433 | if (kt->cmd_name) | 2370 | char *space = ""; |
2434 | kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, | ||
2435 | kt->cmd_usage, kt->cmd_help); | ||
2436 | if (KDB_FLAG(CMD_INTERRUPT)) | 2371 | if (KDB_FLAG(CMD_INTERRUPT)) |
2437 | return 0; | 2372 | return 0; |
2373 | if (!kt->cmd_name) | ||
2374 | continue; | ||
2375 | if (strlen(kt->cmd_usage) > 20) | ||
2376 | space = "\n "; | ||
2377 | kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, | ||
2378 | kt->cmd_usage, space, kt->cmd_help); | ||
2438 | } | 2379 | } |
2439 | return 0; | 2380 | return 0; |
2440 | } | 2381 | } |
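The kdb_help() change above keeps the three help columns aligned by pushing the description onto an indented continuation line whenever the usage string overflows its 20-character field. The same idea in plain C, with commands taken from the table below as sample data:

    #include <stdio.h>
    #include <string.h>

    static void print_row(const char *name, const char *usage, const char *help)
    {
            /* 36 spaces lines the wrapped help up under its column */
            const char *sep = strlen(usage) > 20 ?
                    "\n                                    " : "";

            printf("%-15.15s %-20s%s%s\n", name, usage, sep, help);
    }

    int main(void)
    {
            print_row("btp", "<pid>", "Display stack for process <pid>");
            print_row("bta", "[D|R|S|T|C|Z|E|U|I|M|A]",
                      "Backtrace all processes matching state flag");
            return 0;
    }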
@@ -2739,7 +2680,7 @@ int kdb_register_repeat(char *cmd, | |||
2739 | (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); | 2680 | (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); |
2740 | kfree(kdb_commands); | 2681 | kfree(kdb_commands); |
2741 | } | 2682 | } |
2742 | memset(new + kdb_max_commands, 0, | 2683 | memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0, |
2743 | kdb_command_extend * sizeof(*new)); | 2684 | kdb_command_extend * sizeof(*new)); |
2744 | kdb_commands = new; | 2685 | kdb_commands = new; |
2745 | kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; | 2686 | kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; |
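The one-line memset fix above is about where the "new" tail of a grown array starts: the copied entries occupy only (kdb_max_commands - KDB_BASE_CMD_MAX) slots of the freshly allocated buffer, so zeroing must begin at that offset, not at kdb_max_commands. A generic grow-and-zero helper showing the arithmetic (names are illustrative):

    #include <stdlib.h>
    #include <string.h>

    /* grow 'old' from old_n to old_n + extra elements of 'size' bytes,
     * zero-filling only the newly added tail */
    static void *grow_zeroed(void *old, size_t old_n, size_t extra, size_t size)
    {
            char *buf = malloc((old_n + extra) * size);

            if (!buf)
                    return NULL;
            memcpy(buf, old, old_n * size);
            memset(buf + old_n * size, 0, extra * size); /* offset in the NEW buffer */
            free(old);
            return buf;
    }

    int main(void)
    {
            int *a = calloc(4, sizeof(*a));

            if (a)
                    a = grow_zeroed(a, 4, 8, sizeof(*a));
            free(a);
            return 0;
    }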
@@ -2843,15 +2784,13 @@ static void __init kdb_inittab(void) | |||
2843 | "Stack traceback", 1, KDB_REPEAT_NONE); | 2784 | "Stack traceback", 1, KDB_REPEAT_NONE); |
2844 | kdb_register_repeat("btp", kdb_bt, "<pid>", | 2785 | kdb_register_repeat("btp", kdb_bt, "<pid>", |
2845 | "Display stack for process <pid>", 0, KDB_REPEAT_NONE); | 2786 | "Display stack for process <pid>", 0, KDB_REPEAT_NONE); |
2846 | kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", | 2787 | kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", |
2847 | "Display stack all processes", 0, KDB_REPEAT_NONE); | 2788 | "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); |
2848 | kdb_register_repeat("btc", kdb_bt, "", | 2789 | kdb_register_repeat("btc", kdb_bt, "", |
2849 | "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); | 2790 | "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); |
2850 | kdb_register_repeat("btt", kdb_bt, "<vaddr>", | 2791 | kdb_register_repeat("btt", kdb_bt, "<vaddr>", |
2851 | "Backtrace process given its struct task address", 0, | 2792 | "Backtrace process given its struct task address", 0, |
2852 | KDB_REPEAT_NONE); | 2793 | KDB_REPEAT_NONE); |
2853 | kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>", | ||
2854 | "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); | ||
2855 | kdb_register_repeat("env", kdb_env, "", | 2794 | kdb_register_repeat("env", kdb_env, "", |
2856 | "Show environment variables", 0, KDB_REPEAT_NONE); | 2795 | "Show environment variables", 0, KDB_REPEAT_NONE); |
2857 | kdb_register_repeat("set", kdb_set, "", | 2796 | kdb_register_repeat("set", kdb_set, "", |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 392ec6a25844..7afd3c8c41d5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -19,7 +19,6 @@ | |||
19 | #define KDB_CMD_GO (-1001) | 19 | #define KDB_CMD_GO (-1001) |
20 | #define KDB_CMD_CPU (-1002) | 20 | #define KDB_CMD_CPU (-1002) |
21 | #define KDB_CMD_SS (-1003) | 21 | #define KDB_CMD_SS (-1003) |
22 | #define KDB_CMD_SSB (-1004) | ||
23 | #define KDB_CMD_KGDB (-1005) | 22 | #define KDB_CMD_KGDB (-1005) |
24 | 23 | ||
25 | /* Internal debug flags */ | 24 | /* Internal debug flags */ |
@@ -125,8 +124,6 @@ extern int kdb_state; | |||
125 | * kdb control */ | 124 | * kdb control */ |
126 | #define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ | 125 | #define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ |
127 | #define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ | 126 | #define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ |
128 | #define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, | ||
129 | * DOING_SS is also set */ | ||
130 | #define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint | 127 | #define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint |
131 | * after one ss, independent of | 128 | * after one ss, independent of |
132 | * DOING_SS */ | 129 | * DOING_SS */ |
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void); | |||
191 | typedef enum { | 188 | typedef enum { |
192 | KDB_DB_BPT, /* Breakpoint */ | 189 | KDB_DB_BPT, /* Breakpoint */ |
193 | KDB_DB_SS, /* Single-step trap */ | 190 | KDB_DB_SS, /* Single-step trap */ |
194 | KDB_DB_SSB, /* Single step to branch */ | ||
195 | KDB_DB_SSBPT, /* Single step over breakpoint */ | 191 | KDB_DB_SSBPT, /* Single step over breakpoint */ |
196 | KDB_DB_NOBPT /* Spurious breakpoint */ | 192 | KDB_DB_NOBPT /* Spurious breakpoint */ |
197 | } kdb_dbtrap_t; | 193 | } kdb_dbtrap_t; |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 5c75791d7269..59412d037eed 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -3691,7 +3691,7 @@ unlock: | |||
3691 | 3691 | ||
3692 | static int perf_fasync(int fd, struct file *filp, int on) | 3692 | static int perf_fasync(int fd, struct file *filp, int on) |
3693 | { | 3693 | { |
3694 | struct inode *inode = filp->f_path.dentry->d_inode; | 3694 | struct inode *inode = file_inode(filp); |
3695 | struct perf_event *event = filp->private_data; | 3695 | struct perf_event *event = filp->private_data; |
3696 | int retval; | 3696 | int retval; |
3697 | 3697 | ||
@@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) | |||
4434 | if (ctxn < 0) | 4434 | if (ctxn < 0) |
4435 | goto next; | 4435 | goto next; |
4436 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 4436 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
4437 | if (ctx) | ||
4438 | perf_event_task_ctx(ctx, task_event); | ||
4437 | } | 4439 | } |
4438 | if (ctx) | ||
4439 | perf_event_task_ctx(ctx, task_event); | ||
4440 | next: | 4440 | next: |
4441 | put_cpu_ptr(pmu->pmu_cpu_context); | 4441 | put_cpu_ptr(pmu->pmu_cpu_context); |
4442 | } | 4442 | } |
4443 | if (task_event->task_ctx) | ||
4444 | perf_event_task_ctx(task_event->task_ctx, task_event); | ||
4445 | |||
4443 | rcu_read_unlock(); | 4446 | rcu_read_unlock(); |
4444 | } | 4447 | } |
4445 | 4448 | ||
@@ -5126,7 +5129,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
5126 | { | 5129 | { |
5127 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | 5130 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
5128 | struct perf_event *event; | 5131 | struct perf_event *event; |
5129 | struct hlist_node *node; | ||
5130 | struct hlist_head *head; | 5132 | struct hlist_head *head; |
5131 | 5133 | ||
5132 | rcu_read_lock(); | 5134 | rcu_read_lock(); |
@@ -5134,7 +5136,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
5134 | if (!head) | 5136 | if (!head) |
5135 | goto end; | 5137 | goto end; |
5136 | 5138 | ||
5137 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5139 | hlist_for_each_entry_rcu(event, head, hlist_entry) { |
5138 | if (perf_swevent_match(event, type, event_id, data, regs)) | 5140 | if (perf_swevent_match(event, type, event_id, data, regs)) |
5139 | perf_swevent_event(event, nr, data, regs); | 5141 | perf_swevent_event(event, nr, data, regs); |
5140 | } | 5142 | } |
@@ -5419,7 +5421,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5419 | { | 5421 | { |
5420 | struct perf_sample_data data; | 5422 | struct perf_sample_data data; |
5421 | struct perf_event *event; | 5423 | struct perf_event *event; |
5422 | struct hlist_node *node; | ||
5423 | 5424 | ||
5424 | struct perf_raw_record raw = { | 5425 | struct perf_raw_record raw = { |
5425 | .size = entry_size, | 5426 | .size = entry_size, |
@@ -5429,7 +5430,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5429 | perf_sample_data_init(&data, addr, 0); | 5430 | perf_sample_data_init(&data, addr, 0); |
5430 | data.raw = &raw; | 5431 | data.raw = &raw; |
5431 | 5432 | ||
5432 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5433 | hlist_for_each_entry_rcu(event, head, hlist_entry) { |
5433 | if (perf_tp_event_match(event, &data, regs)) | 5434 | if (perf_tp_event_match(event, &data, regs)) |
5434 | perf_swevent_event(event, count, &data, regs); | 5435 | perf_swevent_event(event, count, &data, regs); |
5435 | } | 5436 | } |
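The two perf hunks just above follow a tree-wide API change: hlist_for_each_entry_rcu() no longer takes a separate struct hlist_node cursor, the entry pointer itself is the loop variable. A hedged kernel-style sketch of the new form (the struct and field names are invented for illustration):

    /* requires <linux/rculist.h>; caller must hold rcu_read_lock() */
    struct waiter {
            int                     cpu;
            struct hlist_node       node;
    };

    static void visit_waiters(struct hlist_head *head)
    {
            struct waiter *w;

            /* no separate "struct hlist_node *n" cursor any more */
            hlist_for_each_entry_rcu(w, head, node)
                    pr_info("waiter on cpu %d\n", w->cpu);
    }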
@@ -5649,6 +5650,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) | |||
5649 | event->attr.sample_period = NSEC_PER_SEC / freq; | 5650 | event->attr.sample_period = NSEC_PER_SEC / freq; |
5650 | hwc->sample_period = event->attr.sample_period; | 5651 | hwc->sample_period = event->attr.sample_period; |
5651 | local64_set(&hwc->period_left, hwc->sample_period); | 5652 | local64_set(&hwc->period_left, hwc->sample_period); |
5653 | hwc->last_period = hwc->sample_period; | ||
5652 | event->attr.freq = 0; | 5654 | event->attr.freq = 0; |
5653 | } | 5655 | } |
5654 | } | 5656 | } |
@@ -5965,13 +5967,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type) | |||
5965 | pmu->name = name; | 5967 | pmu->name = name; |
5966 | 5968 | ||
5967 | if (type < 0) { | 5969 | if (type < 0) { |
5968 | int err = idr_pre_get(&pmu_idr, GFP_KERNEL); | 5970 | type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); |
5969 | if (!err) | 5971 | if (type < 0) { |
5970 | goto free_pdc; | 5972 | ret = type; |
5971 | |||
5972 | err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); | ||
5973 | if (err) { | ||
5974 | ret = err; | ||
5975 | goto free_pdc; | 5973 | goto free_pdc; |
5976 | } | 5974 | } |
5977 | } | 5975 | } |
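The pmu registration hunk above moves from the old two-call IDR API (idr_pre_get() followed by idr_get_new_above()) to idr_alloc(), which takes the id range and GFP flags directly and returns either the new id or a negative errno. A distilled sketch of the new call, with an invented wrapper name:

    /* requires <linux/idr.h> */
    static DEFINE_IDR(type_idr);

    /* allocate a dynamic id starting at 64; an 'end' of 0 means no upper
     * bound, and the return value is the id or a negative errno */
    static int alloc_type_id(void *owner)
    {
            return idr_alloc(&type_idr, owner, 64, 0, GFP_KERNEL);
    }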
diff --git a/kernel/exit.c b/kernel/exit.c index 7dd20408707c..51e485ca9935 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/tsacct_kern.h> | 20 | #include <linux/tsacct_kern.h> |
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/fdtable.h> | 22 | #include <linux/fdtable.h> |
23 | #include <linux/freezer.h> | ||
23 | #include <linux/binfmts.h> | 24 | #include <linux/binfmts.h> |
24 | #include <linux/nsproxy.h> | 25 | #include <linux/nsproxy.h> |
25 | #include <linux/pid_namespace.h> | 26 | #include <linux/pid_namespace.h> |
@@ -31,7 +32,6 @@ | |||
31 | #include <linux/mempolicy.h> | 32 | #include <linux/mempolicy.h> |
32 | #include <linux/taskstats_kern.h> | 33 | #include <linux/taskstats_kern.h> |
33 | #include <linux/delayacct.h> | 34 | #include <linux/delayacct.h> |
34 | #include <linux/freezer.h> | ||
35 | #include <linux/cgroup.h> | 35 | #include <linux/cgroup.h> |
36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
37 | #include <linux/signal.h> | 37 | #include <linux/signal.h> |
@@ -485,7 +485,7 @@ static void exit_mm(struct task_struct * tsk) | |||
485 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 485 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
486 | if (!self.task) /* see coredump_finish() */ | 486 | if (!self.task) /* see coredump_finish() */ |
487 | break; | 487 | break; |
488 | schedule(); | 488 | freezable_schedule(); |
489 | } | 489 | } |
490 | __set_task_state(tsk, TASK_RUNNING); | 490 | __set_task_state(tsk, TASK_RUNNING); |
491 | down_read(&mm->mmap_sem); | 491 | down_read(&mm->mmap_sem); |
@@ -835,7 +835,7 @@ void do_exit(long code) | |||
835 | /* | 835 | /* |
836 | * Make sure we are holding no locks: | 836 | * Make sure we are holding no locks: |
837 | */ | 837 | */ |
838 | debug_check_no_locks_held(tsk); | 838 | debug_check_no_locks_held(); |
839 | /* | 839 | /* |
840 | * We can do this unlocked here. The futex code uses this flag | 840 | * We can do this unlocked here. The futex code uses this flag |
841 | * just to verify whether the pi state cleanup has been done | 841 | * just to verify whether the pi state cleanup has been done |
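The exit_mm() change above swaps schedule() for freezable_schedule() so a task parked waiting for a coredump no longer blocks system suspend. In this generation of the kernel the helper in include/linux/freezer.h is essentially the following (quoted from memory, so treat it as an approximation):

    static inline void freezable_schedule(void)
    {
            freezer_do_not_count();   /* freezer may proceed without us */
            schedule();
            freezer_count();          /* rejoin, freezing here if asked to */
    }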
diff --git a/kernel/fork.c b/kernel/fork.c index 4133876d8cd2..1766d324d5e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
413 | tmp->vm_next = tmp->vm_prev = NULL; | 413 | tmp->vm_next = tmp->vm_prev = NULL; |
414 | file = tmp->vm_file; | 414 | file = tmp->vm_file; |
415 | if (file) { | 415 | if (file) { |
416 | struct inode *inode = file->f_path.dentry->d_inode; | 416 | struct inode *inode = file_inode(file); |
417 | struct address_space *mapping = file->f_mapping; | 417 | struct address_space *mapping = file->f_mapping; |
418 | 418 | ||
419 | get_file(file); | 419 | get_file(file); |
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1141 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | 1141 | if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) |
1142 | return ERR_PTR(-EINVAL); | 1142 | return ERR_PTR(-EINVAL); |
1143 | 1143 | ||
1144 | if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) | ||
1145 | return ERR_PTR(-EINVAL); | ||
1146 | |||
1144 | /* | 1147 | /* |
1145 | * Thread groups must share signals as well, and detached threads | 1148 | * Thread groups must share signals as well, and detached threads |
1146 | * can only be started up within the thread group. | 1149 | * can only be started up within the thread group. |
@@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1807 | * If unsharing a user namespace must also unshare the thread. | 1810 | * If unsharing a user namespace must also unshare the thread. |
1808 | */ | 1811 | */ |
1809 | if (unshare_flags & CLONE_NEWUSER) | 1812 | if (unshare_flags & CLONE_NEWUSER) |
1810 | unshare_flags |= CLONE_THREAD; | 1813 | unshare_flags |= CLONE_THREAD | CLONE_FS; |
1811 | /* | 1814 | /* |
1812 | * If unsharing a pid namespace must also unshare the thread. | 1815 | * If unsharing a pid namespace must also unshare the thread. |
1813 | */ | 1816 | */ |
@@ -1861,10 +1864,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1861 | exit_sem(current); | 1864 | exit_sem(current); |
1862 | } | 1865 | } |
1863 | 1866 | ||
1864 | if (new_nsproxy) { | 1867 | if (new_nsproxy) |
1865 | switch_task_namespaces(current, new_nsproxy); | 1868 | switch_task_namespaces(current, new_nsproxy); |
1866 | new_nsproxy = NULL; | ||
1867 | } | ||
1868 | 1869 | ||
1869 | task_lock(current); | 1870 | task_lock(current); |
1870 | 1871 | ||
@@ -1894,9 +1895,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1894 | } | 1895 | } |
1895 | } | 1896 | } |
1896 | 1897 | ||
1897 | if (new_nsproxy) | ||
1898 | put_nsproxy(new_nsproxy); | ||
1899 | |||
1900 | bad_unshare_cleanup_cred: | 1898 | bad_unshare_cleanup_cred: |
1901 | if (new_cred) | 1899 | if (new_cred) |
1902 | put_cred(new_cred); | 1900 | put_cred(new_cred); |
diff --git a/kernel/futex.c b/kernel/futex.c index 9618b6e9fb36..b26dcfc02c94 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -223,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key) | |||
223 | * @rw: mapping needs to be read/write (values: VERIFY_READ, | 223 | * @rw: mapping needs to be read/write (values: VERIFY_READ, |
224 | * VERIFY_WRITE) | 224 | * VERIFY_WRITE) |
225 | * | 225 | * |
226 | * Returns a negative error code or 0 | 226 | * Return: a negative error code or 0 |
227 | * | ||
227 | * The key words are stored in *key on success. | 228 | * The key words are stored in *key on success. |
228 | * | 229 | * |
229 | * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, | 230 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), |
230 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 231 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
231 | * We can usually work out the index without swapping in the page. | 232 | * We can usually work out the index without swapping in the page. |
232 | * | 233 | * |
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
705 | * be "current" except in the case of requeue pi. | 706 | * be "current" except in the case of requeue pi. |
706 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | 707 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) |
707 | * | 708 | * |
708 | * Returns: | 709 | * Return: |
709 | * 0 - ready to wait | 710 | * 0 - ready to wait; |
710 | * 1 - acquired the lock | 711 | * 1 - acquired the lock; |
711 | * <0 - error | 712 | * <0 - error |
712 | * | 713 | * |
713 | * The hb->lock and futex_key refs shall be held by the caller. | 714 | * The hb->lock and futex_key refs shall be held by the caller. |
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
1191 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. | 1192 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. |
1192 | * hb1 and hb2 must be held by the caller. | 1193 | * hb1 and hb2 must be held by the caller. |
1193 | * | 1194 | * |
1194 | * Returns: | 1195 | * Return: |
1195 | * 0 - failed to acquire the lock atomicly | 1196 | * 0 - failed to acquire the lock atomically; |
1196 | * 1 - acquired the lock | 1197 | * 1 - acquired the lock; |
1197 | * <0 - error | 1198 | * <0 - error |
1198 | */ | 1199 | */ |
1199 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1200 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1254 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1255 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
1255 | * uaddr2 atomically on behalf of the top waiter. | 1256 | * uaddr2 atomically on behalf of the top waiter. |
1256 | * | 1257 | * |
1257 | * Returns: | 1258 | * Return: |
1258 | * >=0 - on success, the number of tasks requeued or woken | 1259 | * >=0 - on success, the number of tasks requeued or woken; |
1259 | * <0 - on error | 1260 | * <0 - on error |
1260 | */ | 1261 | */ |
1261 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | 1262 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
1536 | * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must | 1537 | * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must |
1537 | * be paired with exactly one earlier call to queue_me(). | 1538 | * be paired with exactly one earlier call to queue_me(). |
1538 | * | 1539 | * |
1539 | * Returns: | 1540 | * Return: |
1540 | * 1 - if the futex_q was still queued (and we removed unqueued it) | 1541 | * 1 - if the futex_q was still queued (and we removed unqueued it); |
1541 | * 0 - if the futex_q was already removed by the waking thread | 1542 | * 0 - if the futex_q was already removed by the waking thread |
1542 | */ | 1543 | */ |
1543 | static int unqueue_me(struct futex_q *q) | 1544 | static int unqueue_me(struct futex_q *q) |
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart); | |||
1707 | * the pi_state owner as well as handle race conditions that may allow us to | 1708 | * the pi_state owner as well as handle race conditions that may allow us to |
1708 | * acquire the lock. Must be called with the hb lock held. | 1709 | * acquire the lock. Must be called with the hb lock held. |
1709 | * | 1710 | * |
1710 | * Returns: | 1711 | * Return: |
1711 | * 1 - success, lock taken | 1712 | * 1 - success, lock taken; |
1712 | * 0 - success, lock not taken | 1713 | * 0 - success, lock not taken; |
1713 | * <0 - on error (-EFAULT) | 1714 | * <0 - on error (-EFAULT) |
1714 | */ | 1715 | */ |
1715 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | 1716 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1824 | * Return with the hb lock held and a q.key reference on success, and unlocked | 1825 | * Return with the hb lock held and a q.key reference on success, and unlocked |
1825 | * with no q.key reference on failure. | 1826 | * with no q.key reference on failure. |
1826 | * | 1827 | * |
1827 | * Returns: | 1828 | * Return: |
1828 | * 0 - uaddr contains val and hb has been locked | 1829 | * 0 - uaddr contains val and hb has been locked; |
1829 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | 1830 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
1830 | */ | 1831 | */ |
1831 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | 1832 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
@@ -2203,9 +2204,9 @@ pi_faulted: | |||
2203 | * the wakeup and return the appropriate error code to the caller. Must be | 2204 | * the wakeup and return the appropriate error code to the caller. Must be |
2204 | * called with the hb lock held. | 2205 | * called with the hb lock held. |
2205 | * | 2206 | * |
2206 | * Returns | 2207 | * Return: |
2207 | * 0 - no early wakeup detected | 2208 | * 0 = no early wakeup detected; |
2208 | * <0 - -ETIMEDOUT or -ERESTARTNOINTR | 2209 | * <0 = -ETIMEDOUT or -ERESTARTNOINTR |
2209 | */ | 2210 | */ |
2210 | static inline | 2211 | static inline |
2211 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | 2212 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, |
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2247 | * @val: the expected value of uaddr | 2248 | * @val: the expected value of uaddr |
2248 | * @abs_time: absolute timeout | 2249 | * @abs_time: absolute timeout |
2249 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all | 2250 | * @bitset: 32 bit wakeup bitset set by userspace, defaults to all |
2250 | * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) | ||
2251 | * @uaddr2: the pi futex we will take prior to returning to user-space | 2251 | * @uaddr2: the pi futex we will take prior to returning to user-space |
2252 | * | 2252 | * |
2253 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | 2253 | * The caller will wait on uaddr and will be requeued by futex_requeue() to |
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2258 | * there was a need to. | 2258 | * there was a need to. |
2259 | * | 2259 | * |
2260 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | 2260 | * We call schedule in futex_wait_queue_me() when we enqueue and return there |
2261 | * via the following: | 2261 | * via the following-- |
2262 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() | 2262 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() |
2263 | * 2) wakeup on uaddr2 after a requeue | 2263 | * 2) wakeup on uaddr2 after a requeue |
2264 | * 3) signal | 2264 | * 3) signal |
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2276 | * | 2276 | * |
2277 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | 2277 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. |
2278 | * | 2278 | * |
2279 | * Returns: | 2279 | * Return: |
2280 | * 0 - On success | 2280 | * 0 - On success; |
2281 | * <0 - On error | 2281 | * <0 - On error |
2282 | */ | 2282 | */ |
2283 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | 2283 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
@@ -2472,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, | |||
2472 | if (!futex_cmpxchg_enabled) | 2472 | if (!futex_cmpxchg_enabled) |
2473 | return -ENOSYS; | 2473 | return -ENOSYS; |
2474 | 2474 | ||
2475 | WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); | ||
2476 | |||
2477 | rcu_read_lock(); | 2475 | rcu_read_lock(); |
2478 | 2476 | ||
2479 | ret = -ESRCH; | 2477 | ret = -ESRCH; |
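Most of the futex hunks above are kernel-doc touch-ups, standardizing on a "Return:" section with semicolon-separated cases. For reference, a comment in that style for a hypothetical function (the function and its cases are made up):

    /**
     * futex_example_wait - wait until a futex word changes
     * @uaddr:      userspace address of the futex word
     * @val:        value @uaddr is expected to hold
     *
     * Return:
     * 0 - woken by a matching wakeup;
     * -EWOULDBLOCK - *@uaddr did not contain @val;
     * <0 - other error (-EFAULT, -EINTR, ...)
     */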
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..f9f44fd4d34d 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/nsproxy.h> | 11 | #include <linux/nsproxy.h> |
12 | #include <linux/futex.h> | 12 | #include <linux/futex.h> |
13 | #include <linux/ptrace.h> | 13 | #include <linux/ptrace.h> |
14 | #include <linux/syscalls.h> | ||
14 | 15 | ||
15 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
16 | 17 | ||
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
116 | } | 117 | } |
117 | } | 118 | } |
118 | 119 | ||
119 | asmlinkage long | 120 | COMPAT_SYSCALL_DEFINE2(set_robust_list, |
120 | compat_sys_set_robust_list(struct compat_robust_list_head __user *head, | 121 | struct compat_robust_list_head __user *, head, |
121 | compat_size_t len) | 122 | compat_size_t, len) |
122 | { | 123 | { |
123 | if (!futex_cmpxchg_enabled) | 124 | if (!futex_cmpxchg_enabled) |
124 | return -ENOSYS; | 125 | return -ENOSYS; |
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, | |||
131 | return 0; | 132 | return 0; |
132 | } | 133 | } |
133 | 134 | ||
134 | asmlinkage long | 135 | COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, |
135 | compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | 136 | compat_uptr_t __user *, head_ptr, |
136 | compat_size_t __user *len_ptr) | 137 | compat_size_t __user *, len_ptr) |
137 | { | 138 | { |
138 | struct compat_robust_list_head __user *head; | 139 | struct compat_robust_list_head __user *head; |
139 | unsigned long ret; | 140 | unsigned long ret; |
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, | |||
142 | if (!futex_cmpxchg_enabled) | 143 | if (!futex_cmpxchg_enabled) |
143 | return -ENOSYS; | 144 | return -ENOSYS; |
144 | 145 | ||
145 | WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); | ||
146 | |||
147 | rcu_read_lock(); | 146 | rcu_read_lock(); |
148 | 147 | ||
149 | ret = -ESRCH; | 148 | ret = -ESRCH; |
@@ -172,9 +171,9 @@ err_unlock: | |||
172 | return ret; | 171 | return ret; |
173 | } | 172 | } |
174 | 173 | ||
175 | asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | 174 | COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, |
176 | struct compat_timespec __user *utime, u32 __user *uaddr2, | 175 | struct compat_timespec __user *, utime, u32 __user *, uaddr2, |
177 | u32 val3) | 176 | u32, val3) |
178 | { | 177 | { |
179 | struct timespec ts; | 178 | struct timespec ts; |
180 | ktime_t t, *tp = NULL; | 179 | ktime_t t, *tp = NULL; |
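The futex_compat.c conversion replaces hand-written "asmlinkage long compat_sys_*" prototypes with the COMPAT_SYSCALL_DEFINEn() macros, which generate the compat entry point and apply the per-architecture argument widening and sign-extension rules in one place. A minimal, hedged usage sketch with a made-up syscall name:

    #include <linux/compat.h>
    #include <linux/syscalls.h>

    /* expands to the compat_sys_example() entry point */
    COMPAT_SYSCALL_DEFINE2(example, compat_uptr_t, uptr, compat_size_t, len)
    {
            return len ? 0 : -EINVAL;
    }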
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..d4da55d1fb65 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
@@ -35,7 +35,7 @@ config GCOV_KERNEL | |||
35 | config GCOV_PROFILE_ALL | 35 | config GCOV_PROFILE_ALL |
36 | bool "Profile entire Kernel" | 36 | bool "Profile entire Kernel" |
37 | depends on GCOV_KERNEL | 37 | depends on GCOV_KERNEL |
38 | depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE | 38 | depends on SUPERH || S390 || X86 || PPC || MICROBLAZE |
39 | default n | 39 | default n |
40 | ---help--- | 40 | ---help--- |
41 | This option activates profiling for the entire kernel. | 41 | This option activates profiling for the entire kernel. |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4bd4faa6323a..397db02209ed 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | |||
76 | static ssize_t write_irq_affinity(int type, struct file *file, | 76 | static ssize_t write_irq_affinity(int type, struct file *file, |
77 | const char __user *buffer, size_t count, loff_t *pos) | 77 | const char __user *buffer, size_t count, loff_t *pos) |
78 | { | 78 | { |
79 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; | 79 | unsigned int irq = (int)(long)PDE(file_inode(file))->data; |
80 | cpumask_var_t new_value; | 80 | cpumask_var_t new_value; |
81 | int err; | 81 | int err; |
82 | 82 | ||
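Several files in this series (fork.c, events/core.c, irq/proc.c, plus the futex comment) switch from file->f_path.dentry->d_inode to the new file_inode() helper. Conceptually it is just the accessor below; depending on the exact tree it may instead return a cached f_inode pointer, so take this as an approximation rather than the definitive implementation:

    static inline struct inode *file_inode(const struct file *f)
    {
            return f->f_path.dentry->d_inode;
    }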
diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..bddd3d7a74b6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -54,6 +54,12 @@ struct resource crashk_res = { | |||
54 | .end = 0, | 54 | .end = 0, |
55 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | 55 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM |
56 | }; | 56 | }; |
57 | struct resource crashk_low_res = { | ||
58 | .name = "Crash kernel low", | ||
59 | .start = 0, | ||
60 | .end = 0, | ||
61 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
62 | }; | ||
57 | 63 | ||
58 | int kexec_should_crash(struct task_struct *p) | 64 | int kexec_should_crash(struct task_struct *p) |
59 | { | 65 | { |
@@ -223,6 +229,8 @@ out: | |||
223 | 229 | ||
224 | } | 230 | } |
225 | 231 | ||
232 | static void kimage_free_page_list(struct list_head *list); | ||
233 | |||
226 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | 234 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, |
227 | unsigned long nr_segments, | 235 | unsigned long nr_segments, |
228 | struct kexec_segment __user *segments) | 236 | struct kexec_segment __user *segments) |
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | |||
236 | if (result) | 244 | if (result) |
237 | goto out; | 245 | goto out; |
238 | 246 | ||
239 | *rimage = image; | ||
240 | |||
241 | /* | 247 | /* |
242 | * Find a location for the control code buffer, and add it | 248 | * Find a location for the control code buffer, and add it |
243 | * the vector of segments so that it's pages will also be | 249 | * the vector of segments so that it's pages will also be |
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | |||
248 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 254 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
249 | if (!image->control_code_page) { | 255 | if (!image->control_code_page) { |
250 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 256 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); |
251 | goto out; | 257 | goto out_free; |
252 | } | 258 | } |
253 | 259 | ||
254 | image->swap_page = kimage_alloc_control_pages(image, 0); | 260 | image->swap_page = kimage_alloc_control_pages(image, 0); |
255 | if (!image->swap_page) { | 261 | if (!image->swap_page) { |
256 | printk(KERN_ERR "Could not allocate swap buffer\n"); | 262 | printk(KERN_ERR "Could not allocate swap buffer\n"); |
257 | goto out; | 263 | goto out_free; |
258 | } | 264 | } |
259 | 265 | ||
260 | result = 0; | 266 | *rimage = image; |
261 | out: | 267 | return 0; |
262 | if (result == 0) | ||
263 | *rimage = image; | ||
264 | else | ||
265 | kfree(image); | ||
266 | 268 | ||
269 | out_free: | ||
270 | kimage_free_page_list(&image->control_pages); | ||
271 | kfree(image); | ||
272 | out: | ||
267 | return result; | 273 | return result; |
268 | } | 274 | } |
269 | 275 | ||
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | |||
310 | mend = mstart + image->segment[i].memsz - 1; | 316 | mend = mstart + image->segment[i].memsz - 1; |
311 | /* Ensure we are within the crash kernel limits */ | 317 | /* Ensure we are within the crash kernel limits */ |
312 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) | 318 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) |
313 | goto out; | 319 | goto out_free; |
314 | } | 320 | } |
315 | 321 | ||
316 | /* | 322 | /* |
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | |||
323 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 329 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
324 | if (!image->control_code_page) { | 330 | if (!image->control_code_page) { |
325 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 331 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); |
326 | goto out; | 332 | goto out_free; |
327 | } | 333 | } |
328 | 334 | ||
329 | result = 0; | 335 | *rimage = image; |
330 | out: | 336 | return 0; |
331 | if (result == 0) | ||
332 | *rimage = image; | ||
333 | else | ||
334 | kfree(image); | ||
335 | 337 | ||
338 | out_free: | ||
339 | kfree(image); | ||
340 | out: | ||
336 | return result; | 341 | return result; |
337 | } | 342 | } |
338 | 343 | ||
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | |||
497 | 502 | ||
498 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | 503 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) |
499 | break; | 504 | break; |
500 | if (hole_end > crashk_res.end) | ||
501 | break; | ||
502 | /* See if I overlap any of the segments */ | 505 | /* See if I overlap any of the segments */ |
503 | for (i = 0; i < image->nr_segments; i++) { | 506 | for (i = 0; i < image->nr_segments; i++) { |
504 | unsigned long mstart, mend; | 507 | unsigned long mstart, mend; |
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline, | |||
1369 | * This function is the entry point for command line parsing and should be | 1372 | * This function is the entry point for command line parsing and should be |
1370 | * called from the arch-specific code. | 1373 | * called from the arch-specific code. |
1371 | */ | 1374 | */ |
1372 | int __init parse_crashkernel(char *cmdline, | 1375 | static int __init __parse_crashkernel(char *cmdline, |
1373 | unsigned long long system_ram, | 1376 | unsigned long long system_ram, |
1374 | unsigned long long *crash_size, | 1377 | unsigned long long *crash_size, |
1375 | unsigned long long *crash_base) | 1378 | unsigned long long *crash_base, |
1379 | const char *name) | ||
1376 | { | 1380 | { |
1377 | char *p = cmdline, *ck_cmdline = NULL; | 1381 | char *p = cmdline, *ck_cmdline = NULL; |
1378 | char *first_colon, *first_space; | 1382 | char *first_colon, *first_space; |
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline, | |||
1382 | *crash_base = 0; | 1386 | *crash_base = 0; |
1383 | 1387 | ||
1384 | /* find crashkernel and use the last one if there are more */ | 1388 | /* find crashkernel and use the last one if there are more */ |
1385 | p = strstr(p, "crashkernel="); | 1389 | p = strstr(p, name); |
1386 | while (p) { | 1390 | while (p) { |
1387 | ck_cmdline = p; | 1391 | ck_cmdline = p; |
1388 | p = strstr(p+1, "crashkernel="); | 1392 | p = strstr(p+1, name); |
1389 | } | 1393 | } |
1390 | 1394 | ||
1391 | if (!ck_cmdline) | 1395 | if (!ck_cmdline) |
1392 | return -EINVAL; | 1396 | return -EINVAL; |
1393 | 1397 | ||
1394 | ck_cmdline += 12; /* strlen("crashkernel=") */ | 1398 | ck_cmdline += strlen(name); |
1395 | 1399 | ||
1396 | /* | 1400 | /* |
1397 | * if the commandline contains a ':', then that's the extended | 1401 | * if the commandline contains a ':', then that's the extended |
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline, | |||
1409 | return 0; | 1413 | return 0; |
1410 | } | 1414 | } |
1411 | 1415 | ||
1416 | int __init parse_crashkernel(char *cmdline, | ||
1417 | unsigned long long system_ram, | ||
1418 | unsigned long long *crash_size, | ||
1419 | unsigned long long *crash_base) | ||
1420 | { | ||
1421 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1422 | "crashkernel="); | ||
1423 | } | ||
1424 | |||
1425 | int __init parse_crashkernel_low(char *cmdline, | ||
1426 | unsigned long long system_ram, | ||
1427 | unsigned long long *crash_size, | ||
1428 | unsigned long long *crash_base) | ||
1429 | { | ||
1430 | return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, | ||
1431 | "crashkernel_low="); | ||
1432 | } | ||
1412 | 1433 | ||
1413 | static void update_vmcoreinfo_note(void) | 1434 | static void update_vmcoreinfo_note(void) |
1414 | { | 1435 | { |
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1490 | VMCOREINFO_OFFSET(page, _count); | 1511 | VMCOREINFO_OFFSET(page, _count); |
1491 | VMCOREINFO_OFFSET(page, mapping); | 1512 | VMCOREINFO_OFFSET(page, mapping); |
1492 | VMCOREINFO_OFFSET(page, lru); | 1513 | VMCOREINFO_OFFSET(page, lru); |
1514 | VMCOREINFO_OFFSET(page, _mapcount); | ||
1515 | VMCOREINFO_OFFSET(page, private); | ||
1493 | VMCOREINFO_OFFSET(pglist_data, node_zones); | 1516 | VMCOREINFO_OFFSET(pglist_data, node_zones); |
1494 | VMCOREINFO_OFFSET(pglist_data, nr_zones); | 1517 | VMCOREINFO_OFFSET(pglist_data, nr_zones); |
1495 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 1518 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1512 | VMCOREINFO_NUMBER(PG_lru); | 1535 | VMCOREINFO_NUMBER(PG_lru); |
1513 | VMCOREINFO_NUMBER(PG_private); | 1536 | VMCOREINFO_NUMBER(PG_private); |
1514 | VMCOREINFO_NUMBER(PG_swapcache); | 1537 | VMCOREINFO_NUMBER(PG_swapcache); |
1538 | VMCOREINFO_NUMBER(PG_slab); | ||
1539 | #ifdef CONFIG_MEMORY_FAILURE | ||
1540 | VMCOREINFO_NUMBER(PG_hwpoison); | ||
1541 | #endif | ||
1542 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | ||
1515 | 1543 | ||
1516 | arch_crash_save_vmcoreinfo(); | 1544 | arch_crash_save_vmcoreinfo(); |
1517 | update_vmcoreinfo_note(); | 1545 | update_vmcoreinfo_note(); |
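
The kexec.c changes above add a second reservation, "Crash kernel low", and a parse_crashkernel_low() wrapper around the generalised __parse_crashkernel() so arch code can pick up a "crashkernel_low=" option alongside "crashkernel=". A hedged sketch of an arch-side caller (function and variable names are illustrative; the real caller is not part of this diff and would also reserve the region and populate crashk_low_res):

#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/printk.h>

static void __init example_reserve_crashkernel_low(char *cmdline,
						   unsigned long long total_ram)
{
	unsigned long long low_size = 0, low_base = 0;

	/* Returns non-zero if "crashkernel_low=" is absent or malformed. */
	if (parse_crashkernel_low(cmdline, total_ram, &low_size, &low_base))
		return;

	if (low_size)
		pr_info("crashkernel low reservation requested: %lluK\n",
			low_size >> 10);
}
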
diff --git a/kernel/kfifo.c b/kernel/kfifo.c deleted file mode 100644 index 59dcf5b81d24..000000000000 --- a/kernel/kfifo.c +++ /dev/null | |||
@@ -1,609 +0,0 @@ | |||
1 | /* | ||
2 | * A generic kernel FIFO implementation | ||
3 | * | ||
4 | * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/export.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/err.h> | ||
26 | #include <linux/log2.h> | ||
27 | #include <linux/uaccess.h> | ||
28 | #include <linux/kfifo.h> | ||
29 | |||
30 | /* | ||
31 | * internal helper to calculate the unused elements in a fifo | ||
32 | */ | ||
33 | static inline unsigned int kfifo_unused(struct __kfifo *fifo) | ||
34 | { | ||
35 | return (fifo->mask + 1) - (fifo->in - fifo->out); | ||
36 | } | ||
37 | |||
38 | int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, | ||
39 | size_t esize, gfp_t gfp_mask) | ||
40 | { | ||
41 | /* | ||
42 | * round down to the next power of 2, since our 'let the indices | ||
43 | * wrap' technique works only in this case. | ||
44 | */ | ||
45 | if (!is_power_of_2(size)) | ||
46 | size = rounddown_pow_of_two(size); | ||
47 | |||
48 | fifo->in = 0; | ||
49 | fifo->out = 0; | ||
50 | fifo->esize = esize; | ||
51 | |||
52 | if (size < 2) { | ||
53 | fifo->data = NULL; | ||
54 | fifo->mask = 0; | ||
55 | return -EINVAL; | ||
56 | } | ||
57 | |||
58 | fifo->data = kmalloc(size * esize, gfp_mask); | ||
59 | |||
60 | if (!fifo->data) { | ||
61 | fifo->mask = 0; | ||
62 | return -ENOMEM; | ||
63 | } | ||
64 | fifo->mask = size - 1; | ||
65 | |||
66 | return 0; | ||
67 | } | ||
68 | EXPORT_SYMBOL(__kfifo_alloc); | ||
69 | |||
70 | void __kfifo_free(struct __kfifo *fifo) | ||
71 | { | ||
72 | kfree(fifo->data); | ||
73 | fifo->in = 0; | ||
74 | fifo->out = 0; | ||
75 | fifo->esize = 0; | ||
76 | fifo->data = NULL; | ||
77 | fifo->mask = 0; | ||
78 | } | ||
79 | EXPORT_SYMBOL(__kfifo_free); | ||
80 | |||
81 | int __kfifo_init(struct __kfifo *fifo, void *buffer, | ||
82 | unsigned int size, size_t esize) | ||
83 | { | ||
84 | size /= esize; | ||
85 | |||
86 | if (!is_power_of_2(size)) | ||
87 | size = rounddown_pow_of_two(size); | ||
88 | |||
89 | fifo->in = 0; | ||
90 | fifo->out = 0; | ||
91 | fifo->esize = esize; | ||
92 | fifo->data = buffer; | ||
93 | |||
94 | if (size < 2) { | ||
95 | fifo->mask = 0; | ||
96 | return -EINVAL; | ||
97 | } | ||
98 | fifo->mask = size - 1; | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | EXPORT_SYMBOL(__kfifo_init); | ||
103 | |||
104 | static void kfifo_copy_in(struct __kfifo *fifo, const void *src, | ||
105 | unsigned int len, unsigned int off) | ||
106 | { | ||
107 | unsigned int size = fifo->mask + 1; | ||
108 | unsigned int esize = fifo->esize; | ||
109 | unsigned int l; | ||
110 | |||
111 | off &= fifo->mask; | ||
112 | if (esize != 1) { | ||
113 | off *= esize; | ||
114 | size *= esize; | ||
115 | len *= esize; | ||
116 | } | ||
117 | l = min(len, size - off); | ||
118 | |||
119 | memcpy(fifo->data + off, src, l); | ||
120 | memcpy(fifo->data, src + l, len - l); | ||
121 | /* | ||
122 | * make sure that the data in the fifo is up to date before | ||
123 | * incrementing the fifo->in index counter | ||
124 | */ | ||
125 | smp_wmb(); | ||
126 | } | ||
127 | |||
128 | unsigned int __kfifo_in(struct __kfifo *fifo, | ||
129 | const void *buf, unsigned int len) | ||
130 | { | ||
131 | unsigned int l; | ||
132 | |||
133 | l = kfifo_unused(fifo); | ||
134 | if (len > l) | ||
135 | len = l; | ||
136 | |||
137 | kfifo_copy_in(fifo, buf, len, fifo->in); | ||
138 | fifo->in += len; | ||
139 | return len; | ||
140 | } | ||
141 | EXPORT_SYMBOL(__kfifo_in); | ||
142 | |||
143 | static void kfifo_copy_out(struct __kfifo *fifo, void *dst, | ||
144 | unsigned int len, unsigned int off) | ||
145 | { | ||
146 | unsigned int size = fifo->mask + 1; | ||
147 | unsigned int esize = fifo->esize; | ||
148 | unsigned int l; | ||
149 | |||
150 | off &= fifo->mask; | ||
151 | if (esize != 1) { | ||
152 | off *= esize; | ||
153 | size *= esize; | ||
154 | len *= esize; | ||
155 | } | ||
156 | l = min(len, size - off); | ||
157 | |||
158 | memcpy(dst, fifo->data + off, l); | ||
159 | memcpy(dst + l, fifo->data, len - l); | ||
160 | /* | ||
161 | * make sure that the data is copied before | ||
162 | * incrementing the fifo->out index counter | ||
163 | */ | ||
164 | smp_wmb(); | ||
165 | } | ||
166 | |||
167 | unsigned int __kfifo_out_peek(struct __kfifo *fifo, | ||
168 | void *buf, unsigned int len) | ||
169 | { | ||
170 | unsigned int l; | ||
171 | |||
172 | l = fifo->in - fifo->out; | ||
173 | if (len > l) | ||
174 | len = l; | ||
175 | |||
176 | kfifo_copy_out(fifo, buf, len, fifo->out); | ||
177 | return len; | ||
178 | } | ||
179 | EXPORT_SYMBOL(__kfifo_out_peek); | ||
180 | |||
181 | unsigned int __kfifo_out(struct __kfifo *fifo, | ||
182 | void *buf, unsigned int len) | ||
183 | { | ||
184 | len = __kfifo_out_peek(fifo, buf, len); | ||
185 | fifo->out += len; | ||
186 | return len; | ||
187 | } | ||
188 | EXPORT_SYMBOL(__kfifo_out); | ||
189 | |||
190 | static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, | ||
191 | const void __user *from, unsigned int len, unsigned int off, | ||
192 | unsigned int *copied) | ||
193 | { | ||
194 | unsigned int size = fifo->mask + 1; | ||
195 | unsigned int esize = fifo->esize; | ||
196 | unsigned int l; | ||
197 | unsigned long ret; | ||
198 | |||
199 | off &= fifo->mask; | ||
200 | if (esize != 1) { | ||
201 | off *= esize; | ||
202 | size *= esize; | ||
203 | len *= esize; | ||
204 | } | ||
205 | l = min(len, size - off); | ||
206 | |||
207 | ret = copy_from_user(fifo->data + off, from, l); | ||
208 | if (unlikely(ret)) | ||
209 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
210 | else { | ||
211 | ret = copy_from_user(fifo->data, from + l, len - l); | ||
212 | if (unlikely(ret)) | ||
213 | ret = DIV_ROUND_UP(ret, esize); | ||
214 | } | ||
215 | /* | ||
216 | * make sure that the data in the fifo is up to date before | ||
217 | * incrementing the fifo->in index counter | ||
218 | */ | ||
219 | smp_wmb(); | ||
220 | *copied = len - ret; | ||
221 | /* return the number of elements which are not copied */ | ||
222 | return ret; | ||
223 | } | ||
224 | |||
225 | int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, | ||
226 | unsigned long len, unsigned int *copied) | ||
227 | { | ||
228 | unsigned int l; | ||
229 | unsigned long ret; | ||
230 | unsigned int esize = fifo->esize; | ||
231 | int err; | ||
232 | |||
233 | if (esize != 1) | ||
234 | len /= esize; | ||
235 | |||
236 | l = kfifo_unused(fifo); | ||
237 | if (len > l) | ||
238 | len = l; | ||
239 | |||
240 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); | ||
241 | if (unlikely(ret)) { | ||
242 | len -= ret; | ||
243 | err = -EFAULT; | ||
244 | } else | ||
245 | err = 0; | ||
246 | fifo->in += len; | ||
247 | return err; | ||
248 | } | ||
249 | EXPORT_SYMBOL(__kfifo_from_user); | ||
250 | |||
251 | static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, | ||
252 | unsigned int len, unsigned int off, unsigned int *copied) | ||
253 | { | ||
254 | unsigned int l; | ||
255 | unsigned long ret; | ||
256 | unsigned int size = fifo->mask + 1; | ||
257 | unsigned int esize = fifo->esize; | ||
258 | |||
259 | off &= fifo->mask; | ||
260 | if (esize != 1) { | ||
261 | off *= esize; | ||
262 | size *= esize; | ||
263 | len *= esize; | ||
264 | } | ||
265 | l = min(len, size - off); | ||
266 | |||
267 | ret = copy_to_user(to, fifo->data + off, l); | ||
268 | if (unlikely(ret)) | ||
269 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
270 | else { | ||
271 | ret = copy_to_user(to + l, fifo->data, len - l); | ||
272 | if (unlikely(ret)) | ||
273 | ret = DIV_ROUND_UP(ret, esize); | ||
274 | } | ||
275 | /* | ||
276 | * make sure that the data is copied before | ||
277 | * incrementing the fifo->out index counter | ||
278 | */ | ||
279 | smp_wmb(); | ||
280 | *copied = len - ret; | ||
281 | /* return the number of elements which are not copied */ | ||
282 | return ret; | ||
283 | } | ||
284 | |||
285 | int __kfifo_to_user(struct __kfifo *fifo, void __user *to, | ||
286 | unsigned long len, unsigned int *copied) | ||
287 | { | ||
288 | unsigned int l; | ||
289 | unsigned long ret; | ||
290 | unsigned int esize = fifo->esize; | ||
291 | int err; | ||
292 | |||
293 | if (esize != 1) | ||
294 | len /= esize; | ||
295 | |||
296 | l = fifo->in - fifo->out; | ||
297 | if (len > l) | ||
298 | len = l; | ||
299 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); | ||
300 | if (unlikely(ret)) { | ||
301 | len -= ret; | ||
302 | err = -EFAULT; | ||
303 | } else | ||
304 | err = 0; | ||
305 | fifo->out += len; | ||
306 | return err; | ||
307 | } | ||
308 | EXPORT_SYMBOL(__kfifo_to_user); | ||
309 | |||
310 | static int setup_sgl_buf(struct scatterlist *sgl, void *buf, | ||
311 | int nents, unsigned int len) | ||
312 | { | ||
313 | int n; | ||
314 | unsigned int l; | ||
315 | unsigned int off; | ||
316 | struct page *page; | ||
317 | |||
318 | if (!nents) | ||
319 | return 0; | ||
320 | |||
321 | if (!len) | ||
322 | return 0; | ||
323 | |||
324 | n = 0; | ||
325 | page = virt_to_page(buf); | ||
326 | off = offset_in_page(buf); | ||
327 | l = 0; | ||
328 | |||
329 | while (len >= l + PAGE_SIZE - off) { | ||
330 | struct page *npage; | ||
331 | |||
332 | l += PAGE_SIZE; | ||
333 | buf += PAGE_SIZE; | ||
334 | npage = virt_to_page(buf); | ||
335 | if (page_to_phys(page) != page_to_phys(npage) - l) { | ||
336 | sg_set_page(sgl, page, l - off, off); | ||
337 | sgl = sg_next(sgl); | ||
338 | if (++n == nents || sgl == NULL) | ||
339 | return n; | ||
340 | page = npage; | ||
341 | len -= l - off; | ||
342 | l = off = 0; | ||
343 | } | ||
344 | } | ||
345 | sg_set_page(sgl, page, len, off); | ||
346 | return n + 1; | ||
347 | } | ||
348 | |||
349 | static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, | ||
350 | int nents, unsigned int len, unsigned int off) | ||
351 | { | ||
352 | unsigned int size = fifo->mask + 1; | ||
353 | unsigned int esize = fifo->esize; | ||
354 | unsigned int l; | ||
355 | unsigned int n; | ||
356 | |||
357 | off &= fifo->mask; | ||
358 | if (esize != 1) { | ||
359 | off *= esize; | ||
360 | size *= esize; | ||
361 | len *= esize; | ||
362 | } | ||
363 | l = min(len, size - off); | ||
364 | |||
365 | n = setup_sgl_buf(sgl, fifo->data + off, nents, l); | ||
366 | n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); | ||
367 | |||
368 | return n; | ||
369 | } | ||
370 | |||
371 | unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, | ||
372 | struct scatterlist *sgl, int nents, unsigned int len) | ||
373 | { | ||
374 | unsigned int l; | ||
375 | |||
376 | l = kfifo_unused(fifo); | ||
377 | if (len > l) | ||
378 | len = l; | ||
379 | |||
380 | return setup_sgl(fifo, sgl, nents, len, fifo->in); | ||
381 | } | ||
382 | EXPORT_SYMBOL(__kfifo_dma_in_prepare); | ||
383 | |||
384 | unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, | ||
385 | struct scatterlist *sgl, int nents, unsigned int len) | ||
386 | { | ||
387 | unsigned int l; | ||
388 | |||
389 | l = fifo->in - fifo->out; | ||
390 | if (len > l) | ||
391 | len = l; | ||
392 | |||
393 | return setup_sgl(fifo, sgl, nents, len, fifo->out); | ||
394 | } | ||
395 | EXPORT_SYMBOL(__kfifo_dma_out_prepare); | ||
396 | |||
397 | unsigned int __kfifo_max_r(unsigned int len, size_t recsize) | ||
398 | { | ||
399 | unsigned int max = (1 << (recsize << 3)) - 1; | ||
400 | |||
401 | if (len > max) | ||
402 | return max; | ||
403 | return len; | ||
404 | } | ||
405 | EXPORT_SYMBOL(__kfifo_max_r); | ||
406 | |||
407 | #define __KFIFO_PEEK(data, out, mask) \ | ||
408 | ((data)[(out) & (mask)]) | ||
409 | /* | ||
410 | * __kfifo_peek_n internal helper function for determining the length of | ||
411 | * the next record in the fifo | ||
412 | */ | ||
413 | static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) | ||
414 | { | ||
415 | unsigned int l; | ||
416 | unsigned int mask = fifo->mask; | ||
417 | unsigned char *data = fifo->data; | ||
418 | |||
419 | l = __KFIFO_PEEK(data, fifo->out, mask); | ||
420 | |||
421 | if (--recsize) | ||
422 | l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; | ||
423 | |||
424 | return l; | ||
425 | } | ||
426 | |||
427 | #define __KFIFO_POKE(data, in, mask, val) \ | ||
428 | ( \ | ||
429 | (data)[(in) & (mask)] = (unsigned char)(val) \ | ||
430 | ) | ||
431 | |||
432 | /* | ||
433 | * __kfifo_poke_n internal helper function for storing the length of | ||
434 | * the record into the fifo | ||
435 | */ | ||
436 | static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) | ||
437 | { | ||
438 | unsigned int mask = fifo->mask; | ||
439 | unsigned char *data = fifo->data; | ||
440 | |||
441 | __KFIFO_POKE(data, fifo->in, mask, n); | ||
442 | |||
443 | if (recsize > 1) | ||
444 | __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); | ||
445 | } | ||
446 | |||
447 | unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) | ||
448 | { | ||
449 | return __kfifo_peek_n(fifo, recsize); | ||
450 | } | ||
451 | EXPORT_SYMBOL(__kfifo_len_r); | ||
452 | |||
453 | unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, | ||
454 | unsigned int len, size_t recsize) | ||
455 | { | ||
456 | if (len + recsize > kfifo_unused(fifo)) | ||
457 | return 0; | ||
458 | |||
459 | __kfifo_poke_n(fifo, len, recsize); | ||
460 | |||
461 | kfifo_copy_in(fifo, buf, len, fifo->in + recsize); | ||
462 | fifo->in += len + recsize; | ||
463 | return len; | ||
464 | } | ||
465 | EXPORT_SYMBOL(__kfifo_in_r); | ||
466 | |||
467 | static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, | ||
468 | void *buf, unsigned int len, size_t recsize, unsigned int *n) | ||
469 | { | ||
470 | *n = __kfifo_peek_n(fifo, recsize); | ||
471 | |||
472 | if (len > *n) | ||
473 | len = *n; | ||
474 | |||
475 | kfifo_copy_out(fifo, buf, len, fifo->out + recsize); | ||
476 | return len; | ||
477 | } | ||
478 | |||
479 | unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, | ||
480 | unsigned int len, size_t recsize) | ||
481 | { | ||
482 | unsigned int n; | ||
483 | |||
484 | if (fifo->in == fifo->out) | ||
485 | return 0; | ||
486 | |||
487 | return kfifo_out_copy_r(fifo, buf, len, recsize, &n); | ||
488 | } | ||
489 | EXPORT_SYMBOL(__kfifo_out_peek_r); | ||
490 | |||
491 | unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, | ||
492 | unsigned int len, size_t recsize) | ||
493 | { | ||
494 | unsigned int n; | ||
495 | |||
496 | if (fifo->in == fifo->out) | ||
497 | return 0; | ||
498 | |||
499 | len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); | ||
500 | fifo->out += n + recsize; | ||
501 | return len; | ||
502 | } | ||
503 | EXPORT_SYMBOL(__kfifo_out_r); | ||
504 | |||
505 | void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) | ||
506 | { | ||
507 | unsigned int n; | ||
508 | |||
509 | n = __kfifo_peek_n(fifo, recsize); | ||
510 | fifo->out += n + recsize; | ||
511 | } | ||
512 | EXPORT_SYMBOL(__kfifo_skip_r); | ||
513 | |||
514 | int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, | ||
515 | unsigned long len, unsigned int *copied, size_t recsize) | ||
516 | { | ||
517 | unsigned long ret; | ||
518 | |||
519 | len = __kfifo_max_r(len, recsize); | ||
520 | |||
521 | if (len + recsize > kfifo_unused(fifo)) { | ||
522 | *copied = 0; | ||
523 | return 0; | ||
524 | } | ||
525 | |||
526 | __kfifo_poke_n(fifo, len, recsize); | ||
527 | |||
528 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); | ||
529 | if (unlikely(ret)) { | ||
530 | *copied = 0; | ||
531 | return -EFAULT; | ||
532 | } | ||
533 | fifo->in += len + recsize; | ||
534 | return 0; | ||
535 | } | ||
536 | EXPORT_SYMBOL(__kfifo_from_user_r); | ||
537 | |||
538 | int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, | ||
539 | unsigned long len, unsigned int *copied, size_t recsize) | ||
540 | { | ||
541 | unsigned long ret; | ||
542 | unsigned int n; | ||
543 | |||
544 | if (fifo->in == fifo->out) { | ||
545 | *copied = 0; | ||
546 | return 0; | ||
547 | } | ||
548 | |||
549 | n = __kfifo_peek_n(fifo, recsize); | ||
550 | if (len > n) | ||
551 | len = n; | ||
552 | |||
553 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); | ||
554 | if (unlikely(ret)) { | ||
555 | *copied = 0; | ||
556 | return -EFAULT; | ||
557 | } | ||
558 | fifo->out += n + recsize; | ||
559 | return 0; | ||
560 | } | ||
561 | EXPORT_SYMBOL(__kfifo_to_user_r); | ||
562 | |||
563 | unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, | ||
564 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
565 | { | ||
566 | if (!nents) | ||
567 | BUG(); | ||
568 | |||
569 | len = __kfifo_max_r(len, recsize); | ||
570 | |||
571 | if (len + recsize > kfifo_unused(fifo)) | ||
572 | return 0; | ||
573 | |||
574 | return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); | ||
575 | } | ||
576 | EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); | ||
577 | |||
578 | void __kfifo_dma_in_finish_r(struct __kfifo *fifo, | ||
579 | unsigned int len, size_t recsize) | ||
580 | { | ||
581 | len = __kfifo_max_r(len, recsize); | ||
582 | __kfifo_poke_n(fifo, len, recsize); | ||
583 | fifo->in += len + recsize; | ||
584 | } | ||
585 | EXPORT_SYMBOL(__kfifo_dma_in_finish_r); | ||
586 | |||
587 | unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, | ||
588 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
589 | { | ||
590 | if (!nents) | ||
591 | BUG(); | ||
592 | |||
593 | len = __kfifo_max_r(len, recsize); | ||
594 | |||
595 | if (len + recsize > fifo->in - fifo->out) | ||
596 | return 0; | ||
597 | |||
598 | return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); | ||
599 | } | ||
600 | EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); | ||
601 | |||
602 | void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) | ||
603 | { | ||
604 | unsigned int len; | ||
605 | |||
606 | len = __kfifo_peek_n(fifo, recsize); | ||
607 | fifo->out += len + recsize; | ||
608 | } | ||
609 | EXPORT_SYMBOL(__kfifo_dma_out_finish_r); | ||
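
The whole generic FIFO implementation is removed from kernel/ in this patch; the <linux/kfifo.h> interface its callers use is not touched, so the implementation presumably continues to live elsewhere in the tree rather than being dropped outright. For orientation, a small sketch of that unchanged API from a hypothetical module (names and sizes are illustrative):

#include <linux/kfifo.h>
#include <linux/module.h>
#include <linux/printk.h>

/* A fixed-size FIFO of 128 bytes, declared and initialised statically. */
static DEFINE_KFIFO(example_fifo, unsigned char, 128);

static int __init example_init(void)
{
	unsigned char in[4] = { 1, 2, 3, 4 };
	unsigned char out[4];
	unsigned int copied;

	kfifo_in(&example_fifo, in, 4);		   /* enqueue 4 elements       */
	copied = kfifo_out(&example_fifo, out, 4); /* dequeue up to 4 elements */
	pr_info("kfifo example copied %u bytes\n", copied);
	return 0;
}

static void __exit example_exit(void)
{
	kfifo_reset(&example_fifo);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
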
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 550294d58a02..e35be53f6613 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void) | |||
334 | struct kprobe __kprobes *get_kprobe(void *addr) | 334 | struct kprobe __kprobes *get_kprobe(void *addr) |
335 | { | 335 | { |
336 | struct hlist_head *head; | 336 | struct hlist_head *head; |
337 | struct hlist_node *node; | ||
338 | struct kprobe *p; | 337 | struct kprobe *p; |
339 | 338 | ||
340 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; | 339 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; |
341 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 340 | hlist_for_each_entry_rcu(p, head, hlist) { |
342 | if (p->addr == addr) | 341 | if (p->addr == addr) |
343 | return p; | 342 | return p; |
344 | } | 343 | } |
@@ -799,7 +798,6 @@ out: | |||
799 | static void __kprobes optimize_all_kprobes(void) | 798 | static void __kprobes optimize_all_kprobes(void) |
800 | { | 799 | { |
801 | struct hlist_head *head; | 800 | struct hlist_head *head; |
802 | struct hlist_node *node; | ||
803 | struct kprobe *p; | 801 | struct kprobe *p; |
804 | unsigned int i; | 802 | unsigned int i; |
805 | 803 | ||
@@ -810,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void) | |||
810 | kprobes_allow_optimization = true; | 808 | kprobes_allow_optimization = true; |
811 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 809 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
812 | head = &kprobe_table[i]; | 810 | head = &kprobe_table[i]; |
813 | hlist_for_each_entry_rcu(p, node, head, hlist) | 811 | hlist_for_each_entry_rcu(p, head, hlist) |
814 | if (!kprobe_disabled(p)) | 812 | if (!kprobe_disabled(p)) |
815 | optimize_kprobe(p); | 813 | optimize_kprobe(p); |
816 | } | 814 | } |
@@ -821,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void) | |||
821 | static void __kprobes unoptimize_all_kprobes(void) | 819 | static void __kprobes unoptimize_all_kprobes(void) |
822 | { | 820 | { |
823 | struct hlist_head *head; | 821 | struct hlist_head *head; |
824 | struct hlist_node *node; | ||
825 | struct kprobe *p; | 822 | struct kprobe *p; |
826 | unsigned int i; | 823 | unsigned int i; |
827 | 824 | ||
@@ -832,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void) | |||
832 | kprobes_allow_optimization = false; | 829 | kprobes_allow_optimization = false; |
833 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 830 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
834 | head = &kprobe_table[i]; | 831 | head = &kprobe_table[i]; |
835 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 832 | hlist_for_each_entry_rcu(p, head, hlist) { |
836 | if (!kprobe_disabled(p)) | 833 | if (!kprobe_disabled(p)) |
837 | unoptimize_kprobe(p, false); | 834 | unoptimize_kprobe(p, false); |
838 | } | 835 | } |
@@ -1148,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
1148 | { | 1145 | { |
1149 | struct kretprobe_instance *ri; | 1146 | struct kretprobe_instance *ri; |
1150 | struct hlist_head *head, empty_rp; | 1147 | struct hlist_head *head, empty_rp; |
1151 | struct hlist_node *node, *tmp; | 1148 | struct hlist_node *tmp; |
1152 | unsigned long hash, flags = 0; | 1149 | unsigned long hash, flags = 0; |
1153 | 1150 | ||
1154 | if (unlikely(!kprobes_initialized)) | 1151 | if (unlikely(!kprobes_initialized)) |
@@ -1159,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
1159 | hash = hash_ptr(tk, KPROBE_HASH_BITS); | 1156 | hash = hash_ptr(tk, KPROBE_HASH_BITS); |
1160 | head = &kretprobe_inst_table[hash]; | 1157 | head = &kretprobe_inst_table[hash]; |
1161 | kretprobe_table_lock(hash, &flags); | 1158 | kretprobe_table_lock(hash, &flags); |
1162 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | 1159 | hlist_for_each_entry_safe(ri, tmp, head, hlist) { |
1163 | if (ri->task == tk) | 1160 | if (ri->task == tk) |
1164 | recycle_rp_inst(ri, &empty_rp); | 1161 | recycle_rp_inst(ri, &empty_rp); |
1165 | } | 1162 | } |
1166 | kretprobe_table_unlock(hash, &flags); | 1163 | kretprobe_table_unlock(hash, &flags); |
1167 | hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { | 1164 | hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { |
1168 | hlist_del(&ri->hlist); | 1165 | hlist_del(&ri->hlist); |
1169 | kfree(ri); | 1166 | kfree(ri); |
1170 | } | 1167 | } |
@@ -1173,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
1173 | static inline void free_rp_inst(struct kretprobe *rp) | 1170 | static inline void free_rp_inst(struct kretprobe *rp) |
1174 | { | 1171 | { |
1175 | struct kretprobe_instance *ri; | 1172 | struct kretprobe_instance *ri; |
1176 | struct hlist_node *pos, *next; | 1173 | struct hlist_node *next; |
1177 | 1174 | ||
1178 | hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { | 1175 | hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { |
1179 | hlist_del(&ri->hlist); | 1176 | hlist_del(&ri->hlist); |
1180 | kfree(ri); | 1177 | kfree(ri); |
1181 | } | 1178 | } |
@@ -1185,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
1185 | { | 1182 | { |
1186 | unsigned long flags, hash; | 1183 | unsigned long flags, hash; |
1187 | struct kretprobe_instance *ri; | 1184 | struct kretprobe_instance *ri; |
1188 | struct hlist_node *pos, *next; | 1185 | struct hlist_node *next; |
1189 | struct hlist_head *head; | 1186 | struct hlist_head *head; |
1190 | 1187 | ||
1191 | /* No race here */ | 1188 | /* No race here */ |
1192 | for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { | 1189 | for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { |
1193 | kretprobe_table_lock(hash, &flags); | 1190 | kretprobe_table_lock(hash, &flags); |
1194 | head = &kretprobe_inst_table[hash]; | 1191 | head = &kretprobe_inst_table[hash]; |
1195 | hlist_for_each_entry_safe(ri, pos, next, head, hlist) { | 1192 | hlist_for_each_entry_safe(ri, next, head, hlist) { |
1196 | if (ri->rp == rp) | 1193 | if (ri->rp == rp) |
1197 | ri->rp = NULL; | 1194 | ri->rp = NULL; |
1198 | } | 1195 | } |
@@ -2028,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, | |||
2028 | { | 2025 | { |
2029 | struct module *mod = data; | 2026 | struct module *mod = data; |
2030 | struct hlist_head *head; | 2027 | struct hlist_head *head; |
2031 | struct hlist_node *node; | ||
2032 | struct kprobe *p; | 2028 | struct kprobe *p; |
2033 | unsigned int i; | 2029 | unsigned int i; |
2034 | int checkcore = (val == MODULE_STATE_GOING); | 2030 | int checkcore = (val == MODULE_STATE_GOING); |
@@ -2045,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, | |||
2045 | mutex_lock(&kprobe_mutex); | 2041 | mutex_lock(&kprobe_mutex); |
2046 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2042 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
2047 | head = &kprobe_table[i]; | 2043 | head = &kprobe_table[i]; |
2048 | hlist_for_each_entry_rcu(p, node, head, hlist) | 2044 | hlist_for_each_entry_rcu(p, head, hlist) |
2049 | if (within_module_init((unsigned long)p->addr, mod) || | 2045 | if (within_module_init((unsigned long)p->addr, mod) || |
2050 | (checkcore && | 2046 | (checkcore && |
2051 | within_module_core((unsigned long)p->addr, mod))) { | 2047 | within_module_core((unsigned long)p->addr, mod))) { |
@@ -2192,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) | |||
2192 | static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | 2188 | static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) |
2193 | { | 2189 | { |
2194 | struct hlist_head *head; | 2190 | struct hlist_head *head; |
2195 | struct hlist_node *node; | ||
2196 | struct kprobe *p, *kp; | 2191 | struct kprobe *p, *kp; |
2197 | const char *sym = NULL; | 2192 | const char *sym = NULL; |
2198 | unsigned int i = *(loff_t *) v; | 2193 | unsigned int i = *(loff_t *) v; |
@@ -2201,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
2201 | 2196 | ||
2202 | head = &kprobe_table[i]; | 2197 | head = &kprobe_table[i]; |
2203 | preempt_disable(); | 2198 | preempt_disable(); |
2204 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2199 | hlist_for_each_entry_rcu(p, head, hlist) { |
2205 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, | 2200 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
2206 | &offset, &modname, namebuf); | 2201 | &offset, &modname, namebuf); |
2207 | if (kprobe_aggrprobe(p)) { | 2202 | if (kprobe_aggrprobe(p)) { |
@@ -2236,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = { | |||
2236 | static void __kprobes arm_all_kprobes(void) | 2231 | static void __kprobes arm_all_kprobes(void) |
2237 | { | 2232 | { |
2238 | struct hlist_head *head; | 2233 | struct hlist_head *head; |
2239 | struct hlist_node *node; | ||
2240 | struct kprobe *p; | 2234 | struct kprobe *p; |
2241 | unsigned int i; | 2235 | unsigned int i; |
2242 | 2236 | ||
@@ -2249,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void) | |||
2249 | /* Arming kprobes doesn't optimize kprobe itself */ | 2243 | /* Arming kprobes doesn't optimize kprobe itself */ |
2250 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2244 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
2251 | head = &kprobe_table[i]; | 2245 | head = &kprobe_table[i]; |
2252 | hlist_for_each_entry_rcu(p, node, head, hlist) | 2246 | hlist_for_each_entry_rcu(p, head, hlist) |
2253 | if (!kprobe_disabled(p)) | 2247 | if (!kprobe_disabled(p)) |
2254 | arm_kprobe(p); | 2248 | arm_kprobe(p); |
2255 | } | 2249 | } |
@@ -2265,7 +2259,6 @@ already_enabled: | |||
2265 | static void __kprobes disarm_all_kprobes(void) | 2259 | static void __kprobes disarm_all_kprobes(void) |
2266 | { | 2260 | { |
2267 | struct hlist_head *head; | 2261 | struct hlist_head *head; |
2268 | struct hlist_node *node; | ||
2269 | struct kprobe *p; | 2262 | struct kprobe *p; |
2270 | unsigned int i; | 2263 | unsigned int i; |
2271 | 2264 | ||
@@ -2282,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void) | |||
2282 | 2275 | ||
2283 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2276 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
2284 | head = &kprobe_table[i]; | 2277 | head = &kprobe_table[i]; |
2285 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2278 | hlist_for_each_entry_rcu(p, head, hlist) { |
2286 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2279 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
2287 | disarm_kprobe(p, false); | 2280 | disarm_kprobe(p, false); |
2288 | } | 2281 | } |
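
Every kprobes hunk above is the same mechanical conversion: the hlist iterator macros no longer take a separate struct hlist_node cursor, so the local "node" variable disappears and the macros lose one argument. A standalone sketch of the post-conversion form on a made-up hash bucket (the struct and function are illustrative, not from kprobes):

#include <linux/list.h>
#include <linux/rculist.h>

struct example_item {
	int key;
	struct hlist_node hlist;
};

/* Caller must hold rcu_read_lock(). The old macro needed an extra cursor:
 * hlist_for_each_entry_rcu(p, node, head, hlist). */
static struct example_item *example_lookup(struct hlist_head *head, int key)
{
	struct example_item *p;

	hlist_for_each_entry_rcu(p, head, hlist) {
		if (p->key == key)
			return p;
	}
	return NULL;
}
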
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b2350d..259db207b5d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3190 | #endif | 3190 | #endif |
3191 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { | 3191 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { |
3192 | debug_locks_off(); | 3192 | debug_locks_off(); |
3193 | printk("BUG: MAX_LOCK_DEPTH too low!\n"); | 3193 | printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", |
3194 | curr->lockdep_depth, MAX_LOCK_DEPTH); | ||
3194 | printk("turning off the locking correctness validator.\n"); | 3195 | printk("turning off the locking correctness validator.\n"); |
3196 | |||
3197 | lockdep_print_held_locks(current); | ||
3198 | debug_show_all_locks(); | ||
3195 | dump_stack(); | 3199 | dump_stack(); |
3200 | |||
3196 | return 0; | 3201 | return 0; |
3197 | } | 3202 | } |
3198 | 3203 | ||
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3203 | } | 3208 | } |
3204 | 3209 | ||
3205 | static int | 3210 | static int |
3206 | print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | 3211 | print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, |
3207 | unsigned long ip) | 3212 | unsigned long ip) |
3208 | { | 3213 | { |
3209 | if (!debug_locks_off()) | 3214 | if (!debug_locks_off()) |
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | |||
3246 | return 0; | 3251 | return 0; |
3247 | 3252 | ||
3248 | if (curr->lockdep_depth <= 0) | 3253 | if (curr->lockdep_depth <= 0) |
3249 | return print_unlock_inbalance_bug(curr, lock, ip); | 3254 | return print_unlock_imbalance_bug(curr, lock, ip); |
3250 | 3255 | ||
3251 | return 1; | 3256 | return 1; |
3252 | } | 3257 | } |
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
3317 | goto found_it; | 3322 | goto found_it; |
3318 | prev_hlock = hlock; | 3323 | prev_hlock = hlock; |
3319 | } | 3324 | } |
3320 | return print_unlock_inbalance_bug(curr, lock, ip); | 3325 | return print_unlock_imbalance_bug(curr, lock, ip); |
3321 | 3326 | ||
3322 | found_it: | 3327 | found_it: |
3323 | lockdep_init_map(lock, name, key, 0); | 3328 | lockdep_init_map(lock, name, key, 0); |
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr, | |||
3384 | goto found_it; | 3389 | goto found_it; |
3385 | prev_hlock = hlock; | 3390 | prev_hlock = hlock; |
3386 | } | 3391 | } |
3387 | return print_unlock_inbalance_bug(curr, lock, ip); | 3392 | return print_unlock_imbalance_bug(curr, lock, ip); |
3388 | 3393 | ||
3389 | found_it: | 3394 | found_it: |
3390 | if (hlock->instance == lock) | 3395 | if (hlock->instance == lock) |
@@ -4083,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
4083 | } | 4088 | } |
4084 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); | 4089 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); |
4085 | 4090 | ||
4086 | static void print_held_locks_bug(struct task_struct *curr) | 4091 | static void print_held_locks_bug(void) |
4087 | { | 4092 | { |
4088 | if (!debug_locks_off()) | 4093 | if (!debug_locks_off()) |
4089 | return; | 4094 | return; |
@@ -4092,22 +4097,21 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
4092 | 4097 | ||
4093 | printk("\n"); | 4098 | printk("\n"); |
4094 | printk("=====================================\n"); | 4099 | printk("=====================================\n"); |
4095 | printk("[ BUG: lock held at task exit time! ]\n"); | 4100 | printk("[ BUG: %s/%d still has locks held! ]\n", |
4101 | current->comm, task_pid_nr(current)); | ||
4096 | print_kernel_ident(); | 4102 | print_kernel_ident(); |
4097 | printk("-------------------------------------\n"); | 4103 | printk("-------------------------------------\n"); |
4098 | printk("%s/%d is exiting with locks still held!\n", | 4104 | lockdep_print_held_locks(current); |
4099 | curr->comm, task_pid_nr(curr)); | ||
4100 | lockdep_print_held_locks(curr); | ||
4101 | |||
4102 | printk("\nstack backtrace:\n"); | 4105 | printk("\nstack backtrace:\n"); |
4103 | dump_stack(); | 4106 | dump_stack(); |
4104 | } | 4107 | } |
4105 | 4108 | ||
4106 | void debug_check_no_locks_held(struct task_struct *task) | 4109 | void debug_check_no_locks_held(void) |
4107 | { | 4110 | { |
4108 | if (unlikely(task->lockdep_depth > 0)) | 4111 | if (unlikely(current->lockdep_depth > 0)) |
4109 | print_held_locks_bug(task); | 4112 | print_held_locks_bug(); |
4110 | } | 4113 | } |
4114 | EXPORT_SYMBOL_GPL(debug_check_no_locks_held); | ||
4111 | 4115 | ||
4112 | void debug_show_all_locks(void) | 4116 | void debug_show_all_locks(void) |
4113 | { | 4117 | { |
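
Besides the richer MAX_LOCK_DEPTH report, the lockdep change turns debug_check_no_locks_held() into an argumentless, exported helper that always inspects current, so it can now be called from modules and from places other than task exit. A minimal sketch of a caller (the surrounding function is hypothetical):

#include <linux/debug_locks.h>

/* Illustrative: warn if the calling task still holds lockdep-tracked locks
 * at a point where it must not. */
static void example_assert_no_locks(void)
{
	/* Old form: debug_check_no_locks_held(current);
	 * the new form takes no argument and checks current itself. */
	debug_check_no_locks_held();
}
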
diff --git a/kernel/module.c b/kernel/module.c index eab08274ec9b..0925c9a71975 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod) | |||
197 | return -ENOENT; | 197 | return -ENOENT; |
198 | } | 198 | } |
199 | 199 | ||
200 | static inline void add_taint_module(struct module *mod, unsigned flag) | 200 | static inline void add_taint_module(struct module *mod, unsigned flag, |
201 | enum lockdep_ok lockdep_ok) | ||
201 | { | 202 | { |
202 | add_taint(flag); | 203 | add_taint(flag, lockdep_ok); |
203 | mod->taints |= (1U << flag); | 204 | mod->taints |= (1U << flag); |
204 | } | 205 | } |
205 | 206 | ||
@@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags) | |||
727 | { | 728 | { |
728 | int ret = (flags & O_TRUNC); | 729 | int ret = (flags & O_TRUNC); |
729 | if (ret) | 730 | if (ret) |
730 | add_taint(TAINT_FORCED_RMMOD); | 731 | add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); |
731 | return ret; | 732 | return ret; |
732 | } | 733 | } |
733 | #else | 734 | #else |
@@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason) | |||
1138 | if (!test_taint(TAINT_FORCED_MODULE)) | 1139 | if (!test_taint(TAINT_FORCED_MODULE)) |
1139 | printk(KERN_WARNING "%s: %s: kernel tainted.\n", | 1140 | printk(KERN_WARNING "%s: %s: kernel tainted.\n", |
1140 | mod->name, reason); | 1141 | mod->name, reason); |
1141 | add_taint_module(mod, TAINT_FORCED_MODULE); | 1142 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); |
1142 | return 0; | 1143 | return 0; |
1143 | #else | 1144 | #else |
1144 | return -ENOEXEC; | 1145 | return -ENOEXEC; |
@@ -2147,7 +2148,8 @@ static void set_license(struct module *mod, const char *license) | |||
2147 | if (!test_taint(TAINT_PROPRIETARY_MODULE)) | 2148 | if (!test_taint(TAINT_PROPRIETARY_MODULE)) |
2148 | printk(KERN_WARNING "%s: module license '%s' taints " | 2149 | printk(KERN_WARNING "%s: module license '%s' taints " |
2149 | "kernel.\n", mod->name, license); | 2150 | "kernel.\n", mod->name, license); |
2150 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2151 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
2152 | LOCKDEP_NOW_UNRELIABLE); | ||
2151 | } | 2153 | } |
2152 | } | 2154 | } |
2153 | 2155 | ||
@@ -2539,7 +2541,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
2539 | if (err) | 2541 | if (err) |
2540 | goto out; | 2542 | goto out; |
2541 | 2543 | ||
2542 | err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); | 2544 | err = vfs_getattr(&file->f_path, &stat); |
2543 | if (err) | 2545 | if (err) |
2544 | goto out; | 2546 | goto out; |
2545 | 2547 | ||
@@ -2700,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
2700 | } | 2702 | } |
2701 | 2703 | ||
2702 | if (!get_modinfo(info, "intree")) | 2704 | if (!get_modinfo(info, "intree")) |
2703 | add_taint_module(mod, TAINT_OOT_MODULE); | 2705 | add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); |
2704 | 2706 | ||
2705 | if (get_modinfo(info, "staging")) { | 2707 | if (get_modinfo(info, "staging")) { |
2706 | add_taint_module(mod, TAINT_CRAP); | 2708 | add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); |
2707 | printk(KERN_WARNING "%s: module is from the staging directory," | 2709 | printk(KERN_WARNING "%s: module is from the staging directory," |
2708 | " the quality is unknown, you have been warned.\n", | 2710 | " the quality is unknown, you have been warned.\n", |
2709 | mod->name); | 2711 | mod->name); |
@@ -2869,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod) | |||
2869 | * using GPL-only symbols it needs. | 2871 | * using GPL-only symbols it needs. |
2870 | */ | 2872 | */ |
2871 | if (strcmp(mod->name, "ndiswrapper") == 0) | 2873 | if (strcmp(mod->name, "ndiswrapper") == 0) |
2872 | add_taint(TAINT_PROPRIETARY_MODULE); | 2874 | add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); |
2873 | 2875 | ||
2874 | /* driverloader was caught wrongly pretending to be under GPL */ | 2876 | /* driverloader was caught wrongly pretending to be under GPL */ |
2875 | if (strcmp(mod->name, "driverloader") == 0) | 2877 | if (strcmp(mod->name, "driverloader") == 0) |
2876 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2878 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
2879 | LOCKDEP_NOW_UNRELIABLE); | ||
2877 | 2880 | ||
2878 | /* lve claims to be GPL but upstream won't provide source */ | 2881 | /* lve claims to be GPL but upstream won't provide source */ |
2879 | if (strcmp(mod->name, "lve") == 0) | 2882 | if (strcmp(mod->name, "lve") == 0) |
2880 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2883 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE, |
2884 | LOCKDEP_NOW_UNRELIABLE); | ||
2881 | 2885 | ||
2882 | #ifdef CONFIG_MODVERSIONS | 2886 | #ifdef CONFIG_MODVERSIONS |
2883 | if ((mod->num_syms && !mod->crcs) | 2887 | if ((mod->num_syms && !mod->crcs) |
@@ -3141,12 +3145,72 @@ static int may_init_module(void) | |||
3141 | return 0; | 3145 | return 0; |
3142 | } | 3146 | } |
3143 | 3147 | ||
3148 | /* | ||
3149 | * We try to place it in the list now to make sure it's unique before | ||
3150 | * we dedicate too many resources; in particular, this avoids temporary | ||
3151 | * percpu memory exhaustion. | ||
3152 | */ | ||
3153 | static int add_unformed_module(struct module *mod) | ||
3154 | { | ||
3155 | int err; | ||
3156 | struct module *old; | ||
3157 | |||
3158 | mod->state = MODULE_STATE_UNFORMED; | ||
3159 | |||
3160 | again: | ||
3161 | mutex_lock(&module_mutex); | ||
3162 | if ((old = find_module_all(mod->name, true)) != NULL) { | ||
3163 | if (old->state == MODULE_STATE_COMING | ||
3164 | || old->state == MODULE_STATE_UNFORMED) { | ||
3165 | /* Wait in case it fails to load. */ | ||
3166 | mutex_unlock(&module_mutex); | ||
3167 | err = wait_event_interruptible(module_wq, | ||
3168 | finished_loading(mod->name)); | ||
3169 | if (err) | ||
3170 | goto out_unlocked; | ||
3171 | goto again; | ||
3172 | } | ||
3173 | err = -EEXIST; | ||
3174 | goto out; | ||
3175 | } | ||
3176 | list_add_rcu(&mod->list, &modules); | ||
3177 | err = 0; | ||
3178 | |||
3179 | out: | ||
3180 | mutex_unlock(&module_mutex); | ||
3181 | out_unlocked: | ||
3182 | return err; | ||
3183 | } | ||
3184 | |||
3185 | static int complete_formation(struct module *mod, struct load_info *info) | ||
3186 | { | ||
3187 | int err; | ||
3188 | |||
3189 | mutex_lock(&module_mutex); | ||
3190 | |||
3191 | /* Find duplicate symbols (must be called under lock). */ | ||
3192 | err = verify_export_symbols(mod); | ||
3193 | if (err < 0) | ||
3194 | goto out; | ||
3195 | |||
3196 | /* This relies on module_mutex for list integrity. */ | ||
3197 | module_bug_finalize(info->hdr, info->sechdrs, mod); | ||
3198 | |||
3199 | /* Mark state as coming so strong_try_module_get() ignores us, | ||
3200 | * but kallsyms etc. can see us. */ | ||
3201 | mod->state = MODULE_STATE_COMING; | ||
3202 | |||
3203 | out: | ||
3204 | mutex_unlock(&module_mutex); | ||
3205 | return err; | ||
3206 | } | ||
3207 | |||
3144 | /* Allocate and load the module: note that size of section 0 is always | 3208 | /* Allocate and load the module: note that size of section 0 is always |
3145 | zero, and we rely on this for optional sections. */ | 3209 | zero, and we rely on this for optional sections. */ |
3146 | static int load_module(struct load_info *info, const char __user *uargs, | 3210 | static int load_module(struct load_info *info, const char __user *uargs, |
3147 | int flags) | 3211 | int flags) |
3148 | { | 3212 | { |
3149 | struct module *mod, *old; | 3213 | struct module *mod; |
3150 | long err; | 3214 | long err; |
3151 | 3215 | ||
3152 | err = module_sig_check(info); | 3216 | err = module_sig_check(info); |
@@ -3164,36 +3228,20 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3164 | goto free_copy; | 3228 | goto free_copy; |
3165 | } | 3229 | } |
3166 | 3230 | ||
3167 | /* | 3231 | /* Reserve our place in the list. */ |
3168 | * We try to place it in the list now to make sure it's unique | 3232 | err = add_unformed_module(mod); |
3169 | * before we dedicate too many resources. In particular, | 3233 | if (err) |
3170 | * temporary percpu memory exhaustion. | ||
3171 | */ | ||
3172 | mod->state = MODULE_STATE_UNFORMED; | ||
3173 | again: | ||
3174 | mutex_lock(&module_mutex); | ||
3175 | if ((old = find_module_all(mod->name, true)) != NULL) { | ||
3176 | if (old->state == MODULE_STATE_COMING | ||
3177 | || old->state == MODULE_STATE_UNFORMED) { | ||
3178 | /* Wait in case it fails to load. */ | ||
3179 | mutex_unlock(&module_mutex); | ||
3180 | err = wait_event_interruptible(module_wq, | ||
3181 | finished_loading(mod->name)); | ||
3182 | if (err) | ||
3183 | goto free_module; | ||
3184 | goto again; | ||
3185 | } | ||
3186 | err = -EEXIST; | ||
3187 | mutex_unlock(&module_mutex); | ||
3188 | goto free_module; | 3234 | goto free_module; |
3189 | } | ||
3190 | list_add_rcu(&mod->list, &modules); | ||
3191 | mutex_unlock(&module_mutex); | ||
3192 | 3235 | ||
3193 | #ifdef CONFIG_MODULE_SIG | 3236 | #ifdef CONFIG_MODULE_SIG |
3194 | mod->sig_ok = info->sig_ok; | 3237 | mod->sig_ok = info->sig_ok; |
3195 | if (!mod->sig_ok) | 3238 | if (!mod->sig_ok) { |
3196 | add_taint_module(mod, TAINT_FORCED_MODULE); | 3239 | printk_once(KERN_NOTICE |
3240 | "%s: module verification failed: signature and/or" | ||
3241 | " required key missing - tainting kernel\n", | ||
3242 | mod->name); | ||
3243 | add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); | ||
3244 | } | ||
3197 | #endif | 3245 | #endif |
3198 | 3246 | ||
3199 | /* Now module is in final location, initialize linked lists, etc. */ | 3247 | /* Now module is in final location, initialize linked lists, etc. */ |
@@ -3236,21 +3284,11 @@ again: | |||
3236 | 3284 | ||
3237 | dynamic_debug_setup(info->debug, info->num_debug); | 3285 | dynamic_debug_setup(info->debug, info->num_debug); |
3238 | 3286 | ||
3239 | mutex_lock(&module_mutex); | 3287 | /* Finally it's fully formed, ready to start executing. */ |
3240 | /* Find duplicate symbols (must be called under lock). */ | 3288 | err = complete_formation(mod, info); |
3241 | err = verify_export_symbols(mod); | 3289 | if (err) |
3242 | if (err < 0) | ||
3243 | goto ddebug_cleanup; | 3290 | goto ddebug_cleanup; |
3244 | 3291 | ||
3245 | /* This relies on module_mutex for list integrity. */ | ||
3246 | module_bug_finalize(info->hdr, info->sechdrs, mod); | ||
3247 | |||
3248 | /* Mark state as coming so strong_try_module_get() ignores us, | ||
3249 | * but kallsyms etc. can see us. */ | ||
3250 | mod->state = MODULE_STATE_COMING; | ||
3251 | |||
3252 | mutex_unlock(&module_mutex); | ||
3253 | |||
3254 | /* Module is ready to execute: parsing args may do that. */ | 3292 | /* Module is ready to execute: parsing args may do that. */ |
3255 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 3293 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
3256 | -32768, 32767, &ddebug_dyndbg_module_param_cb); | 3294 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
@@ -3274,8 +3312,8 @@ again: | |||
3274 | /* module_bug_cleanup needs module_mutex protection */ | 3312 | /* module_bug_cleanup needs module_mutex protection */ |
3275 | mutex_lock(&module_mutex); | 3313 | mutex_lock(&module_mutex); |
3276 | module_bug_cleanup(mod); | 3314 | module_bug_cleanup(mod); |
3277 | ddebug_cleanup: | ||
3278 | mutex_unlock(&module_mutex); | 3315 | mutex_unlock(&module_mutex); |
3316 | ddebug_cleanup: | ||
3279 | dynamic_debug_remove(info->debug); | 3317 | dynamic_debug_remove(info->debug); |
3280 | synchronize_sched(); | 3318 | synchronize_sched(); |
3281 | kfree(mod->args); | 3319 | kfree(mod->args); |
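
Two API shifts run through the module.c hunks: every add_taint()/add_taint_module() caller now states explicitly whether lockdep remains trustworthy, and copy_module_from_fd() uses the two-argument vfs_getattr() that takes a struct path. A hedged sketch of the latter on an already-open file (the helper is hypothetical):

#include <linux/fs.h>
#include <linux/stat.h>

/* Illustrative: read the size of an open file with the new-style call. */
static int example_file_size(struct file *file, loff_t *size)
{
	struct kstat stat;
	int err;

	/* Old form: vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); */
	err = vfs_getattr(&file->f_path, &stat);
	if (err)
		return err;

	*size = stat.size;
	return 0;
}
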
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 78e2ecb20165..afc0456f227a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
153 | goto out; | 153 | goto out; |
154 | } | 154 | } |
155 | 155 | ||
156 | new_ns = create_new_namespaces(flags, tsk, | 156 | new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); |
157 | task_cred_xxx(tsk, user_ns), tsk->fs); | ||
158 | if (IS_ERR(new_ns)) { | 157 | if (IS_ERR(new_ns)) { |
159 | err = PTR_ERR(new_ns); | 158 | err = PTR_ERR(new_ns); |
160 | goto out; | 159 | goto out; |
@@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) | |||
251 | return PTR_ERR(file); | 250 | return PTR_ERR(file); |
252 | 251 | ||
253 | err = -EINVAL; | 252 | err = -EINVAL; |
254 | ei = PROC_I(file->f_dentry->d_inode); | 253 | ei = PROC_I(file_inode(file)); |
255 | ops = ei->ns_ops; | 254 | ops = ei->ns_ops; |
256 | if (nstype && (ops->type != nstype)) | 255 | if (nstype && (ops->type != nstype)) |
257 | goto out; | 256 | goto out; |
diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff97..7c57cc9eee2c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -259,26 +259,19 @@ unsigned long get_taint(void) | |||
259 | return tainted_mask; | 259 | return tainted_mask; |
260 | } | 260 | } |
261 | 261 | ||
262 | void add_taint(unsigned flag) | 262 | /** |
263 | * add_taint: add a taint flag if not already set. | ||
264 | * @flag: one of the TAINT_* constants. | ||
265 | * @lockdep_ok: whether lock debugging is still OK. | ||
266 | * | ||
267 | * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE, | ||
268 | * but for some noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK. | ||
269 | */ | ||
270 | void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) | ||
263 | { | 271 | { |
264 | /* | 272 | if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) |
265 | * Can't trust the integrity of the kernel anymore. | 273 | printk(KERN_WARNING |
266 | * We don't call directly debug_locks_off() because the issue | 274 | "Disabling lock debugging due to kernel taint\n"); |
267 | * is not necessarily serious enough to set oops_in_progress to 1 | ||
268 | * Also we want to keep up lockdep for staging/out-of-tree | ||
269 | * development and post-warning case. | ||
270 | */ | ||
271 | switch (flag) { | ||
272 | case TAINT_CRAP: | ||
273 | case TAINT_OOT_MODULE: | ||
274 | case TAINT_WARN: | ||
275 | case TAINT_FIRMWARE_WORKAROUND: | ||
276 | break; | ||
277 | |||
278 | default: | ||
279 | if (__debug_locks_off()) | ||
280 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | ||
281 | } | ||
282 | 275 | ||
283 | set_bit(flag, &tainted_mask); | 276 | set_bit(flag, &tainted_mask); |
284 | } | 277 | } |
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller, | |||
421 | print_modules(); | 414 | print_modules(); |
422 | dump_stack(); | 415 | dump_stack(); |
423 | print_oops_end_marker(); | 416 | print_oops_end_marker(); |
424 | add_taint(taint); | 417 | /* Just a warning, don't kill lockdep. */ |
418 | add_taint(taint, LOCKDEP_STILL_OK); | ||
425 | } | 419 | } |
426 | 420 | ||
427 | void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) | 421 | void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) |
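
The rewritten add_taint() replaces the hard-coded whitelist of "harmless" taint flags with an explicit argument from each caller: LOCKDEP_NOW_UNRELIABLE switches lock debugging off because the kernel's state can no longer be trusted, while LOCKDEP_STILL_OK keeps lockdep running for merely noteworthy taints. A short sketch of both flavours around a made-up condition:

#include <linux/kernel.h>

static void example_report_quirk(bool corrupts_state)
{
	if (corrupts_state)
		/* State may be corrupted: lockdep results would be suspect. */
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE);
	else
		/* Noteworthy but harmless: keep lock debugging alive. */
		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
}
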
diff --git a/kernel/pid.c b/kernel/pid.c index f2c6a6825098..047dc6264638 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -350,10 +350,9 @@ void disable_pid_allocation(struct pid_namespace *ns) | |||
350 | 350 | ||
351 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) | 351 | struct pid *find_pid_ns(int nr, struct pid_namespace *ns) |
352 | { | 352 | { |
353 | struct hlist_node *elem; | ||
354 | struct upid *pnr; | 353 | struct upid *pnr; |
355 | 354 | ||
356 | hlist_for_each_entry_rcu(pnr, elem, | 355 | hlist_for_each_entry_rcu(pnr, |
357 | &pid_hash[pid_hashfn(nr, ns)], pid_chain) | 356 | &pid_hash[pid_hashfn(nr, ns)], pid_chain) |
358 | if (pnr->nr == nr && pnr->ns == ns) | 357 | if (pnr->nr == nr && pnr->ns == ns) |
359 | return container_of(pnr, struct pid, | 358 | return container_of(pnr, struct pid, |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10349d5f2ec3..6edbb2c55c22 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
552 | return -EAGAIN; | 552 | return -EAGAIN; |
553 | 553 | ||
554 | spin_lock_init(&new_timer->it_lock); | 554 | spin_lock_init(&new_timer->it_lock); |
555 | retry: | 555 | |
556 | if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { | 556 | idr_preload(GFP_KERNEL); |
557 | error = -EAGAIN; | ||
558 | goto out; | ||
559 | } | ||
560 | spin_lock_irq(&idr_lock); | 557 | spin_lock_irq(&idr_lock); |
561 | error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); | 558 | error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); |
562 | spin_unlock_irq(&idr_lock); | 559 | spin_unlock_irq(&idr_lock); |
563 | if (error) { | 560 | idr_preload_end(); |
564 | if (error == -EAGAIN) | 561 | if (error < 0) { |
565 | goto retry; | ||
566 | /* | 562 | /* |
567 | * Weird looking, but we return EAGAIN if the IDR is | 563 | * Weird looking, but we return EAGAIN if the IDR is |
568 | * full (proper POSIX return value for this) | 564 | * full (proper POSIX return value for this) |
569 | */ | 565 | */ |
570 | error = -EAGAIN; | 566 | if (error == -ENOSPC) |
567 | error = -EAGAIN; | ||
571 | goto out; | 568 | goto out; |
572 | } | 569 | } |
570 | new_timer_id = error; | ||
573 | 571 | ||
574 | it_id_set = IT_ID_SET; | 572 | it_id_set = IT_ID_SET; |
575 | new_timer->it_id = (timer_t) new_timer_id; | 573 | new_timer->it_id = (timer_t) new_timer_id; |
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | |||
639 | { | 637 | { |
640 | struct k_itimer *timr; | 638 | struct k_itimer *timr; |
641 | 639 | ||
640 | /* | ||
641 | * timer_t could be any type >= int and we want to make sure any | ||
642 | * @timer_id outside positive int range fails lookup. | ||
643 | */ | ||
644 | if ((unsigned long long)timer_id > INT_MAX) | ||
645 | return NULL; | ||
646 | |||
642 | rcu_read_lock(); | 647 | rcu_read_lock(); |
643 | timr = idr_find(&posix_timers_id, (int)timer_id); | 648 | timr = idr_find(&posix_timers_id, (int)timer_id); |
644 | if (timr) { | 649 | if (timr) { |
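The timer_create() hunk above replaces the old idr_pre_get()/idr_get_new() retry loop with the idr_preload()/idr_alloc() pattern. A hedged, generic sketch of that pattern; example_idr and example_lock are hypothetical.

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(example_idr);
static DEFINE_SPINLOCK(example_lock);

static int example_alloc_id(void *ptr)
{
	int id;

	idr_preload(GFP_KERNEL);	/* may sleep: preallocate outside the lock */
	spin_lock(&example_lock);
	id = idr_alloc(&example_idr, ptr, 0, 0, GFP_NOWAIT);
	spin_unlock(&example_lock);
	idr_preload_end();

	/* >= 0 on success, -ENOSPC when the id space is full, -ENOMEM otherwise. */
	return id;
}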
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e2..c6422ffeda9a 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c | |||
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend); | |||
66 | 66 | ||
67 | void queue_up_suspend_work(void) | 67 | void queue_up_suspend_work(void) |
68 | { | 68 | { |
69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | 69 | if (autosleep_state > PM_SUSPEND_ON) |
70 | queue_work(autosleep_wq, &suspend_work); | 70 | queue_work(autosleep_wq, &suspend_work); |
71 | } | 71 | } |
72 | 72 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de1..d77663bfedeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
313 | static suspend_state_t decode_state(const char *buf, size_t n) | 313 | static suspend_state_t decode_state(const char *buf, size_t n) |
314 | { | 314 | { |
315 | #ifdef CONFIG_SUSPEND | 315 | #ifdef CONFIG_SUSPEND |
316 | suspend_state_t state = PM_SUSPEND_STANDBY; | 316 | suspend_state_t state = PM_SUSPEND_MIN; |
317 | const char * const *s; | 317 | const char * const *s; |
318 | #endif | 318 | #endif |
319 | char *p; | 319 | char *p; |
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match); | |||
553 | 553 | ||
554 | #endif /* CONFIG_PM_TRACE */ | 554 | #endif /* CONFIG_PM_TRACE */ |
555 | 555 | ||
556 | #ifdef CONFIG_FREEZER | ||
557 | static ssize_t pm_freeze_timeout_show(struct kobject *kobj, | ||
558 | struct kobj_attribute *attr, char *buf) | ||
559 | { | ||
560 | return sprintf(buf, "%u\n", freeze_timeout_msecs); | ||
561 | } | ||
562 | |||
563 | static ssize_t pm_freeze_timeout_store(struct kobject *kobj, | ||
564 | struct kobj_attribute *attr, | ||
565 | const char *buf, size_t n) | ||
566 | { | ||
567 | unsigned long val; | ||
568 | |||
569 | if (kstrtoul(buf, 10, &val)) | ||
570 | return -EINVAL; | ||
571 | |||
572 | freeze_timeout_msecs = val; | ||
573 | return n; | ||
574 | } | ||
575 | |||
576 | power_attr(pm_freeze_timeout); | ||
577 | |||
578 | #endif /* CONFIG_FREEZER*/ | ||
579 | |||
556 | static struct attribute * g[] = { | 580 | static struct attribute * g[] = { |
557 | &state_attr.attr, | 581 | &state_attr.attr, |
558 | #ifdef CONFIG_PM_TRACE | 582 | #ifdef CONFIG_PM_TRACE |
@@ -576,6 +600,9 @@ static struct attribute * g[] = { | |||
576 | &pm_print_times_attr.attr, | 600 | &pm_print_times_attr.attr, |
577 | #endif | 601 | #endif |
578 | #endif | 602 | #endif |
603 | #ifdef CONFIG_FREEZER | ||
604 | &pm_freeze_timeout_attr.attr, | ||
605 | #endif | ||
579 | NULL, | 606 | NULL, |
580 | }; | 607 | }; |
581 | 608 | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6f..98088e0e71e8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -21,7 +21,7 @@ | |||
21 | /* | 21 | /* |
22 | * Timeout for stopping processes | 22 | * Timeout for stopping processes |
23 | */ | 23 | */ |
24 | #define TIMEOUT (20 * HZ) | 24 | unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; |
25 | 25 | ||
26 | static int try_to_freeze_tasks(bool user_only) | 26 | static int try_to_freeze_tasks(bool user_only) |
27 | { | 27 | { |
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
36 | 36 | ||
37 | do_gettimeofday(&start); | 37 | do_gettimeofday(&start); |
38 | 38 | ||
39 | end_time = jiffies + TIMEOUT; | 39 | end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); |
40 | 40 | ||
41 | if (!user_only) | 41 | if (!user_only) |
42 | freeze_workqueues_begin(); | 42 | freeze_workqueues_begin(); |
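Taken together, the two hunks above turn the hard-coded 20-second freezer timeout into the tunable freeze_timeout_msecs, exposed through the new pm_freeze_timeout attribute. A hedged userspace sketch, assuming the attribute appears under /sys/power/ like the other power_attr() files.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/power/pm_freeze_timeout", "w");

	if (!f)
		return 1;
	/* Value is in milliseconds: allow the freezer 30 s instead of the default 20 s. */
	fprintf(f, "%u\n", 30000);
	return fclose(f) ? 1 : 0;
}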
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad6..587dddeebf15 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
359 | return; | 359 | return; |
360 | } | 360 | } |
361 | 361 | ||
362 | if (delayed_work_pending(&req->work)) | 362 | cancel_delayed_work_sync(&req->work); |
363 | cancel_delayed_work_sync(&req->work); | ||
364 | 363 | ||
365 | if (new_value != req->node.prio) | 364 | if (new_value != req->node.prio) |
366 | pm_qos_update_target( | 365 | pm_qos_update_target( |
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, | |||
386 | "%s called for unknown object.", __func__)) | 385 | "%s called for unknown object.", __func__)) |
387 | return; | 386 | return; |
388 | 387 | ||
389 | if (delayed_work_pending(&req->work)) | 388 | cancel_delayed_work_sync(&req->work); |
390 | cancel_delayed_work_sync(&req->work); | ||
391 | 389 | ||
392 | if (new_value != req->node.prio) | 390 | if (new_value != req->node.prio) |
393 | pm_qos_update_target( | 391 | pm_qos_update_target( |
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) | |||
416 | return; | 414 | return; |
417 | } | 415 | } |
418 | 416 | ||
419 | if (delayed_work_pending(&req->work)) | 417 | cancel_delayed_work_sync(&req->work); |
420 | cancel_delayed_work_sync(&req->work); | ||
421 | 418 | ||
422 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, | 419 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, |
423 | &req->node, PM_QOS_REMOVE_REQ, | 420 | &req->node, PM_QOS_REMOVE_REQ, |
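These three hunks drop the delayed_work_pending() pre-check: cancel_delayed_work_sync() already copes with idle work, and the pre-check is racy anyway. A hedged sketch of the resulting idiom; the teardown helper is hypothetical.

#include <linux/printk.h>
#include <linux/workqueue.h>

static void example_teardown(struct delayed_work *dw)
{
	/* Safe to call unconditionally: it always waits for a running callback
	 * to finish and returns true only if the work item was still pending. */
	if (cancel_delayed_work_sync(dw))
		pr_debug("cancelled a pending delayed work item\n");
}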
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..d4feda084a3a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -30,12 +30,38 @@ | |||
30 | #include "power.h" | 30 | #include "power.h" |
31 | 31 | ||
32 | const char *const pm_states[PM_SUSPEND_MAX] = { | 32 | const char *const pm_states[PM_SUSPEND_MAX] = { |
33 | [PM_SUSPEND_FREEZE] = "freeze", | ||
33 | [PM_SUSPEND_STANDBY] = "standby", | 34 | [PM_SUSPEND_STANDBY] = "standby", |
34 | [PM_SUSPEND_MEM] = "mem", | 35 | [PM_SUSPEND_MEM] = "mem", |
35 | }; | 36 | }; |
36 | 37 | ||
37 | static const struct platform_suspend_ops *suspend_ops; | 38 | static const struct platform_suspend_ops *suspend_ops; |
38 | 39 | ||
40 | static bool need_suspend_ops(suspend_state_t state) | ||
41 | { | ||
42 | return !!(state > PM_SUSPEND_FREEZE); | ||
43 | } | ||
44 | |||
45 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | ||
46 | static bool suspend_freeze_wake; | ||
47 | |||
48 | static void freeze_begin(void) | ||
49 | { | ||
50 | suspend_freeze_wake = false; | ||
51 | } | ||
52 | |||
53 | static void freeze_enter(void) | ||
54 | { | ||
55 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); | ||
56 | } | ||
57 | |||
58 | void freeze_wake(void) | ||
59 | { | ||
60 | suspend_freeze_wake = true; | ||
61 | wake_up(&suspend_freeze_wait_head); | ||
62 | } | ||
63 | EXPORT_SYMBOL_GPL(freeze_wake); | ||
64 | |||
39 | /** | 65 | /** |
40 | * suspend_set_ops - Set the global suspend method table. | 66 | * suspend_set_ops - Set the global suspend method table. |
41 | * @ops: Suspend operations to use. | 67 | * @ops: Suspend operations to use. |
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); | |||
50 | 76 | ||
51 | bool valid_state(suspend_state_t state) | 77 | bool valid_state(suspend_state_t state) |
52 | { | 78 | { |
79 | if (state == PM_SUSPEND_FREEZE) | ||
80 | return true; | ||
53 | /* | 81 | /* |
54 | * All states need lowlevel support and need to be valid to the lowlevel | 82 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel |
83 | * support and need to be valid to the lowlevel | ||
55 | * implementation, no valid callback implies that none are valid. | 84 | * implementation, no valid callback implies that none are valid. |
56 | */ | 85 | */ |
57 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); | 86 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); |
@@ -89,11 +118,11 @@ static int suspend_test(int level) | |||
89 | * hibernation). Run suspend notifiers, allocate the "suspend" console and | 118 | * hibernation). Run suspend notifiers, allocate the "suspend" console and |
90 | * freeze processes. | 119 | * freeze processes. |
91 | */ | 120 | */ |
92 | static int suspend_prepare(void) | 121 | static int suspend_prepare(suspend_state_t state) |
93 | { | 122 | { |
94 | int error; | 123 | int error; |
95 | 124 | ||
96 | if (!suspend_ops || !suspend_ops->enter) | 125 | if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) |
97 | return -EPERM; | 126 | return -EPERM; |
98 | 127 | ||
99 | pm_prepare_console(); | 128 | pm_prepare_console(); |
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
137 | { | 166 | { |
138 | int error; | 167 | int error; |
139 | 168 | ||
140 | if (suspend_ops->prepare) { | 169 | if (need_suspend_ops(state) && suspend_ops->prepare) { |
141 | error = suspend_ops->prepare(); | 170 | error = suspend_ops->prepare(); |
142 | if (error) | 171 | if (error) |
143 | goto Platform_finish; | 172 | goto Platform_finish; |
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
149 | goto Platform_finish; | 178 | goto Platform_finish; |
150 | } | 179 | } |
151 | 180 | ||
152 | if (suspend_ops->prepare_late) { | 181 | if (need_suspend_ops(state) && suspend_ops->prepare_late) { |
153 | error = suspend_ops->prepare_late(); | 182 | error = suspend_ops->prepare_late(); |
154 | if (error) | 183 | if (error) |
155 | goto Platform_wake; | 184 | goto Platform_wake; |
156 | } | 185 | } |
157 | 186 | ||
187 | /* | ||
188 | * PM_SUSPEND_FREEZE equals | ||
189 | * frozen processes + suspended devices + idle processors. | ||
190 | * Thus we should invoke freeze_enter() soon after | ||
191 | * all the devices are suspended. | ||
192 | */ | ||
193 | if (state == PM_SUSPEND_FREEZE) { | ||
194 | freeze_enter(); | ||
195 | goto Platform_wake; | ||
196 | } | ||
197 | |||
158 | if (suspend_test(TEST_PLATFORM)) | 198 | if (suspend_test(TEST_PLATFORM)) |
159 | goto Platform_wake; | 199 | goto Platform_wake; |
160 | 200 | ||
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
182 | enable_nonboot_cpus(); | 222 | enable_nonboot_cpus(); |
183 | 223 | ||
184 | Platform_wake: | 224 | Platform_wake: |
185 | if (suspend_ops->wake) | 225 | if (need_suspend_ops(state) && suspend_ops->wake) |
186 | suspend_ops->wake(); | 226 | suspend_ops->wake(); |
187 | 227 | ||
188 | dpm_resume_start(PMSG_RESUME); | 228 | dpm_resume_start(PMSG_RESUME); |
189 | 229 | ||
190 | Platform_finish: | 230 | Platform_finish: |
191 | if (suspend_ops->finish) | 231 | if (need_suspend_ops(state) && suspend_ops->finish) |
192 | suspend_ops->finish(); | 232 | suspend_ops->finish(); |
193 | 233 | ||
194 | return error; | 234 | return error; |
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
203 | int error; | 243 | int error; |
204 | bool wakeup = false; | 244 | bool wakeup = false; |
205 | 245 | ||
206 | if (!suspend_ops) | 246 | if (need_suspend_ops(state) && !suspend_ops) |
207 | return -ENOSYS; | 247 | return -ENOSYS; |
208 | 248 | ||
209 | trace_machine_suspend(state); | 249 | trace_machine_suspend(state); |
210 | if (suspend_ops->begin) { | 250 | if (need_suspend_ops(state) && suspend_ops->begin) { |
211 | error = suspend_ops->begin(state); | 251 | error = suspend_ops->begin(state); |
212 | if (error) | 252 | if (error) |
213 | goto Close; | 253 | goto Close; |
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
226 | 266 | ||
227 | do { | 267 | do { |
228 | error = suspend_enter(state, &wakeup); | 268 | error = suspend_enter(state, &wakeup); |
229 | } while (!error && !wakeup | 269 | } while (!error && !wakeup && need_suspend_ops(state) |
230 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); | 270 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); |
231 | 271 | ||
232 | Resume_devices: | 272 | Resume_devices: |
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
236 | ftrace_start(); | 276 | ftrace_start(); |
237 | resume_console(); | 277 | resume_console(); |
238 | Close: | 278 | Close: |
239 | if (suspend_ops->end) | 279 | if (need_suspend_ops(state) && suspend_ops->end) |
240 | suspend_ops->end(); | 280 | suspend_ops->end(); |
241 | trace_machine_suspend(PWR_EVENT_EXIT); | 281 | trace_machine_suspend(PWR_EVENT_EXIT); |
242 | return error; | 282 | return error; |
243 | 283 | ||
244 | Recover_platform: | 284 | Recover_platform: |
245 | if (suspend_ops->recover) | 285 | if (need_suspend_ops(state) && suspend_ops->recover) |
246 | suspend_ops->recover(); | 286 | suspend_ops->recover(); |
247 | goto Resume_devices; | 287 | goto Resume_devices; |
248 | } | 288 | } |
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state) | |||
278 | if (!mutex_trylock(&pm_mutex)) | 318 | if (!mutex_trylock(&pm_mutex)) |
279 | return -EBUSY; | 319 | return -EBUSY; |
280 | 320 | ||
321 | if (state == PM_SUSPEND_FREEZE) | ||
322 | freeze_begin(); | ||
323 | |||
281 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 324 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
282 | sys_sync(); | 325 | sys_sync(); |
283 | printk("done.\n"); | 326 | printk("done.\n"); |
284 | 327 | ||
285 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 328 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
286 | error = suspend_prepare(); | 329 | error = suspend_prepare(state); |
287 | if (error) | 330 | if (error) |
288 | goto Unlock; | 331 | goto Unlock; |
289 | 332 | ||
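In the new PM_SUSPEND_FREEZE ("freeze") state, suspend_enter() parks in freeze_enter() until something calls freeze_wake(). A hedged sketch of the intended caller, a wakeup-capable device's interrupt handler; the device wiring is hypothetical.

#include <linux/interrupt.h>
#include <linux/suspend.h>

static irqreturn_t example_wakeup_irq(int irq, void *dev_id)
{
	/* Lets freeze_enter()'s wait_event() return, so devices are resumed. */
	freeze_wake();
	return IRQ_HANDLED;
}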
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac7..9b2a1d58558d 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
112 | rtc_set_alarm(rtc, &alm); | 112 | rtc_set_alarm(rtc, &alm); |
113 | } | 113 | } |
114 | 114 | ||
115 | static int __init has_wakealarm(struct device *dev, void *name_ptr) | 115 | static int __init has_wakealarm(struct device *dev, const void *data) |
116 | { | 116 | { |
117 | struct rtc_device *candidate = to_rtc_device(dev); | 117 | struct rtc_device *candidate = to_rtc_device(dev); |
118 | 118 | ||
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) | |||
121 | if (!device_may_wakeup(candidate->dev.parent)) | 121 | if (!device_may_wakeup(candidate->dev.parent)) |
122 | return 0; | 122 | return 0; |
123 | 123 | ||
124 | *(const char **)name_ptr = dev_name(dev); | ||
125 | return 1; | 124 | return 1; |
126 | } | 125 | } |
127 | 126 | ||
@@ -159,8 +158,8 @@ static int __init test_suspend(void) | |||
159 | static char warn_no_rtc[] __initdata = | 158 | static char warn_no_rtc[] __initdata = |
160 | KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; | 159 | KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; |
161 | 160 | ||
162 | char *pony = NULL; | ||
163 | struct rtc_device *rtc = NULL; | 161 | struct rtc_device *rtc = NULL; |
162 | struct device *dev; | ||
164 | 163 | ||
165 | /* PM is initialized by now; is that state testable? */ | 164 | /* PM is initialized by now; is that state testable? */ |
166 | if (test_state == PM_SUSPEND_ON) | 165 | if (test_state == PM_SUSPEND_ON) |
@@ -171,9 +170,9 @@ static int __init test_suspend(void) | |||
171 | } | 170 | } |
172 | 171 | ||
173 | /* RTCs have initialized by now too ... can we use one? */ | 172 | /* RTCs have initialized by now too ... can we use one? */ |
174 | class_find_device(rtc_class, NULL, &pony, has_wakealarm); | 173 | dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); |
175 | if (pony) | 174 | if (dev) |
176 | rtc = rtc_class_open(pony); | 175 | rtc = rtc_class_open(dev_name(dev)); |
177 | if (!rtc) { | 176 | if (!rtc) { |
178 | printk(warn_no_rtc); | 177 | printk(warn_no_rtc); |
179 | goto done; | 178 | goto done; |
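class_find_device() now returns the matched struct device * directly and its match callback takes const void *data, which is why the "pony" out-parameter disappears above. A hedged sketch of the new calling convention; the class pointer and the match-by-name criterion are hypothetical.

#include <linux/device.h>
#include <linux/string.h>

static int match_by_name(struct device *dev, const void *data)
{
	return sysfs_streq(dev_name(dev), data);
}

/* Returns a device with an elevated reference count, or NULL; drop it with
 * put_device() when done. */
static struct device *example_find(struct class *cls, const char *name)
{
	return class_find_device(cls, NULL, name, match_by_name);
}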
diff --git a/kernel/printk.c b/kernel/printk.c index f24633afa46a..0b31715f335a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -88,6 +88,12 @@ static DEFINE_SEMAPHORE(console_sem); | |||
88 | struct console *console_drivers; | 88 | struct console *console_drivers; |
89 | EXPORT_SYMBOL_GPL(console_drivers); | 89 | EXPORT_SYMBOL_GPL(console_drivers); |
90 | 90 | ||
91 | #ifdef CONFIG_LOCKDEP | ||
92 | static struct lockdep_map console_lock_dep_map = { | ||
93 | .name = "console_lock" | ||
94 | }; | ||
95 | #endif | ||
96 | |||
91 | /* | 97 | /* |
92 | * This is used for debugging the mess that is the VT code by | 98 | * This is used for debugging the mess that is the VT code by |
93 | * keeping track if we have the console semaphore held. It's | 99 | * keeping track if we have the console semaphore held. It's |
@@ -1919,6 +1925,7 @@ void console_lock(void) | |||
1919 | return; | 1925 | return; |
1920 | console_locked = 1; | 1926 | console_locked = 1; |
1921 | console_may_schedule = 1; | 1927 | console_may_schedule = 1; |
1928 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1922 | } | 1929 | } |
1923 | EXPORT_SYMBOL(console_lock); | 1930 | EXPORT_SYMBOL(console_lock); |
1924 | 1931 | ||
@@ -1940,6 +1947,7 @@ int console_trylock(void) | |||
1940 | } | 1947 | } |
1941 | console_locked = 1; | 1948 | console_locked = 1; |
1942 | console_may_schedule = 0; | 1949 | console_may_schedule = 0; |
1950 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
1943 | return 1; | 1951 | return 1; |
1944 | } | 1952 | } |
1945 | EXPORT_SYMBOL(console_trylock); | 1953 | EXPORT_SYMBOL(console_trylock); |
@@ -2102,6 +2110,7 @@ skip: | |||
2102 | local_irq_restore(flags); | 2110 | local_irq_restore(flags); |
2103 | } | 2111 | } |
2104 | console_locked = 0; | 2112 | console_locked = 0; |
2113 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
2105 | 2114 | ||
2106 | /* Release the exclusive_console once it is used */ | 2115 | /* Release the exclusive_console once it is used */ |
2107 | if (unlikely(exclusive_console)) | 2116 | if (unlikely(exclusive_console)) |
diff --git a/kernel/relay.c b/kernel/relay.c index e8cd2027abbd..01ab081ac53a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
1139 | if (!desc->count) | 1139 | if (!desc->count) |
1140 | return 0; | 1140 | return 0; |
1141 | 1141 | ||
1142 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); | 1142 | mutex_lock(&file_inode(filp)->i_mutex); |
1143 | do { | 1143 | do { |
1144 | if (!relay_file_read_avail(buf, *ppos)) | 1144 | if (!relay_file_read_avail(buf, *ppos)) |
1145 | break; | 1145 | break; |
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | |||
1159 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 1159 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
1160 | } | 1160 | } |
1161 | } while (desc->count && ret); | 1161 | } while (desc->count && ret); |
1162 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); | 1162 | mutex_unlock(&file_inode(filp)->i_mutex); |
1163 | 1163 | ||
1164 | return desc->written; | 1164 | return desc->written; |
1165 | } | 1165 | } |
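Several hunks in this series (the namespace-fd one at the top and relay_file_read_subbufs() here) replace open-coded f_path.dentry->d_inode chains with the new file_inode() helper. A trivial hedged sketch:

#include <linux/fs.h>

static loff_t example_file_size(struct file *filp)
{
	/* Was: filp->f_path.dentry->d_inode */
	struct inode *inode = file_inode(filp);

	return i_size_read(inode);
}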
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) | |||
35 | ag->tg->rt_se = NULL; | 35 | ag->tg->rt_se = NULL; |
36 | ag->tg->rt_rq = NULL; | 36 | ag->tg->rt_rq = NULL; |
37 | #endif | 37 | #endif |
38 | sched_offline_group(ag->tg); | ||
38 | sched_destroy_group(ag->tg); | 39 | sched_destroy_group(ag->tg); |
39 | } | 40 | } |
40 | 41 | ||
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) | |||
76 | if (IS_ERR(tg)) | 77 | if (IS_ERR(tg)) |
77 | goto out_free; | 78 | goto out_free; |
78 | 79 | ||
80 | sched_online_group(tg, &root_task_group); | ||
81 | |||
79 | kref_init(&ag->kref); | 82 | kref_init(&ag->kref); |
80 | init_rwsem(&ag->lock); | 83 | init_rwsem(&ag->lock); |
81 | ag->id = atomic_inc_return(&autogroup_seq_nr); | 84 | ag->id = atomic_inc_return(&autogroup_seq_nr); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 03d7784b7bd2..7f12624a393c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1132 | */ | 1132 | */ |
1133 | static int select_fallback_rq(int cpu, struct task_struct *p) | 1133 | static int select_fallback_rq(int cpu, struct task_struct *p) |
1134 | { | 1134 | { |
1135 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | 1135 | int nid = cpu_to_node(cpu); |
1136 | const struct cpumask *nodemask = NULL; | ||
1136 | enum { cpuset, possible, fail } state = cpuset; | 1137 | enum { cpuset, possible, fail } state = cpuset; |
1137 | int dest_cpu; | 1138 | int dest_cpu; |
1138 | 1139 | ||
1139 | /* Look for allowed, online CPU in same node. */ | 1140 | /* |
1140 | for_each_cpu(dest_cpu, nodemask) { | 1141 | * If the node that the cpu is on has been offlined, cpu_to_node() |
1141 | if (!cpu_online(dest_cpu)) | 1142 | * will return -1. There is no cpu on the node, and we should |
1142 | continue; | 1143 | * select the cpu on the other node. |
1143 | if (!cpu_active(dest_cpu)) | 1144 | */ |
1144 | continue; | 1145 | if (nid != -1) { |
1145 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 1146 | nodemask = cpumask_of_node(nid); |
1146 | return dest_cpu; | 1147 | |
1148 | /* Look for allowed, online CPU in same node. */ | ||
1149 | for_each_cpu(dest_cpu, nodemask) { | ||
1150 | if (!cpu_online(dest_cpu)) | ||
1151 | continue; | ||
1152 | if (!cpu_active(dest_cpu)) | ||
1153 | continue; | ||
1154 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | ||
1155 | return dest_cpu; | ||
1156 | } | ||
1147 | } | 1157 | } |
1148 | 1158 | ||
1149 | for (;;) { | 1159 | for (;;) { |
@@ -1742,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister); | |||
1742 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 1752 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
1743 | { | 1753 | { |
1744 | struct preempt_notifier *notifier; | 1754 | struct preempt_notifier *notifier; |
1745 | struct hlist_node *node; | ||
1746 | 1755 | ||
1747 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 1756 | hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) |
1748 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); | 1757 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); |
1749 | } | 1758 | } |
1750 | 1759 | ||
@@ -1753,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, | |||
1753 | struct task_struct *next) | 1762 | struct task_struct *next) |
1754 | { | 1763 | { |
1755 | struct preempt_notifier *notifier; | 1764 | struct preempt_notifier *notifier; |
1756 | struct hlist_node *node; | ||
1757 | 1765 | ||
1758 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 1766 | hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) |
1759 | notifier->ops->sched_out(notifier, next); | 1767 | notifier->ops->sched_out(notifier, next); |
1760 | } | 1768 | } |
1761 | 1769 | ||
@@ -1969,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
1969 | } | 1977 | } |
1970 | 1978 | ||
1971 | /* | 1979 | /* |
1972 | * nr_running, nr_uninterruptible and nr_context_switches: | 1980 | * nr_running and nr_context_switches: |
1973 | * | 1981 | * |
1974 | * externally visible scheduler statistics: current number of runnable | 1982 | * externally visible scheduler statistics: current number of runnable |
1975 | * threads, current number of uninterruptible-sleeping threads, total | 1983 | * threads, total number of context switches performed since bootup. |
1976 | * number of context switches performed since bootup. | ||
1977 | */ | 1984 | */ |
1978 | unsigned long nr_running(void) | 1985 | unsigned long nr_running(void) |
1979 | { | 1986 | { |
@@ -1985,23 +1992,6 @@ unsigned long nr_running(void) | |||
1985 | return sum; | 1992 | return sum; |
1986 | } | 1993 | } |
1987 | 1994 | ||
1988 | unsigned long nr_uninterruptible(void) | ||
1989 | { | ||
1990 | unsigned long i, sum = 0; | ||
1991 | |||
1992 | for_each_possible_cpu(i) | ||
1993 | sum += cpu_rq(i)->nr_uninterruptible; | ||
1994 | |||
1995 | /* | ||
1996 | * Since we read the counters lockless, it might be slightly | ||
1997 | * inaccurate. Do not allow it to go below zero though: | ||
1998 | */ | ||
1999 | if (unlikely((long)sum < 0)) | ||
2000 | sum = 0; | ||
2001 | |||
2002 | return sum; | ||
2003 | } | ||
2004 | |||
2005 | unsigned long long nr_context_switches(void) | 1995 | unsigned long long nr_context_switches(void) |
2006 | { | 1996 | { |
2007 | int i; | 1997 | int i; |
@@ -2786,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
2786 | if (irqs_disabled()) | 2776 | if (irqs_disabled()) |
2787 | print_irqtrace_events(prev); | 2777 | print_irqtrace_events(prev); |
2788 | dump_stack(); | 2778 | dump_stack(); |
2789 | add_taint(TAINT_WARN); | 2779 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
2790 | } | 2780 | } |
2791 | 2781 | ||
2792 | /* | 2782 | /* |
@@ -3268,7 +3258,8 @@ void complete_all(struct completion *x) | |||
3268 | EXPORT_SYMBOL(complete_all); | 3258 | EXPORT_SYMBOL(complete_all); |
3269 | 3259 | ||
3270 | static inline long __sched | 3260 | static inline long __sched |
3271 | do_wait_for_common(struct completion *x, long timeout, int state) | 3261 | do_wait_for_common(struct completion *x, |
3262 | long (*action)(long), long timeout, int state) | ||
3272 | { | 3263 | { |
3273 | if (!x->done) { | 3264 | if (!x->done) { |
3274 | DECLARE_WAITQUEUE(wait, current); | 3265 | DECLARE_WAITQUEUE(wait, current); |
@@ -3281,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
3281 | } | 3272 | } |
3282 | __set_current_state(state); | 3273 | __set_current_state(state); |
3283 | spin_unlock_irq(&x->wait.lock); | 3274 | spin_unlock_irq(&x->wait.lock); |
3284 | timeout = schedule_timeout(timeout); | 3275 | timeout = action(timeout); |
3285 | spin_lock_irq(&x->wait.lock); | 3276 | spin_lock_irq(&x->wait.lock); |
3286 | } while (!x->done && timeout); | 3277 | } while (!x->done && timeout); |
3287 | __remove_wait_queue(&x->wait, &wait); | 3278 | __remove_wait_queue(&x->wait, &wait); |
@@ -3292,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
3292 | return timeout ?: 1; | 3283 | return timeout ?: 1; |
3293 | } | 3284 | } |
3294 | 3285 | ||
3295 | static long __sched | 3286 | static inline long __sched |
3296 | wait_for_common(struct completion *x, long timeout, int state) | 3287 | __wait_for_common(struct completion *x, |
3288 | long (*action)(long), long timeout, int state) | ||
3297 | { | 3289 | { |
3298 | might_sleep(); | 3290 | might_sleep(); |
3299 | 3291 | ||
3300 | spin_lock_irq(&x->wait.lock); | 3292 | spin_lock_irq(&x->wait.lock); |
3301 | timeout = do_wait_for_common(x, timeout, state); | 3293 | timeout = do_wait_for_common(x, action, timeout, state); |
3302 | spin_unlock_irq(&x->wait.lock); | 3294 | spin_unlock_irq(&x->wait.lock); |
3303 | return timeout; | 3295 | return timeout; |
3304 | } | 3296 | } |
3305 | 3297 | ||
3298 | static long __sched | ||
3299 | wait_for_common(struct completion *x, long timeout, int state) | ||
3300 | { | ||
3301 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
3302 | } | ||
3303 | |||
3304 | static long __sched | ||
3305 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
3306 | { | ||
3307 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
3308 | } | ||
3309 | |||
3306 | /** | 3310 | /** |
3307 | * wait_for_completion: - waits for completion of a task | 3311 | * wait_for_completion: - waits for completion of a task |
3308 | * @x: holds the state of this particular completion | 3312 | * @x: holds the state of this particular completion |
@@ -3339,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
3339 | EXPORT_SYMBOL(wait_for_completion_timeout); | 3343 | EXPORT_SYMBOL(wait_for_completion_timeout); |
3340 | 3344 | ||
3341 | /** | 3345 | /** |
3346 | * wait_for_completion_io: - waits for completion of a task | ||
3347 | * @x: holds the state of this particular completion | ||
3348 | * | ||
3349 | * This waits to be signaled for completion of a specific task. It is NOT | ||
3350 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
3351 | * for IO. | ||
3352 | */ | ||
3353 | void __sched wait_for_completion_io(struct completion *x) | ||
3354 | { | ||
3355 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
3356 | } | ||
3357 | EXPORT_SYMBOL(wait_for_completion_io); | ||
3358 | |||
3359 | /** | ||
3360 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
3361 | * @x: holds the state of this particular completion | ||
3362 | * @timeout: timeout value in jiffies | ||
3363 | * | ||
3364 | * This waits for either a completion of a specific task to be signaled or for a | ||
3365 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
3366 | * interruptible. The caller is accounted as waiting for IO. | ||
3367 | * | ||
3368 | * The return value is 0 if timed out, and positive (at least 1, or number of | ||
3369 | * jiffies left till timeout) if completed. | ||
3370 | */ | ||
3371 | unsigned long __sched | ||
3372 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
3373 | { | ||
3374 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
3375 | } | ||
3376 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
3377 | |||
3378 | /** | ||
3342 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | 3379 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) |
3343 | * @x: holds the state of this particular completion | 3380 | * @x: holds the state of this particular completion |
3344 | * | 3381 | * |
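The new wait_for_completion_io() and wait_for_completion_io_timeout() variants behave like their plain counterparts but sleep via io_schedule_timeout(), so the waiter is charged as iowait. A hedged sketch of a block-style driver waiting on a transfer; struct example_request is hypothetical.

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

struct example_request {
	struct completion done;
};

static int example_wait_for_transfer(struct example_request *rq)
{
	unsigned long left;

	/* Same semantics as wait_for_completion_timeout(), accounted as iowait. */
	left = wait_for_completion_io_timeout(&rq->done, msecs_to_jiffies(5000));
	return left ? 0 : -ETIMEDOUT;
}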
@@ -4364,7 +4401,10 @@ EXPORT_SYMBOL(yield); | |||
4364 | * It's the caller's job to ensure that the target task struct | 4401 | * It's the caller's job to ensure that the target task struct |
4365 | * can't go away on us before we can do any checks. | 4402 | * can't go away on us before we can do any checks. |
4366 | * | 4403 | * |
4367 | * Returns true if we indeed boosted the target task. | 4404 | * Returns: |
4405 | * true (>0) if we indeed boosted the target task. | ||
4406 | * false (0) if we failed to boost the target. | ||
4407 | * -ESRCH if there's no task to yield to. | ||
4368 | */ | 4408 | */ |
4369 | bool __sched yield_to(struct task_struct *p, bool preempt) | 4409 | bool __sched yield_to(struct task_struct *p, bool preempt) |
4370 | { | 4410 | { |
@@ -4378,6 +4418,15 @@ bool __sched yield_to(struct task_struct *p, bool preempt) | |||
4378 | 4418 | ||
4379 | again: | 4419 | again: |
4380 | p_rq = task_rq(p); | 4420 | p_rq = task_rq(p); |
4421 | /* | ||
4422 | * If we're the only runnable task on the rq and target rq also | ||
4423 | * has only one task, there's absolutely no point in yielding. | ||
4424 | */ | ||
4425 | if (rq->nr_running == 1 && p_rq->nr_running == 1) { | ||
4426 | yielded = -ESRCH; | ||
4427 | goto out_irq; | ||
4428 | } | ||
4429 | |||
4381 | double_rq_lock(rq, p_rq); | 4430 | double_rq_lock(rq, p_rq); |
4382 | while (task_rq(p) != p_rq) { | 4431 | while (task_rq(p) != p_rq) { |
4383 | double_rq_unlock(rq, p_rq); | 4432 | double_rq_unlock(rq, p_rq); |
@@ -4385,13 +4434,13 @@ again: | |||
4385 | } | 4434 | } |
4386 | 4435 | ||
4387 | if (!curr->sched_class->yield_to_task) | 4436 | if (!curr->sched_class->yield_to_task) |
4388 | goto out; | 4437 | goto out_unlock; |
4389 | 4438 | ||
4390 | if (curr->sched_class != p->sched_class) | 4439 | if (curr->sched_class != p->sched_class) |
4391 | goto out; | 4440 | goto out_unlock; |
4392 | 4441 | ||
4393 | if (task_running(p_rq, p) || p->state) | 4442 | if (task_running(p_rq, p) || p->state) |
4394 | goto out; | 4443 | goto out_unlock; |
4395 | 4444 | ||
4396 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | 4445 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
4397 | if (yielded) { | 4446 | if (yielded) { |
@@ -4404,11 +4453,12 @@ again: | |||
4404 | resched_task(p_rq->curr); | 4453 | resched_task(p_rq->curr); |
4405 | } | 4454 | } |
4406 | 4455 | ||
4407 | out: | 4456 | out_unlock: |
4408 | double_rq_unlock(rq, p_rq); | 4457 | double_rq_unlock(rq, p_rq); |
4458 | out_irq: | ||
4409 | local_irq_restore(flags); | 4459 | local_irq_restore(flags); |
4410 | 4460 | ||
4411 | if (yielded) | 4461 | if (yielded > 0) |
4412 | schedule(); | 4462 | schedule(); |
4413 | 4463 | ||
4414 | return yielded; | 4464 | return yielded; |
@@ -7161,7 +7211,6 @@ static void free_sched_group(struct task_group *tg) | |||
7161 | struct task_group *sched_create_group(struct task_group *parent) | 7211 | struct task_group *sched_create_group(struct task_group *parent) |
7162 | { | 7212 | { |
7163 | struct task_group *tg; | 7213 | struct task_group *tg; |
7164 | unsigned long flags; | ||
7165 | 7214 | ||
7166 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 7215 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
7167 | if (!tg) | 7216 | if (!tg) |
@@ -7173,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
7173 | if (!alloc_rt_sched_group(tg, parent)) | 7222 | if (!alloc_rt_sched_group(tg, parent)) |
7174 | goto err; | 7223 | goto err; |
7175 | 7224 | ||
7225 | return tg; | ||
7226 | |||
7227 | err: | ||
7228 | free_sched_group(tg); | ||
7229 | return ERR_PTR(-ENOMEM); | ||
7230 | } | ||
7231 | |||
7232 | void sched_online_group(struct task_group *tg, struct task_group *parent) | ||
7233 | { | ||
7234 | unsigned long flags; | ||
7235 | |||
7176 | spin_lock_irqsave(&task_group_lock, flags); | 7236 | spin_lock_irqsave(&task_group_lock, flags); |
7177 | list_add_rcu(&tg->list, &task_groups); | 7237 | list_add_rcu(&tg->list, &task_groups); |
7178 | 7238 | ||
@@ -7182,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
7182 | INIT_LIST_HEAD(&tg->children); | 7242 | INIT_LIST_HEAD(&tg->children); |
7183 | list_add_rcu(&tg->siblings, &parent->children); | 7243 | list_add_rcu(&tg->siblings, &parent->children); |
7184 | spin_unlock_irqrestore(&task_group_lock, flags); | 7244 | spin_unlock_irqrestore(&task_group_lock, flags); |
7185 | |||
7186 | return tg; | ||
7187 | |||
7188 | err: | ||
7189 | free_sched_group(tg); | ||
7190 | return ERR_PTR(-ENOMEM); | ||
7191 | } | 7245 | } |
7192 | 7246 | ||
7193 | /* rcu callback to free various structures associated with a task group */ | 7247 | /* rcu callback to free various structures associated with a task group */ |
@@ -7200,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) | |||
7200 | /* Destroy runqueue etc associated with a task group */ | 7254 | /* Destroy runqueue etc associated with a task group */ |
7201 | void sched_destroy_group(struct task_group *tg) | 7255 | void sched_destroy_group(struct task_group *tg) |
7202 | { | 7256 | { |
7257 | /* wait for possible concurrent references to cfs_rqs complete */ | ||

7258 | call_rcu(&tg->rcu, free_sched_group_rcu); | ||
7259 | } | ||
7260 | |||
7261 | void sched_offline_group(struct task_group *tg) | ||
7262 | { | ||
7203 | unsigned long flags; | 7263 | unsigned long flags; |
7204 | int i; | 7264 | int i; |
7205 | 7265 | ||
@@ -7211,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg) | |||
7211 | list_del_rcu(&tg->list); | 7271 | list_del_rcu(&tg->list); |
7212 | list_del_rcu(&tg->siblings); | 7272 | list_del_rcu(&tg->siblings); |
7213 | spin_unlock_irqrestore(&task_group_lock, flags); | 7273 | spin_unlock_irqrestore(&task_group_lock, flags); |
7214 | |||
7215 | /* wait for possible concurrent references to cfs_rqs complete */ | ||
7216 | call_rcu(&tg->rcu, free_sched_group_rcu); | ||
7217 | } | 7274 | } |
7218 | 7275 | ||
7219 | /* change task's runqueue when it moves between groups. | 7276 | /* change task's runqueue when it moves between groups. |
@@ -7584,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | |||
7584 | return &tg->css; | 7641 | return &tg->css; |
7585 | } | 7642 | } |
7586 | 7643 | ||
7644 | static int cpu_cgroup_css_online(struct cgroup *cgrp) | ||
7645 | { | ||
7646 | struct task_group *tg = cgroup_tg(cgrp); | ||
7647 | struct task_group *parent; | ||
7648 | |||
7649 | if (!cgrp->parent) | ||
7650 | return 0; | ||
7651 | |||
7652 | parent = cgroup_tg(cgrp->parent); | ||
7653 | sched_online_group(tg, parent); | ||
7654 | return 0; | ||
7655 | } | ||
7656 | |||
7587 | static void cpu_cgroup_css_free(struct cgroup *cgrp) | 7657 | static void cpu_cgroup_css_free(struct cgroup *cgrp) |
7588 | { | 7658 | { |
7589 | struct task_group *tg = cgroup_tg(cgrp); | 7659 | struct task_group *tg = cgroup_tg(cgrp); |
@@ -7591,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) | |||
7591 | sched_destroy_group(tg); | 7661 | sched_destroy_group(tg); |
7592 | } | 7662 | } |
7593 | 7663 | ||
7664 | static void cpu_cgroup_css_offline(struct cgroup *cgrp) | ||
7665 | { | ||
7666 | struct task_group *tg = cgroup_tg(cgrp); | ||
7667 | |||
7668 | sched_offline_group(tg); | ||
7669 | } | ||
7670 | |||
7594 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, | 7671 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, |
7595 | struct cgroup_taskset *tset) | 7672 | struct cgroup_taskset *tset) |
7596 | { | 7673 | { |
@@ -7946,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7946 | .name = "cpu", | 8023 | .name = "cpu", |
7947 | .css_alloc = cpu_cgroup_css_alloc, | 8024 | .css_alloc = cpu_cgroup_css_alloc, |
7948 | .css_free = cpu_cgroup_css_free, | 8025 | .css_free = cpu_cgroup_css_free, |
8026 | .css_online = cpu_cgroup_css_online, | ||
8027 | .css_offline = cpu_cgroup_css_offline, | ||
7949 | .can_attach = cpu_cgroup_can_attach, | 8028 | .can_attach = cpu_cgroup_can_attach, |
7950 | .attach = cpu_cgroup_attach, | 8029 | .attach = cpu_cgroup_attach, |
7951 | .exit = cpu_cgroup_exit, | 8030 | .exit = cpu_cgroup_exit, |
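sched_create_group()/sched_destroy_group() are split so that allocation and RCU-deferred freeing are separate from making the group visible on the task_groups list, mirroring the cgroup css_alloc/css_online split used above. A hedged sketch of the pairing; this only compiles inside kernel/sched/, where the internal sched.h declares struct task_group and these helpers.

#include <linux/err.h>
#include "sched.h"	/* scheduler-internal: struct task_group and the group helpers */

static struct task_group *example_make_group(struct task_group *parent)
{
	struct task_group *tg = sched_create_group(parent);	/* allocate runqueues */

	if (IS_ERR(tg))
		return tg;
	sched_online_group(tg, parent);		/* link into the task_groups list */
	return tg;
}

static void example_remove_group(struct task_group *tg)
{
	sched_offline_group(tg);	/* unlink; no new references can be taken */
	sched_destroy_group(tg);	/* free after an RCU grace period */
}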
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9857329ed280..ed12cbb135f4 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk) | |||
604 | { | 604 | { |
605 | unsigned long long clock; | 605 | unsigned long long clock; |
606 | 606 | ||
607 | clock = sched_clock(); | 607 | clock = local_clock(); |
608 | if (clock < tsk->vtime_snap) | 608 | if (clock < tsk->vtime_snap) |
609 | return 0; | 609 | return 0; |
610 | 610 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c5420e..75024a673520 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg) | |||
110 | if (autogroup_path(tg, group_path, PATH_MAX)) | 110 | if (autogroup_path(tg, group_path, PATH_MAX)) |
111 | return group_path; | 111 | return group_path; |
112 | 112 | ||
113 | /* | ||
114 | * May be NULL if the underlying cgroup isn't fully-created yet | ||
115 | */ | ||
116 | if (!tg->css.cgroup) { | ||
117 | group_path[0] = '\0'; | ||
118 | return group_path; | ||
119 | } | ||
120 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 113 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
121 | return group_path; | 114 | return group_path; |
122 | } | 115 | } |
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
269 | { | 262 | { |
270 | unsigned int freq = cpu_khz ? : 1; | 263 | unsigned int freq = cpu_khz ? : 1; |
271 | 264 | ||
272 | SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", | 265 | SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", |
273 | cpu, freq / 1000, (freq % 1000)); | 266 | cpu, freq / 1000, (freq % 1000)); |
274 | } | 267 | } |
275 | #else | 268 | #else |
276 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 269 | SEQ_printf(m, "cpu#%d\n", cpu); |
277 | #endif | 270 | #endif |
278 | 271 | ||
279 | #define P(x) \ | 272 | #define P(x) \ |
@@ -330,6 +323,7 @@ do { \ | |||
330 | print_rq(m, rq, cpu); | 323 | print_rq(m, rq, cpu); |
331 | rcu_read_unlock(); | 324 | rcu_read_unlock(); |
332 | spin_unlock_irqrestore(&sched_debug_lock, flags); | 325 | spin_unlock_irqrestore(&sched_debug_lock, flags); |
326 | SEQ_printf(m, "\n"); | ||
333 | } | 327 | } |
334 | 328 | ||
335 | static const char *sched_tunable_scaling_names[] = { | 329 | static const char *sched_tunable_scaling_names[] = { |
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = { | |||
338 | "linear" | 332 | "linear" |
339 | }; | 333 | }; |
340 | 334 | ||
341 | static int sched_debug_show(struct seq_file *m, void *v) | 335 | static void sched_debug_header(struct seq_file *m) |
342 | { | 336 | { |
343 | u64 ktime, sched_clk, cpu_clk; | 337 | u64 ktime, sched_clk, cpu_clk; |
344 | unsigned long flags; | 338 | unsigned long flags; |
345 | int cpu; | ||
346 | 339 | ||
347 | local_irq_save(flags); | 340 | local_irq_save(flags); |
348 | ktime = ktime_to_ns(ktime_get()); | 341 | ktime = ktime_to_ns(ktime_get()); |
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
384 | #undef PN | 377 | #undef PN |
385 | #undef P | 378 | #undef P |
386 | 379 | ||
387 | SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", | 380 | SEQ_printf(m, " .%-40s: %d (%s)\n", |
381 | "sysctl_sched_tunable_scaling", | ||
388 | sysctl_sched_tunable_scaling, | 382 | sysctl_sched_tunable_scaling, |
389 | sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); | 383 | sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); |
384 | SEQ_printf(m, "\n"); | ||
385 | } | ||
390 | 386 | ||
391 | for_each_online_cpu(cpu) | 387 | static int sched_debug_show(struct seq_file *m, void *v) |
392 | print_cpu(m, cpu); | 388 | { |
389 | int cpu = (unsigned long)(v - 2); | ||
393 | 390 | ||
394 | SEQ_printf(m, "\n"); | 391 | if (cpu != -1) |
392 | print_cpu(m, cpu); | ||
393 | else | ||
394 | sched_debug_header(m); | ||
395 | 395 | ||
396 | return 0; | 396 | return 0; |
397 | } | 397 | } |
398 | 398 | ||
399 | void sysrq_sched_debug_show(void) | 399 | void sysrq_sched_debug_show(void) |
400 | { | 400 | { |
401 | sched_debug_show(NULL, NULL); | 401 | int cpu; |
402 | |||
403 | sched_debug_header(NULL); | ||
404 | for_each_online_cpu(cpu) | ||
405 | print_cpu(NULL, cpu); | ||
406 | |||
407 | } | ||
408 | |||
409 | /* | ||
410 | * This iterator needs some explanation. | ||
411 | * It returns 1 for the header position. | ||
412 | * This means 2 is cpu 0. | ||
413 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | ||
414 | * to use cpumask_* to iterate over the cpus. | ||
415 | */ | ||
416 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) | ||
417 | { | ||
418 | unsigned long n = *offset; | ||
419 | |||
420 | if (n == 0) | ||
421 | return (void *) 1; | ||
422 | |||
423 | n--; | ||
424 | |||
425 | if (n > 0) | ||
426 | n = cpumask_next(n - 1, cpu_online_mask); | ||
427 | else | ||
428 | n = cpumask_first(cpu_online_mask); | ||
429 | |||
430 | *offset = n + 1; | ||
431 | |||
432 | if (n < nr_cpu_ids) | ||
433 | return (void *)(unsigned long)(n + 2); | ||
434 | return NULL; | ||
435 | } | ||
436 | |||
437 | static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) | ||
438 | { | ||
439 | (*offset)++; | ||
440 | return sched_debug_start(file, offset); | ||
441 | } | ||
442 | |||
443 | static void sched_debug_stop(struct seq_file *file, void *data) | ||
444 | { | ||
445 | } | ||
446 | |||
447 | static const struct seq_operations sched_debug_sops = { | ||
448 | .start = sched_debug_start, | ||
449 | .next = sched_debug_next, | ||
450 | .stop = sched_debug_stop, | ||
451 | .show = sched_debug_show, | ||
452 | }; | ||
453 | |||
454 | static int sched_debug_release(struct inode *inode, struct file *file) | ||
455 | { | ||
456 | seq_release(inode, file); | ||
457 | |||
458 | return 0; | ||
402 | } | 459 | } |
403 | 460 | ||
404 | static int sched_debug_open(struct inode *inode, struct file *filp) | 461 | static int sched_debug_open(struct inode *inode, struct file *filp) |
405 | { | 462 | { |
406 | return single_open(filp, sched_debug_show, NULL); | 463 | int ret = 0; |
464 | |||
465 | ret = seq_open(filp, &sched_debug_sops); | ||
466 | |||
467 | return ret; | ||
407 | } | 468 | } |
408 | 469 | ||
409 | static const struct file_operations sched_debug_fops = { | 470 | static const struct file_operations sched_debug_fops = { |
410 | .open = sched_debug_open, | 471 | .open = sched_debug_open, |
411 | .read = seq_read, | 472 | .read = seq_read, |
412 | .llseek = seq_lseek, | 473 | .llseek = seq_lseek, |
413 | .release = single_release, | 474 | .release = sched_debug_release, |
414 | }; | 475 | }; |
415 | 476 | ||
416 | static int __init init_sched_debug_procfs(void) | 477 | static int __init init_sched_debug_procfs(void) |
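Both this file and sched/stats.c below move from single_open(), which built one big buffer covering every cpu, to a proper seq_file iterator that emits a header record at position 1 and cpu n at position n + 2, skipping hotplug holes with cpumask_next(). A hedged, generic sketch of that pattern; all names are hypothetical.

#include <linux/cpumask.h>
#include <linux/seq_file.h>

static void *example_start(struct seq_file *m, loff_t *pos)
{
	unsigned long n = *pos;

	if (n == 0)
		return (void *)1;			/* header record */

	n = (n > 1) ? cpumask_next(n - 2, cpu_online_mask)
		    : cpumask_first(cpu_online_mask);
	*pos = n + 1;
	return (n < nr_cpu_ids) ? (void *)(n + 2) : NULL;
}

static void *example_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return example_start(m, pos);
}

static void example_stop(struct seq_file *m, void *v)
{
}

static int example_show(struct seq_file *m, void *v)
{
	if (v == (void *)1)
		seq_puts(m, "header line\n");
	else
		seq_printf(m, "cpu%lu: per-cpu stats would go here\n",
			   (unsigned long)v - 2);
	return 0;
}

static const struct seq_operations example_sops = {
	.start = example_start,
	.next  = example_next,
	.stop  = example_stop,
	.show  = example_show,
};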
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 903ffa9e8872..e036eda1a9c9 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
21 | if (mask_str == NULL) | 21 | if (mask_str == NULL) |
22 | return -ENOMEM; | 22 | return -ENOMEM; |
23 | 23 | ||
24 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 24 | if (v == (void *)1) { |
25 | seq_printf(seq, "timestamp %lu\n", jiffies); | 25 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
26 | for_each_online_cpu(cpu) { | 26 | seq_printf(seq, "timestamp %lu\n", jiffies); |
27 | struct rq *rq = cpu_rq(cpu); | 27 | } else { |
28 | struct rq *rq; | ||
28 | #ifdef CONFIG_SMP | 29 | #ifdef CONFIG_SMP |
29 | struct sched_domain *sd; | 30 | struct sched_domain *sd; |
30 | int dcount = 0; | 31 | int dcount = 0; |
31 | #endif | 32 | #endif |
33 | cpu = (unsigned long)(v - 2); | ||
34 | rq = cpu_rq(cpu); | ||
32 | 35 | ||
33 | /* runqueue-specific stats */ | 36 | /* runqueue-specific stats */ |
34 | seq_printf(seq, | 37 | seq_printf(seq, |
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
77 | return 0; | 80 | return 0; |
78 | } | 81 | } |
79 | 82 | ||
80 | static int schedstat_open(struct inode *inode, struct file *file) | 83 | /* |
84 | * This iterator needs some explanation. | ||
85 | * It returns 1 for the header position. | ||
86 | * This means 2 is cpu 0. | ||
87 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | ||
88 | * to use cpumask_* to iterate over the cpus. | ||
89 | */ | ||
90 | static void *schedstat_start(struct seq_file *file, loff_t *offset) | ||
81 | { | 91 | { |
82 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | 92 | unsigned long n = *offset; |
83 | char *buf = kmalloc(size, GFP_KERNEL); | ||
84 | struct seq_file *m; | ||
85 | int res; | ||
86 | 93 | ||
87 | if (!buf) | 94 | if (n == 0) |
88 | return -ENOMEM; | 95 | return (void *) 1; |
89 | res = single_open(file, show_schedstat, NULL); | 96 | |
90 | if (!res) { | 97 | n--; |
91 | m = file->private_data; | 98 | |
92 | m->buf = buf; | 99 | if (n > 0) |
93 | m->size = size; | 100 | n = cpumask_next(n - 1, cpu_online_mask); |
94 | } else | 101 | else |
95 | kfree(buf); | 102 | n = cpumask_first(cpu_online_mask); |
96 | return res; | 103 | |
104 | *offset = n + 1; | ||
105 | |||
106 | if (n < nr_cpu_ids) | ||
107 | return (void *)(unsigned long)(n + 2); | ||
108 | return NULL; | ||
109 | } | ||
110 | |||
111 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) | ||
112 | { | ||
113 | (*offset)++; | ||
114 | return schedstat_start(file, offset); | ||
115 | } | ||
116 | |||
117 | static void schedstat_stop(struct seq_file *file, void *data) | ||
118 | { | ||
119 | } | ||
120 | |||
121 | static const struct seq_operations schedstat_sops = { | ||
122 | .start = schedstat_start, | ||
123 | .next = schedstat_next, | ||
124 | .stop = schedstat_stop, | ||
125 | .show = show_schedstat, | ||
126 | }; | ||
127 | |||
128 | static int schedstat_open(struct inode *inode, struct file *file) | ||
129 | { | ||
130 | return seq_open(file, &schedstat_sops); | ||
97 | } | 131 | } |
98 | 132 | ||
133 | static int schedstat_release(struct inode *inode, struct file *file) | ||
134 | { | ||
135 | return 0; | ||
136 | }; | ||
137 | |||
99 | static const struct file_operations proc_schedstat_operations = { | 138 | static const struct file_operations proc_schedstat_operations = { |
100 | .open = schedstat_open, | 139 | .open = schedstat_open, |
101 | .read = seq_read, | 140 | .read = seq_read, |
102 | .llseek = seq_lseek, | 141 | .llseek = seq_lseek, |
103 | .release = single_release, | 142 | .release = schedstat_release, |
104 | }; | 143 | }; |
105 | 144 | ||
106 | static int __init proc_schedstat_init(void) | 145 | static int __init proc_schedstat_init(void) |
diff --git a/kernel/signal.c b/kernel/signal.c index 7f82adbad480..dd72567767d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default) | |||
485 | if (force_default || ka->sa.sa_handler != SIG_IGN) | 485 | if (force_default || ka->sa.sa_handler != SIG_IGN) |
486 | ka->sa.sa_handler = SIG_DFL; | 486 | ka->sa.sa_handler = SIG_DFL; |
487 | ka->sa.sa_flags = 0; | 487 | ka->sa.sa_flags = 0; |
488 | #ifdef __ARCH_HAS_SA_RESTORER | ||
489 | ka->sa.sa_restorer = NULL; | ||
490 | #endif | ||
488 | sigemptyset(&ka->sa.sa_mask); | 491 | sigemptyset(&ka->sa.sa_mask); |
489 | ka++; | 492 | ka++; |
490 | } | 493 | } |
@@ -1157,11 +1160,11 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1157 | static void print_fatal_signal(int signr) | 1160 | static void print_fatal_signal(int signr) |
1158 | { | 1161 | { |
1159 | struct pt_regs *regs = signal_pt_regs(); | 1162 | struct pt_regs *regs = signal_pt_regs(); |
1160 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | 1163 | printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", |
1161 | current->comm, task_pid_nr(current), signr); | 1164 | current->comm, task_pid_nr(current), signr); |
1162 | 1165 | ||
1163 | #if defined(__i386__) && !defined(__arch_um__) | 1166 | #if defined(__i386__) && !defined(__arch_um__) |
1164 | printk("code at %08lx: ", regs->ip); | 1167 | printk(KERN_INFO "code at %08lx: ", regs->ip); |
1165 | { | 1168 | { |
1166 | int i; | 1169 | int i; |
1167 | for (i = 0; i < 16; i++) { | 1170 | for (i = 0; i < 16; i++) { |
@@ -1169,11 +1172,11 @@ static void print_fatal_signal(int signr) | |||
1169 | 1172 | ||
1170 | if (get_user(insn, (unsigned char *)(regs->ip + i))) | 1173 | if (get_user(insn, (unsigned char *)(regs->ip + i))) |
1171 | break; | 1174 | break; |
1172 | printk("%02x ", insn); | 1175 | printk(KERN_CONT "%02x ", insn); |
1173 | } | 1176 | } |
1174 | } | 1177 | } |
1178 | printk(KERN_CONT "\n"); | ||
1175 | #endif | 1179 | #endif |
1176 | printk("\n"); | ||
1177 | preempt_disable(); | 1180 | preempt_disable(); |
1178 | show_regs(regs); | 1181 | show_regs(regs); |
1179 | preempt_enable(); | 1182 | preempt_enable(); |
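print_fatal_signal() now tags its byte-dump fragments with KERN_CONT so the log keeps them on one record instead of splitting each byte onto its own line. A hedged minimal sketch of the idiom:

#include <linux/printk.h>

static void example_dump_code(const unsigned char *buf, int len)
{
	int i;

	printk(KERN_INFO "code bytes:");
	for (i = 0; i < len; i++)
		printk(KERN_CONT " %02x", buf[i]);
	printk(KERN_CONT "\n");
}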
@@ -2399,6 +2402,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, | |||
2399 | tracehook_signal_handler(sig, info, ka, regs, stepping); | 2402 | tracehook_signal_handler(sig, info, ka, regs, stepping); |
2400 | } | 2403 | } |
2401 | 2404 | ||
2405 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | ||
2406 | { | ||
2407 | if (failed) | ||
2408 | force_sigsegv(ksig->sig, current); | ||
2409 | else | ||
2410 | signal_delivered(ksig->sig, &ksig->info, &ksig->ka, | ||
2411 | signal_pt_regs(), stepping); | ||
2412 | } | ||
2413 | |||
2402 | /* | 2414 | /* |
2403 | * It could be that complete_signal() picked us to notify about the | 2415 | * It could be that complete_signal() picked us to notify about the |
2404 | * group-wide signal. Other threads should be notified now to take | 2416 | * group-wide signal. Other threads should be notified now to take |
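signal_setup_done() folds the common "force SIGSEGV on frame-setup failure, otherwise call signal_delivered()" tail shared by every architecture into one helper. A hedged sketch of how an arch signal path might use it, assuming struct ksignal comes from <linux/signal.h> in this tree; setup_example_frame() is a hypothetical stand-in for the arch frame writer.

#include <linux/ptrace.h>
#include <linux/signal.h>

/* Hypothetical arch helper: writes the user-space signal frame, nonzero on fault. */
static int setup_example_frame(struct ksignal *ksig, struct pt_regs *regs)
{
	return 0;
}

static void example_handle_signal(struct ksignal *ksig, struct pt_regs *regs,
				  int stepping)
{
	int failed = setup_example_frame(ksig, regs);

	/* Forces SIGSEGV on failure, otherwise finishes delivery bookkeeping. */
	signal_setup_done(failed, ksig, stepping);
}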
@@ -2616,40 +2628,95 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, | |||
2616 | return 0; | 2628 | return 0; |
2617 | } | 2629 | } |
2618 | 2630 | ||
2619 | long do_sigpending(void __user *set, unsigned long sigsetsize) | 2631 | #ifdef CONFIG_COMPAT |
2632 | COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, | ||
2633 | compat_sigset_t __user *, oset, compat_size_t, sigsetsize) | ||
2620 | { | 2634 | { |
2621 | long error = -EINVAL; | 2635 | #ifdef __BIG_ENDIAN |
2622 | sigset_t pending; | 2636 | sigset_t old_set = current->blocked; |
2637 | |||
2638 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
2639 | if (sigsetsize != sizeof(sigset_t)) | ||
2640 | return -EINVAL; | ||
2641 | |||
2642 | if (nset) { | ||
2643 | compat_sigset_t new32; | ||
2644 | sigset_t new_set; | ||
2645 | int error; | ||
2646 | if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) | ||
2647 | return -EFAULT; | ||
2648 | |||
2649 | sigset_from_compat(&new_set, &new32); | ||
2650 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2651 | |||
2652 | error = sigprocmask(how, &new_set, NULL); | ||
2653 | if (error) | ||
2654 | return error; | ||
2655 | } | ||
2656 | if (oset) { | ||
2657 | compat_sigset_t old32; | ||
2658 | sigset_to_compat(&old32, &old_set); | ||
2659 | if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) | ||
2660 | return -EFAULT; | ||
2661 | } | ||
2662 | return 0; | ||
2663 | #else | ||
2664 | return sys_rt_sigprocmask(how, (sigset_t __user *)nset, | ||
2665 | (sigset_t __user *)oset, sigsetsize); | ||
2666 | #endif | ||
2667 | } | ||
2668 | #endif | ||
2623 | 2669 | ||
2670 | static int do_sigpending(void *set, unsigned long sigsetsize) | ||
2671 | { | ||
2624 | if (sigsetsize > sizeof(sigset_t)) | 2672 | if (sigsetsize > sizeof(sigset_t)) |
2625 | goto out; | 2673 | return -EINVAL; |
2626 | 2674 | ||
2627 | spin_lock_irq(¤t->sighand->siglock); | 2675 | spin_lock_irq(¤t->sighand->siglock); |
2628 | sigorsets(&pending, ¤t->pending.signal, | 2676 | sigorsets(set, ¤t->pending.signal, |
2629 | ¤t->signal->shared_pending.signal); | 2677 | ¤t->signal->shared_pending.signal); |
2630 | spin_unlock_irq(¤t->sighand->siglock); | 2678 | spin_unlock_irq(¤t->sighand->siglock); |
2631 | 2679 | ||
2632 | /* Outside the lock because only this thread touches it. */ | 2680 | /* Outside the lock because only this thread touches it. */ |
2633 | sigandsets(&pending, ¤t->blocked, &pending); | 2681 | sigandsets(set, ¤t->blocked, set); |
2634 | 2682 | return 0; | |
2635 | error = -EFAULT; | ||
2636 | if (!copy_to_user(set, &pending, sigsetsize)) | ||
2637 | error = 0; | ||
2638 | |||
2639 | out: | ||
2640 | return error; | ||
2641 | } | 2683 | } |
2642 | 2684 | ||
2643 | /** | 2685 | /** |
2644 | * sys_rt_sigpending - examine a pending signal that has been raised | 2686 | * sys_rt_sigpending - examine a pending signal that has been raised |
2645 | * while blocked | 2687 | * while blocked |
2646 | * @set: stores pending signals | 2688 | * @uset: stores pending signals |
2647 | * @sigsetsize: size of sigset_t type or larger | 2689 | * @sigsetsize: size of sigset_t type or larger |
2648 | */ | 2690 | */ |
2649 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) | 2691 | SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) |
2650 | { | 2692 | { |
2651 | return do_sigpending(set, sigsetsize); | 2693 | sigset_t set; |
2694 | int err = do_sigpending(&set, sigsetsize); | ||
2695 | if (!err && copy_to_user(uset, &set, sigsetsize)) | ||
2696 | err = -EFAULT; | ||
2697 | return err; | ||
2698 | } | ||
2699 | |||
2700 | #ifdef CONFIG_COMPAT | ||
2701 | COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, | ||
2702 | compat_size_t, sigsetsize) | ||
2703 | { | ||
2704 | #ifdef __BIG_ENDIAN | ||
2705 | sigset_t set; | ||
2706 | int err = do_sigpending(&set, sigsetsize); | ||
2707 | if (!err) { | ||
2708 | compat_sigset_t set32; | ||
2709 | sigset_to_compat(&set32, &set); | ||
2710 | /* we can get here only if sigsetsize <= sizeof(set) */ | ||
2711 | if (copy_to_user(uset, &set32, sigsetsize)) | ||
2712 | err = -EFAULT; | ||
2713 | } | ||
2714 | return err; | ||
2715 | #else | ||
2716 | return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); | ||
2717 | #endif | ||
2652 | } | 2718 | } |
2719 | #endif | ||
2653 | 2720 | ||
2654 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER | 2721 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER |
2655 | 2722 | ||
@@ -2927,6 +2994,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) | |||
2927 | return do_tkill(0, pid, sig); | 2994 | return do_tkill(0, pid, sig); |
2928 | } | 2995 | } |
2929 | 2996 | ||
2997 | static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) | ||
2998 | { | ||
2999 | /* Not even root can pretend to send signals from the kernel. | ||
3000 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | ||
3001 | */ | ||
3002 | if ((info->si_code >= 0 || info->si_code == SI_TKILL) && | ||
3003 | (task_pid_vnr(current) != pid)) { | ||
3004 | /* We used to allow any < 0 si_code */ | ||
3005 | WARN_ON_ONCE(info->si_code < 0); | ||
3006 | return -EPERM; | ||
3007 | } | ||
3008 | info->si_signo = sig; | ||
3009 | |||
3010 | /* POSIX.1b doesn't mention process groups. */ | ||
3011 | return kill_proc_info(sig, info, pid); | ||
3012 | } | ||
3013 | |||
2930 | /** | 3014 | /** |
2931 | * sys_rt_sigqueueinfo - send signal information to a signal | 3015 | * sys_rt_sigqueueinfo - send signal information to a signal |
2932 | * @pid: the PID of the thread | 3016 | * @pid: the PID of the thread |
@@ -2937,25 +3021,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, | |||
2937 | siginfo_t __user *, uinfo) | 3021 | siginfo_t __user *, uinfo) |
2938 | { | 3022 | { |
2939 | siginfo_t info; | 3023 | siginfo_t info; |
2940 | |||
2941 | if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) | 3024 | if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) |
2942 | return -EFAULT; | 3025 | return -EFAULT; |
3026 | return do_rt_sigqueueinfo(pid, sig, &info); | ||
3027 | } | ||
2943 | 3028 | ||
2944 | /* Not even root can pretend to send signals from the kernel. | 3029 | #ifdef CONFIG_COMPAT |
2945 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | 3030 | COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, |
2946 | */ | 3031 | compat_pid_t, pid, |
2947 | if (info.si_code >= 0 || info.si_code == SI_TKILL) { | 3032 | int, sig, |
2948 | /* We used to allow any < 0 si_code */ | 3033 | struct compat_siginfo __user *, uinfo) |
2949 | WARN_ON_ONCE(info.si_code < 0); | 3034 | { |
2950 | return -EPERM; | 3035 | siginfo_t info; |
2951 | } | 3036 | int ret = copy_siginfo_from_user32(&info, uinfo); |
2952 | info.si_signo = sig; | 3037 | if (unlikely(ret)) |
2953 | 3038 | return ret; | |
2954 | /* POSIX.1b doesn't mention process groups. */ | 3039 | return do_rt_sigqueueinfo(pid, sig, &info); |
2955 | return kill_proc_info(sig, &info, pid); | ||
2956 | } | 3040 | } |
3041 | #endif | ||
2957 | 3042 | ||
2958 | long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | 3043 | static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) |
2959 | { | 3044 | { |
2960 | /* This is only valid for single tasks */ | 3045 | /* This is only valid for single tasks */ |
2961 | if (pid <= 0 || tgid <= 0) | 3046 | if (pid <= 0 || tgid <= 0) |
@@ -2964,7 +3049,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | |||
2964 | /* Not even root can pretend to send signals from the kernel. | 3049 | /* Not even root can pretend to send signals from the kernel. |
2965 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | 3050 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
2966 | */ | 3051 | */ |
2967 | if (info->si_code >= 0 || info->si_code == SI_TKILL) { | 3052 | if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && |
3053 | (task_pid_vnr(current) != pid)) { | ||
2968 | /* We used to allow any < 0 si_code */ | 3054 | /* We used to allow any < 0 si_code */ |
2969 | WARN_ON_ONCE(info->si_code < 0); | 3055 | WARN_ON_ONCE(info->si_code < 0); |
2970 | return -EPERM; | 3056 | return -EPERM; |
@@ -2985,6 +3071,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, | |||
2985 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); | 3071 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); |
2986 | } | 3072 | } |
2987 | 3073 | ||
3074 | #ifdef CONFIG_COMPAT | ||
3075 | COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, | ||
3076 | compat_pid_t, tgid, | ||
3077 | compat_pid_t, pid, | ||
3078 | int, sig, | ||
3079 | struct compat_siginfo __user *, uinfo) | ||
3080 | { | ||
3081 | siginfo_t info; | ||
3082 | |||
3083 | if (copy_siginfo_from_user32(&info, uinfo)) | ||
3084 | return -EFAULT; | ||
3085 | return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); | ||
3086 | } | ||
3087 | #endif | ||
3088 | |||
2988 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3089 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
2989 | { | 3090 | { |
2990 | struct task_struct *t = current; | 3091 | struct task_struct *t = current; |
@@ -3030,7 +3131,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
3030 | return 0; | 3131 | return 0; |
3031 | } | 3132 | } |
3032 | 3133 | ||
3033 | int | 3134 | static int |
3034 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | 3135 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) |
3035 | { | 3136 | { |
3036 | stack_t oss; | 3137 | stack_t oss; |
@@ -3095,12 +3196,10 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s | |||
3095 | out: | 3196 | out: |
3096 | return error; | 3197 | return error; |
3097 | } | 3198 | } |
3098 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
3099 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) | 3199 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) |
3100 | { | 3200 | { |
3101 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); | 3201 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); |
3102 | } | 3202 | } |
3103 | #endif | ||
3104 | 3203 | ||
3105 | int restore_altstack(const stack_t __user *uss) | 3204 | int restore_altstack(const stack_t __user *uss) |
3106 | { | 3205 | { |
@@ -3118,7 +3217,6 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) | |||
3118 | } | 3217 | } |
3119 | 3218 | ||
3120 | #ifdef CONFIG_COMPAT | 3219 | #ifdef CONFIG_COMPAT |
3121 | #ifdef CONFIG_GENERIC_SIGALTSTACK | ||
3122 | COMPAT_SYSCALL_DEFINE2(sigaltstack, | 3220 | COMPAT_SYSCALL_DEFINE2(sigaltstack, |
3123 | const compat_stack_t __user *, uss_ptr, | 3221 | const compat_stack_t __user *, uss_ptr, |
3124 | compat_stack_t __user *, uoss_ptr) | 3222 | compat_stack_t __user *, uoss_ptr) |
@@ -3168,7 +3266,6 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | |||
3168 | __put_user(t->sas_ss_size, &uss->ss_size); | 3266 | __put_user(t->sas_ss_size, &uss->ss_size); |
3169 | } | 3267 | } |
3170 | #endif | 3268 | #endif |
3171 | #endif | ||
3172 | 3269 | ||
3173 | #ifdef __ARCH_WANT_SYS_SIGPENDING | 3270 | #ifdef __ARCH_WANT_SYS_SIGPENDING |
3174 | 3271 | ||
@@ -3178,7 +3275,7 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | |||
3178 | */ | 3275 | */ |
3179 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | 3276 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) |
3180 | { | 3277 | { |
3181 | return do_sigpending(set, sizeof(*set)); | 3278 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); |
3182 | } | 3279 | } |
3183 | 3280 | ||
3184 | #endif | 3281 | #endif |
@@ -3234,7 +3331,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
3234 | } | 3331 | } |
3235 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | 3332 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ |
3236 | 3333 | ||
3237 | #ifdef __ARCH_WANT_SYS_RT_SIGACTION | 3334 | #ifndef CONFIG_ODD_RT_SIGACTION |
3238 | /** | 3335 | /** |
3239 | * sys_rt_sigaction - alter an action taken by a process | 3336 | * sys_rt_sigaction - alter an action taken by a process |
3240 | * @sig: signal to be sent | 3337 | * @sig: signal to be sent |
@@ -3268,7 +3365,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, | |||
3268 | out: | 3365 | out: |
3269 | return ret; | 3366 | return ret; |
3270 | } | 3367 | } |
3271 | #endif /* __ARCH_WANT_SYS_RT_SIGACTION */ | 3368 | #ifdef CONFIG_COMPAT |
3369 | COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, | ||
3370 | const struct compat_sigaction __user *, act, | ||
3371 | struct compat_sigaction __user *, oact, | ||
3372 | compat_size_t, sigsetsize) | ||
3373 | { | ||
3374 | struct k_sigaction new_ka, old_ka; | ||
3375 | compat_sigset_t mask; | ||
3376 | #ifdef __ARCH_HAS_SA_RESTORER | ||
3377 | compat_uptr_t restorer; | ||
3378 | #endif | ||
3379 | int ret; | ||
3380 | |||
3381 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
3382 | if (sigsetsize != sizeof(compat_sigset_t)) | ||
3383 | return -EINVAL; | ||
3384 | |||
3385 | if (act) { | ||
3386 | compat_uptr_t handler; | ||
3387 | ret = get_user(handler, &act->sa_handler); | ||
3388 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
3389 | #ifdef __ARCH_HAS_SA_RESTORER | ||
3390 | ret |= get_user(restorer, &act->sa_restorer); | ||
3391 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
3392 | #endif | ||
3393 | ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); | ||
3394 | ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); | ||
3395 | if (ret) | ||
3396 | return -EFAULT; | ||
3397 | sigset_from_compat(&new_ka.sa.sa_mask, &mask); | ||
3398 | } | ||
3399 | |||
3400 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
3401 | if (!ret && oact) { | ||
3402 | sigset_to_compat(&mask, &old_ka.sa.sa_mask); | ||
3403 | ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), | ||
3404 | &oact->sa_handler); | ||
3405 | ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); | ||
3406 | ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); | ||
3407 | #ifdef __ARCH_HAS_SA_RESTORER | ||
3408 | ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), | ||
3409 | &oact->sa_restorer); | ||
3410 | #endif | ||
3411 | } | ||
3412 | return ret; | ||
3413 | } | ||
3414 | #endif | ||
3415 | #endif /* !CONFIG_ODD_RT_SIGACTION */ | ||
3416 | |||
3417 | #ifdef CONFIG_OLD_SIGACTION | ||
3418 | SYSCALL_DEFINE3(sigaction, int, sig, | ||
3419 | const struct old_sigaction __user *, act, | ||
3420 | struct old_sigaction __user *, oact) | ||
3421 | { | ||
3422 | struct k_sigaction new_ka, old_ka; | ||
3423 | int ret; | ||
3424 | |||
3425 | if (act) { | ||
3426 | old_sigset_t mask; | ||
3427 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
3428 | __get_user(new_ka.sa.sa_handler, &act->sa_handler) || | ||
3429 | __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || | ||
3430 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
3431 | __get_user(mask, &act->sa_mask)) | ||
3432 | return -EFAULT; | ||
3433 | #ifdef __ARCH_HAS_KA_RESTORER | ||
3434 | new_ka.ka_restorer = NULL; | ||
3435 | #endif | ||
3436 | siginitset(&new_ka.sa.sa_mask, mask); | ||
3437 | } | ||
3438 | |||
3439 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
3440 | |||
3441 | if (!ret && oact) { | ||
3442 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
3443 | __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || | ||
3444 | __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || | ||
3445 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
3446 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) | ||
3447 | return -EFAULT; | ||
3448 | } | ||
3449 | |||
3450 | return ret; | ||
3451 | } | ||
3452 | #endif | ||
3453 | #ifdef CONFIG_COMPAT_OLD_SIGACTION | ||
3454 | COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, | ||
3455 | const struct compat_old_sigaction __user *, act, | ||
3456 | struct compat_old_sigaction __user *, oact) | ||
3457 | { | ||
3458 | struct k_sigaction new_ka, old_ka; | ||
3459 | int ret; | ||
3460 | compat_old_sigset_t mask; | ||
3461 | compat_uptr_t handler, restorer; | ||
3462 | |||
3463 | if (act) { | ||
3464 | if (!access_ok(VERIFY_READ, act, sizeof(*act)) || | ||
3465 | __get_user(handler, &act->sa_handler) || | ||
3466 | __get_user(restorer, &act->sa_restorer) || | ||
3467 | __get_user(new_ka.sa.sa_flags, &act->sa_flags) || | ||
3468 | __get_user(mask, &act->sa_mask)) | ||
3469 | return -EFAULT; | ||
3470 | |||
3471 | #ifdef __ARCH_HAS_KA_RESTORER | ||
3472 | new_ka.ka_restorer = NULL; | ||
3473 | #endif | ||
3474 | new_ka.sa.sa_handler = compat_ptr(handler); | ||
3475 | new_ka.sa.sa_restorer = compat_ptr(restorer); | ||
3476 | siginitset(&new_ka.sa.sa_mask, mask); | ||
3477 | } | ||
3478 | |||
3479 | ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); | ||
3480 | |||
3481 | if (!ret && oact) { | ||
3482 | if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || | ||
3483 | __put_user(ptr_to_compat(old_ka.sa.sa_handler), | ||
3484 | &oact->sa_handler) || | ||
3485 | __put_user(ptr_to_compat(old_ka.sa.sa_restorer), | ||
3486 | &oact->sa_restorer) || | ||
3487 | __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || | ||
3488 | __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) | ||
3489 | return -EFAULT; | ||
3490 | } | ||
3491 | return ret; | ||
3492 | } | ||
3493 | #endif | ||
3272 | 3494 | ||
3273 | #ifdef __ARCH_WANT_SYS_SGETMASK | 3495 | #ifdef __ARCH_WANT_SYS_SGETMASK |
3274 | 3496 | ||
@@ -3336,7 +3558,6 @@ int sigsuspend(sigset_t *set) | |||
3336 | return -ERESTARTNOHAND; | 3558 | return -ERESTARTNOHAND; |
3337 | } | 3559 | } |
3338 | 3560 | ||
3339 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | ||
3340 | /** | 3561 | /** |
3341 | * sys_rt_sigsuspend - replace the signal mask for a value with the | 3562 | * sys_rt_sigsuspend - replace the signal mask for a value with the |
3342 | * @unewset value until a signal is received | 3563 | * @unewset value until a signal is received |
@@ -3355,7 +3576,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
3355 | return -EFAULT; | 3576 | return -EFAULT; |
3356 | return sigsuspend(&newset); | 3577 | return sigsuspend(&newset); |
3357 | } | 3578 | } |
3358 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 3579 | |
3580 | #ifdef CONFIG_COMPAT | ||
3581 | COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) | ||
3582 | { | ||
3583 | #ifdef __BIG_ENDIAN | ||
3584 | sigset_t newset; | ||
3585 | compat_sigset_t newset32; | ||
3586 | |||
3587 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
3588 | if (sigsetsize != sizeof(sigset_t)) | ||
3589 | return -EINVAL; | ||
3590 | |||
3591 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | ||
3592 | return -EFAULT; | ||
3593 | sigset_from_compat(&newset, &newset32); | ||
3594 | return sigsuspend(&newset); | ||
3595 | #else | ||
3596 | /* on little-endian bitmaps don't care about granularity */ | ||
3597 | return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); | ||
3598 | #endif | ||
3599 | } | ||
3600 | #endif | ||
3601 | |||
3602 | #ifdef CONFIG_OLD_SIGSUSPEND | ||
3603 | SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) | ||
3604 | { | ||
3605 | sigset_t blocked; | ||
3606 | siginitset(&blocked, mask); | ||
3607 | return sigsuspend(&blocked); | ||
3608 | } | ||
3609 | #endif | ||
3610 | #ifdef CONFIG_OLD_SIGSUSPEND3 | ||
3611 | SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) | ||
3612 | { | ||
3613 | sigset_t blocked; | ||
3614 | siginitset(&blocked, mask); | ||
3615 | return sigsuspend(&blocked); | ||
3616 | } | ||
3617 | #endif | ||
3359 | 3618 | ||
3360 | __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) | 3619 | __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) |
3361 | { | 3620 | { |
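
The signal.c hunks above pull the compat entry points (rt_sigprocmask, rt_sigpending, rt_sigqueueinfo, rt_tgsigqueueinfo, rt_sigaction, sigsuspend) into generic code. For the mask-passing calls the little-endian branches simply forward to the native syscall, while the big-endian branches must recombine the two 32-bit halves of the compat mask before use (and split them again on the way out); the sigaction wrappers always convert field by field. Below is a minimal userspace sketch of that recombination, assuming stand-in types: the two-word compat set and the 64-bit native set are illustrative, not the kernel's definitions.

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for compat_sigset_t (two 32-bit words) and a 64-bit
     * native sigset_t; layouts are illustrative only. */
    struct compat_set_demo { uint32_t sig[2]; };
    struct native_set_demo { uint64_t sig[1]; };

    /* Rough equivalent of sigset_from_compat() for the 64-bit, 2-word
     * case: sig[0] is defined as the low word, so on big-endian a plain
     * memcpy would swap the halves, hence the explicit shift-and-or. */
    static void set_from_compat_demo(struct native_set_demo *d,
                                     const struct compat_set_demo *c)
    {
        d->sig[0] = ((uint64_t)c->sig[1] << 32) | c->sig[0];
    }

    int main(void)
    {
        struct compat_set_demo c = { { 0x00000005u, 0x00000001u } };
        struct native_set_demo d;

        set_from_compat_demo(&d, &c);
        printf("native mask: 0x%016llx\n", (unsigned long long)d.sig[0]);
        return 0;
    }

The __BIG_ENDIAN branches above do this conversion (and the reverse via sigset_to_compat) around the shared helpers, while the little-endian branches cast the user pointer and forward to the native syscall.
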
diff --git a/kernel/smp.c b/kernel/smp.c
index 69f38bd98b42..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,22 +16,12 @@ | |||
16 | #include "smpboot.h" | 16 | #include "smpboot.h" |
17 | 17 | ||
18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
19 | static struct { | ||
20 | struct list_head queue; | ||
21 | raw_spinlock_t lock; | ||
22 | } call_function __cacheline_aligned_in_smp = | ||
23 | { | ||
24 | .queue = LIST_HEAD_INIT(call_function.queue), | ||
25 | .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), | ||
26 | }; | ||
27 | |||
28 | enum { | 19 | enum { |
29 | CSD_FLAG_LOCK = 0x01, | 20 | CSD_FLAG_LOCK = 0x01, |
30 | }; | 21 | }; |
31 | 22 | ||
32 | struct call_function_data { | 23 | struct call_function_data { |
33 | struct call_single_data csd; | 24 | struct call_single_data __percpu *csd; |
34 | atomic_t refs; | ||
35 | cpumask_var_t cpumask; | 25 | cpumask_var_t cpumask; |
36 | cpumask_var_t cpumask_ipi; | 26 | cpumask_var_t cpumask_ipi; |
37 | }; | 27 | }; |
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
60 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, | 50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, |
61 | cpu_to_node(cpu))) | 51 | cpu_to_node(cpu))) |
62 | return notifier_from_errno(-ENOMEM); | 52 | return notifier_from_errno(-ENOMEM); |
53 | cfd->csd = alloc_percpu(struct call_single_data); | ||
54 | if (!cfd->csd) { | ||
55 | free_cpumask_var(cfd->cpumask); | ||
56 | return notifier_from_errno(-ENOMEM); | ||
57 | } | ||
63 | break; | 58 | break; |
64 | 59 | ||
65 | #ifdef CONFIG_HOTPLUG_CPU | 60 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
70 | case CPU_DEAD_FROZEN: | 65 | case CPU_DEAD_FROZEN: |
71 | free_cpumask_var(cfd->cpumask); | 66 | free_cpumask_var(cfd->cpumask); |
72 | free_cpumask_var(cfd->cpumask_ipi); | 67 | free_cpumask_var(cfd->cpumask_ipi); |
68 | free_percpu(cfd->csd); | ||
73 | break; | 69 | break; |
74 | #endif | 70 | #endif |
75 | }; | 71 | }; |
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) | |||
171 | } | 167 | } |
172 | 168 | ||
173 | /* | 169 | /* |
174 | * Invoked by arch to handle an IPI for call function. Must be called with | ||
175 | * interrupts disabled. | ||
176 | */ | ||
177 | void generic_smp_call_function_interrupt(void) | ||
178 | { | ||
179 | struct call_function_data *data; | ||
180 | int cpu = smp_processor_id(); | ||
181 | |||
182 | /* | ||
183 | * Shouldn't receive this interrupt on a cpu that is not yet online. | ||
184 | */ | ||
185 | WARN_ON_ONCE(!cpu_online(cpu)); | ||
186 | |||
187 | /* | ||
188 | * Ensure entry is visible on call_function_queue after we have | ||
189 | * entered the IPI. See comment in smp_call_function_many. | ||
190 | * If we don't have this, then we may miss an entry on the list | ||
191 | * and never get another IPI to process it. | ||
192 | */ | ||
193 | smp_mb(); | ||
194 | |||
195 | /* | ||
196 | * It's ok to use list_for_each_rcu() here even though we may | ||
197 | * delete 'pos', since list_del_rcu() doesn't clear ->next | ||
198 | */ | ||
199 | list_for_each_entry_rcu(data, &call_function.queue, csd.list) { | ||
200 | int refs; | ||
201 | smp_call_func_t func; | ||
202 | |||
203 | /* | ||
204 | * Since we walk the list without any locks, we might | ||
205 | * see an entry that was completed, removed from the | ||
206 | * list and is in the process of being reused. | ||
207 | * | ||
208 | * We must check that the cpu is in the cpumask before | ||
209 | * checking the refs, and both must be set before | ||
210 | * executing the callback on this cpu. | ||
211 | */ | ||
212 | |||
213 | if (!cpumask_test_cpu(cpu, data->cpumask)) | ||
214 | continue; | ||
215 | |||
216 | smp_rmb(); | ||
217 | |||
218 | if (atomic_read(&data->refs) == 0) | ||
219 | continue; | ||
220 | |||
221 | func = data->csd.func; /* save for later warn */ | ||
222 | func(data->csd.info); | ||
223 | |||
224 | /* | ||
225 | * If the cpu mask is not still set then func enabled | ||
226 | * interrupts (BUG), and this cpu took another smp call | ||
227 | * function interrupt and executed func(info) twice | ||
228 | * on this cpu. That nested execution decremented refs. | ||
229 | */ | ||
230 | if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { | ||
231 | WARN(1, "%pf enabled interrupts and double executed\n", func); | ||
232 | continue; | ||
233 | } | ||
234 | |||
235 | refs = atomic_dec_return(&data->refs); | ||
236 | WARN_ON(refs < 0); | ||
237 | |||
238 | if (refs) | ||
239 | continue; | ||
240 | |||
241 | WARN_ON(!cpumask_empty(data->cpumask)); | ||
242 | |||
243 | raw_spin_lock(&call_function.lock); | ||
244 | list_del_rcu(&data->csd.list); | ||
245 | raw_spin_unlock(&call_function.lock); | ||
246 | |||
247 | csd_unlock(&data->csd); | ||
248 | } | ||
249 | |||
250 | } | ||
251 | |||
252 | /* | ||
253 | * Invoked by arch to handle an IPI for call function single. Must be | 170 | * Invoked by arch to handle an IPI for call function single. Must be |
254 | * called from the arch with interrupts disabled. | 171 | * called from the arch with interrupts disabled. |
255 | */ | 172 | */ |
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask, | |||
453 | smp_call_func_t func, void *info, bool wait) | 370 | smp_call_func_t func, void *info, bool wait) |
454 | { | 371 | { |
455 | struct call_function_data *data; | 372 | struct call_function_data *data; |
456 | unsigned long flags; | 373 | int cpu, next_cpu, this_cpu = smp_processor_id(); |
457 | int refs, cpu, next_cpu, this_cpu = smp_processor_id(); | ||
458 | 374 | ||
459 | /* | 375 | /* |
460 | * Can deadlock when called with interrupts disabled. | 376 | * Can deadlock when called with interrupts disabled. |
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask, | |||
486 | } | 402 | } |
487 | 403 | ||
488 | data = &__get_cpu_var(cfd_data); | 404 | data = &__get_cpu_var(cfd_data); |
489 | csd_lock(&data->csd); | ||
490 | |||
491 | /* This BUG_ON verifies our reuse assertions and can be removed */ | ||
492 | BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); | ||
493 | |||
494 | /* | ||
495 | * The global call function queue list add and delete are protected | ||
496 | * by a lock, but the list is traversed without any lock, relying | ||
497 | * on the rcu list add and delete to allow safe concurrent traversal. | ||
498 | * We reuse the call function data without waiting for any grace | ||
499 | * period after some other cpu removes it from the global queue. | ||
500 | * This means a cpu might find our data block as it is being | ||
501 | * filled out. | ||
502 | * | ||
503 | * We hold off the interrupt handler on the other cpu by | ||
504 | * ordering our writes to the cpu mask vs our setting of the | ||
505 | * refs counter. We assert only the cpu owning the data block | ||
506 | * will set a bit in cpumask, and each bit will only be cleared | ||
507 | * by the subject cpu. Each cpu must first find its bit is | ||
508 | * set and then check that refs is set indicating the element is | ||
509 | * ready to be processed, otherwise it must skip the entry. | ||
510 | * | ||
511 | * On the previous iteration refs was set to 0 by another cpu. | ||
512 | * To avoid the use of transitivity, set the counter to 0 here | ||
513 | * so the wmb will pair with the rmb in the interrupt handler. | ||
514 | */ | ||
515 | atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ | ||
516 | |||
517 | data->csd.func = func; | ||
518 | data->csd.info = info; | ||
519 | 405 | ||
520 | /* Ensure 0 refs is visible before mask. Also orders func and info */ | ||
521 | smp_wmb(); | ||
522 | |||
523 | /* We rely on the "and" being processed before the store */ | ||
524 | cpumask_and(data->cpumask, mask, cpu_online_mask); | 406 | cpumask_and(data->cpumask, mask, cpu_online_mask); |
525 | cpumask_clear_cpu(this_cpu, data->cpumask); | 407 | cpumask_clear_cpu(this_cpu, data->cpumask); |
526 | refs = cpumask_weight(data->cpumask); | ||
527 | 408 | ||
528 | /* Some callers race with other cpus changing the passed mask */ | 409 | /* Some callers race with other cpus changing the passed mask */ |
529 | if (unlikely(!refs)) { | 410 | if (unlikely(!cpumask_weight(data->cpumask))) |
530 | csd_unlock(&data->csd); | ||
531 | return; | 411 | return; |
532 | } | ||
533 | 412 | ||
534 | /* | 413 | /* |
535 | * After we put an entry into the list, data->cpumask | 414 | * After we put an entry into the list, data->cpumask |
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask, | |||
537 | * a SMP function call, so data->cpumask will be zero. | 416 | * a SMP function call, so data->cpumask will be zero. |
538 | */ | 417 | */ |
539 | cpumask_copy(data->cpumask_ipi, data->cpumask); | 418 | cpumask_copy(data->cpumask_ipi, data->cpumask); |
540 | raw_spin_lock_irqsave(&call_function.lock, flags); | ||
541 | /* | ||
542 | * Place entry at the _HEAD_ of the list, so that any cpu still | ||
543 | * observing the entry in generic_smp_call_function_interrupt() | ||
544 | * will not miss any other list entries: | ||
545 | */ | ||
546 | list_add_rcu(&data->csd.list, &call_function.queue); | ||
547 | /* | ||
548 | * We rely on the wmb() in list_add_rcu to complete our writes | ||
549 | * to the cpumask before this write to refs, which indicates | ||
550 | * data is on the list and is ready to be processed. | ||
551 | */ | ||
552 | atomic_set(&data->refs, refs); | ||
553 | raw_spin_unlock_irqrestore(&call_function.lock, flags); | ||
554 | 419 | ||
555 | /* | 420 | for_each_cpu(cpu, data->cpumask) { |
556 | * Make the list addition visible before sending the ipi. | 421 | struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); |
557 | * (IPIs must obey or appear to obey normal Linux cache | 422 | struct call_single_queue *dst = |
558 | * coherency rules -- see comment in generic_exec_single). | 423 | &per_cpu(call_single_queue, cpu); |
559 | */ | 424 | unsigned long flags; |
560 | smp_mb(); | 425 | |
426 | csd_lock(csd); | ||
427 | csd->func = func; | ||
428 | csd->info = info; | ||
429 | |||
430 | raw_spin_lock_irqsave(&dst->lock, flags); | ||
431 | list_add_tail(&csd->list, &dst->list); | ||
432 | raw_spin_unlock_irqrestore(&dst->lock, flags); | ||
433 | } | ||
561 | 434 | ||
562 | /* Send a message to all CPUs in the map */ | 435 | /* Send a message to all CPUs in the map */ |
563 | arch_send_call_function_ipi_mask(data->cpumask_ipi); | 436 | arch_send_call_function_ipi_mask(data->cpumask_ipi); |
564 | 437 | ||
565 | /* Optionally wait for the CPUs to complete */ | 438 | if (wait) { |
566 | if (wait) | 439 | for_each_cpu(cpu, data->cpumask) { |
567 | csd_lock_wait(&data->csd); | 440 | struct call_single_data *csd = |
441 | per_cpu_ptr(data->csd, cpu); | ||
442 | csd_lock_wait(csd); | ||
443 | } | ||
444 | } | ||
568 | } | 445 | } |
569 | EXPORT_SYMBOL(smp_call_function_many); | 446 | EXPORT_SYMBOL(smp_call_function_many); |
570 | 447 | ||
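
The smp.c rewrite above removes the single global call_function queue, its refcount and the RCU list walk in the IPI handler, and instead allocates one call_single_data slot per CPU at hotplug time; smp_call_function_many() now locks and queues one csd onto each destination's existing call_single_queue and, when wait is set, waits on each csd individually. A rough userspace model of that per-destination queuing, with hypothetical names and fixed-size arrays standing in for the per-cpu allocations and lists:

    #include <stdio.h>

    #define NR_CPUS_DEMO 4

    /* Stand-in for struct call_single_data. */
    struct csd_demo {
        void (*func)(void *info);
        void *info;
        int locked;                          /* CSD_FLAG_LOCK analogue */
    };

    /* csd[src][dst]: one slot per (requesting cpu, target cpu) pair,
     * mirroring alloc_percpu(struct call_single_data) per source cpu. */
    static struct csd_demo csd[NR_CPUS_DEMO][NR_CPUS_DEMO];

    /* Per-destination FIFO standing in for call_single_queue. */
    static struct csd_demo *queue[NR_CPUS_DEMO][NR_CPUS_DEMO];
    static int queue_len[NR_CPUS_DEMO];

    static void say_hello(void *info)
    {
        printf("callback ran with \"%s\"\n", (const char *)info);
    }

    /* What smp_call_function_many() now does: fill and queue one csd per
     * destination instead of one shared, refcounted entry. */
    static void call_many_demo(int src, void (*func)(void *), void *info)
    {
        for (int dst = 0; dst < NR_CPUS_DEMO; dst++) {
            if (dst == src)
                continue;
            struct csd_demo *c = &csd[src][dst];
            c->func = func;
            c->info = info;
            c->locked = 1;                   /* csd_lock() */
            queue[dst][queue_len[dst]++] = c;
        }
        /* arch_send_call_function_ipi_mask() would be sent here. */
    }

    /* What each destination does in its IPI handler: drain its own list,
     * run the callbacks, release each csd. */
    static void ipi_handler_demo(int dst)
    {
        for (int i = 0; i < queue_len[dst]; i++) {
            struct csd_demo *c = queue[dst][i];
            c->func(c->info);
            c->locked = 0;                   /* csd_unlock() */
        }
        queue_len[dst] = 0;
    }

    int main(void)
    {
        call_many_demo(0, say_hello, "cpu 0 asked");
        for (int dst = 1; dst < NR_CPUS_DEMO; dst++)
            ipi_handler_demo(dst);
        return 0;
    }

Because every queued element belongs to exactly one destination, the cpumask/refcount ordering rules and their memory barriers in the old generic_smp_call_function_interrupt() are no longer needed.
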
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d4abac261779..8eaed9aa9cf0 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -209,6 +209,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp | |||
209 | { | 209 | { |
210 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 210 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
211 | 211 | ||
212 | if (ht->pre_unpark) | ||
213 | ht->pre_unpark(cpu); | ||
212 | kthread_unpark(tsk); | 214 | kthread_unpark(tsk); |
213 | } | 215 | } |
214 | 216 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f5cc25f147a6..14d7758074aa 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip) | |||
195 | EXPORT_SYMBOL(local_bh_enable_ip); | 195 | EXPORT_SYMBOL(local_bh_enable_ip); |
196 | 196 | ||
197 | /* | 197 | /* |
198 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, | 198 | * We restart softirq processing for at most 2 ms, |
199 | * and we fall back to softirqd after that. | 199 | * and if need_resched() is not set. |
200 | * | 200 | * |
201 | * This number has been established via experimentation. | 201 | * These limits have been established via experimentation. |
202 | * The two things to balance is latency against fairness - | 202 | * The two things to balance is latency against fairness - |
203 | * we want to handle softirqs as soon as possible, but they | 203 | * we want to handle softirqs as soon as possible, but they |
204 | * should not be able to lock up the box. | 204 | * should not be able to lock up the box. |
205 | */ | 205 | */ |
206 | #define MAX_SOFTIRQ_RESTART 10 | 206 | #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) |
207 | 207 | ||
208 | asmlinkage void __do_softirq(void) | 208 | asmlinkage void __do_softirq(void) |
209 | { | 209 | { |
210 | struct softirq_action *h; | 210 | struct softirq_action *h; |
211 | __u32 pending; | 211 | __u32 pending; |
212 | int max_restart = MAX_SOFTIRQ_RESTART; | 212 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
213 | int cpu; | 213 | int cpu; |
214 | unsigned long old_flags = current->flags; | 214 | unsigned long old_flags = current->flags; |
215 | 215 | ||
@@ -264,11 +264,12 @@ restart: | |||
264 | local_irq_disable(); | 264 | local_irq_disable(); |
265 | 265 | ||
266 | pending = local_softirq_pending(); | 266 | pending = local_softirq_pending(); |
267 | if (pending && --max_restart) | 267 | if (pending) { |
268 | goto restart; | 268 | if (time_before(jiffies, end) && !need_resched()) |
269 | goto restart; | ||
269 | 270 | ||
270 | if (pending) | ||
271 | wakeup_softirqd(); | 271 | wakeup_softirqd(); |
272 | } | ||
272 | 273 | ||
273 | lockdep_softirq_exit(); | 274 | lockdep_softirq_exit(); |
274 | 275 | ||
@@ -322,18 +323,10 @@ void irq_enter(void) | |||
322 | 323 | ||
323 | static inline void invoke_softirq(void) | 324 | static inline void invoke_softirq(void) |
324 | { | 325 | { |
325 | if (!force_irqthreads) { | 326 | if (!force_irqthreads) |
326 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | ||
327 | __do_softirq(); | 327 | __do_softirq(); |
328 | #else | 328 | else |
329 | do_softirq(); | ||
330 | #endif | ||
331 | } else { | ||
332 | __local_bh_disable((unsigned long)__builtin_return_address(0), | ||
333 | SOFTIRQ_OFFSET); | ||
334 | wakeup_softirqd(); | 329 | wakeup_softirqd(); |
335 | __local_bh_enable(SOFTIRQ_OFFSET); | ||
336 | } | ||
337 | } | 330 | } |
338 | 331 | ||
339 | /* | 332 | /* |
@@ -341,9 +334,15 @@ static inline void invoke_softirq(void) | |||
341 | */ | 334 | */ |
342 | void irq_exit(void) | 335 | void irq_exit(void) |
343 | { | 336 | { |
337 | #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED | ||
338 | local_irq_disable(); | ||
339 | #else | ||
340 | WARN_ON_ONCE(!irqs_disabled()); | ||
341 | #endif | ||
342 | |||
344 | account_irq_exit_time(current); | 343 | account_irq_exit_time(current); |
345 | trace_hardirq_exit(); | 344 | trace_hardirq_exit(); |
346 | sub_preempt_count(IRQ_EXIT_OFFSET); | 345 | sub_preempt_count(HARDIRQ_OFFSET); |
347 | if (!in_interrupt() && local_softirq_pending()) | 346 | if (!in_interrupt() && local_softirq_pending()) |
348 | invoke_softirq(); | 347 | invoke_softirq(); |
349 | 348 | ||
@@ -353,7 +352,6 @@ void irq_exit(void) | |||
353 | tick_nohz_irq_exit(); | 352 | tick_nohz_irq_exit(); |
354 | #endif | 353 | #endif |
355 | rcu_irq_exit(); | 354 | rcu_irq_exit(); |
356 | sched_preempt_enable_no_resched(); | ||
357 | } | 355 | } |
358 | 356 | ||
359 | /* | 357 | /* |
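
The softirq.c change above replaces the fixed MAX_SOFTIRQ_RESTART=10 restart counter with a 2 ms time budget and also defers to ksoftirqd when need_resched() is set; the follow-on irq_exit() hunk requires interrupts to be disabled on entry (warning or disabling them depending on __ARCH_IRQ_EXIT_IRQS_DISABLED). A small userspace sketch of the new loop shape, assuming CLOCK_MONOTONIC and a dummy backlog in place of real softirq handlers:

    #include <stdio.h>
    #include <time.h>

    static long long now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main(void)
    {
        /* 2 ms budget, matching MAX_SOFTIRQ_TIME above. */
        long long end = now_ns() + 2 * 1000000LL;
        long pending = 50 * 1000 * 1000;     /* dummy backlog */
        long handled = 0;

        while (pending > 0) {
            pending--;                       /* "handle" one item */
            handled++;
            /* Old code: stop after 10 restarts regardless of elapsed
             * time. New code: stop when the budget is spent (or when
             * need_resched() is set, not modelled here). */
            if (now_ns() >= end)
                break;
        }
        if (pending > 0)
            printf("handled %ld, deferring %ld to the softirq thread\n",
                   handled, pending);
        else
            printf("handled all %ld items within the budget\n", handled);
        return 0;
    }
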
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 95d178c62d5a..c09f2955ae30 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = { | |||
336 | .create = cpu_stop_create, | 336 | .create = cpu_stop_create, |
337 | .setup = cpu_stop_unpark, | 337 | .setup = cpu_stop_unpark, |
338 | .park = cpu_stop_park, | 338 | .park = cpu_stop_park, |
339 | .unpark = cpu_stop_unpark, | 339 | .pre_unpark = cpu_stop_unpark, |
340 | .selfparking = true, | 340 | .selfparking = true, |
341 | }; | 341 | }; |
342 | 342 | ||
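
Taken together, the smpboot.c and stop_machine.c hunks above add an optional pre_unpark() callback that runs before kthread_unpark(), and switch cpu_stop_threads from .unpark to .pre_unpark so cpu_stop_unpark() now runs before the parked thread is released rather than afterwards. A compact sketch of that optional-hook pattern, with illustrative names rather than the real smp_hotplug_thread API:

    #include <stdio.h>

    /* Illustrative client descriptor: pre_unpark may be NULL. */
    struct hotplug_client_demo {
        const char *name;
        void (*pre_unpark)(unsigned int cpu);
    };

    static void stopper_enable_demo(unsigned int cpu)
    {
        printf("enabling stopper for cpu %u before unpark\n", cpu);
    }

    /* Rough equivalent of smpboot_unpark_thread(): run the hook first if
     * the client registered one, then release the parked thread. */
    static void unpark_demo(const struct hotplug_client_demo *ht,
                            unsigned int cpu)
    {
        if (ht->pre_unpark)
            ht->pre_unpark(cpu);
        printf("kthread_unpark(%s/%u)\n", ht->name, cpu);
    }

    int main(void)
    {
        struct hotplug_client_demo stopper = {
            .name = "migration",
            .pre_unpark = stopper_enable_demo,
        };
        struct hotplug_client_demo plain = { .name = "ksoftirqd" };

        unpark_demo(&stopper, 1);
        unpark_demo(&plain, 1);
        return 0;
    }
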
diff --git a/kernel/sys.c b/kernel/sys.c
index 265b37690421..81f56445fba9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/syscalls.h> | 47 | #include <linux/syscalls.h> |
48 | #include <linux/kprobes.h> | 48 | #include <linux/kprobes.h> |
49 | #include <linux/user_namespace.h> | 49 | #include <linux/user_namespace.h> |
50 | #include <linux/binfmts.h> | ||
50 | 51 | ||
51 | #include <linux/kmsg_dump.h> | 52 | #include <linux/kmsg_dump.h> |
52 | /* Move somewhere else to avoid recompiling? */ | 53 | /* Move somewhere else to avoid recompiling? */ |
@@ -433,11 +434,12 @@ static DEFINE_MUTEX(reboot_mutex); | |||
433 | SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | 434 | SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, |
434 | void __user *, arg) | 435 | void __user *, arg) |
435 | { | 436 | { |
437 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | ||
436 | char buffer[256]; | 438 | char buffer[256]; |
437 | int ret = 0; | 439 | int ret = 0; |
438 | 440 | ||
439 | /* We only trust the superuser with rebooting the system. */ | 441 | /* We only trust the superuser with rebooting the system. */ |
440 | if (!capable(CAP_SYS_BOOT)) | 442 | if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) |
441 | return -EPERM; | 443 | return -EPERM; |
442 | 444 | ||
443 | /* For safety, we require "magic" arguments. */ | 445 | /* For safety, we require "magic" arguments. */ |
@@ -453,7 +455,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | |||
453 | * pid_namespace, the command is handled by reboot_pid_ns() which will | 455 | * pid_namespace, the command is handled by reboot_pid_ns() which will |
454 | * call do_exit(). | 456 | * call do_exit(). |
455 | */ | 457 | */ |
456 | ret = reboot_pid_ns(task_active_pid_ns(current), cmd); | 458 | ret = reboot_pid_ns(pid_ns, cmd); |
457 | if (ret) | 459 | if (ret) |
458 | return ret; | 460 | return ret; |
459 | 461 | ||
@@ -1792,14 +1794,14 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1792 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1794 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1793 | { | 1795 | { |
1794 | struct fd exe; | 1796 | struct fd exe; |
1795 | struct dentry *dentry; | 1797 | struct inode *inode; |
1796 | int err; | 1798 | int err; |
1797 | 1799 | ||
1798 | exe = fdget(fd); | 1800 | exe = fdget(fd); |
1799 | if (!exe.file) | 1801 | if (!exe.file) |
1800 | return -EBADF; | 1802 | return -EBADF; |
1801 | 1803 | ||
1802 | dentry = exe.file->f_path.dentry; | 1804 | inode = file_inode(exe.file); |
1803 | 1805 | ||
1804 | /* | 1806 | /* |
1805 | * Because the original mm->exe_file points to executable file, make | 1807 | * Because the original mm->exe_file points to executable file, make |
@@ -1807,11 +1809,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1807 | * overall picture. | 1809 | * overall picture. |
1808 | */ | 1810 | */ |
1809 | err = -EACCES; | 1811 | err = -EACCES; |
1810 | if (!S_ISREG(dentry->d_inode->i_mode) || | 1812 | if (!S_ISREG(inode->i_mode) || |
1811 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) | 1813 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
1812 | goto exit; | 1814 | goto exit; |
1813 | 1815 | ||
1814 | err = inode_permission(dentry->d_inode, MAY_EXEC); | 1816 | err = inode_permission(inode, MAY_EXEC); |
1815 | if (err) | 1817 | if (err) |
1816 | goto exit; | 1818 | goto exit; |
1817 | 1819 | ||
@@ -2012,160 +2014,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2012 | 2014 | ||
2013 | error = 0; | 2015 | error = 0; |
2014 | switch (option) { | 2016 | switch (option) { |
2015 | case PR_SET_PDEATHSIG: | 2017 | case PR_SET_PDEATHSIG: |
2016 | if (!valid_signal(arg2)) { | 2018 | if (!valid_signal(arg2)) { |
2017 | error = -EINVAL; | 2019 | error = -EINVAL; |
2018 | break; | ||
2019 | } | ||
2020 | me->pdeath_signal = arg2; | ||
2021 | break; | ||
2022 | case PR_GET_PDEATHSIG: | ||
2023 | error = put_user(me->pdeath_signal, (int __user *)arg2); | ||
2024 | break; | ||
2025 | case PR_GET_DUMPABLE: | ||
2026 | error = get_dumpable(me->mm); | ||
2027 | break; | 2020 | break; |
2028 | case PR_SET_DUMPABLE: | 2021 | } |
2029 | if (arg2 < 0 || arg2 > 1) { | 2022 | me->pdeath_signal = arg2; |
2030 | error = -EINVAL; | 2023 | break; |
2031 | break; | 2024 | case PR_GET_PDEATHSIG: |
2032 | } | 2025 | error = put_user(me->pdeath_signal, (int __user *)arg2); |
2033 | set_dumpable(me->mm, arg2); | 2026 | break; |
2027 | case PR_GET_DUMPABLE: | ||
2028 | error = get_dumpable(me->mm); | ||
2029 | break; | ||
2030 | case PR_SET_DUMPABLE: | ||
2031 | if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { | ||
2032 | error = -EINVAL; | ||
2034 | break; | 2033 | break; |
2034 | } | ||
2035 | set_dumpable(me->mm, arg2); | ||
2036 | break; | ||
2035 | 2037 | ||
2036 | case PR_SET_UNALIGN: | 2038 | case PR_SET_UNALIGN: |
2037 | error = SET_UNALIGN_CTL(me, arg2); | 2039 | error = SET_UNALIGN_CTL(me, arg2); |
2038 | break; | 2040 | break; |
2039 | case PR_GET_UNALIGN: | 2041 | case PR_GET_UNALIGN: |
2040 | error = GET_UNALIGN_CTL(me, arg2); | 2042 | error = GET_UNALIGN_CTL(me, arg2); |
2041 | break; | 2043 | break; |
2042 | case PR_SET_FPEMU: | 2044 | case PR_SET_FPEMU: |
2043 | error = SET_FPEMU_CTL(me, arg2); | 2045 | error = SET_FPEMU_CTL(me, arg2); |
2044 | break; | 2046 | break; |
2045 | case PR_GET_FPEMU: | 2047 | case PR_GET_FPEMU: |
2046 | error = GET_FPEMU_CTL(me, arg2); | 2048 | error = GET_FPEMU_CTL(me, arg2); |
2047 | break; | 2049 | break; |
2048 | case PR_SET_FPEXC: | 2050 | case PR_SET_FPEXC: |
2049 | error = SET_FPEXC_CTL(me, arg2); | 2051 | error = SET_FPEXC_CTL(me, arg2); |
2050 | break; | 2052 | break; |
2051 | case PR_GET_FPEXC: | 2053 | case PR_GET_FPEXC: |
2052 | error = GET_FPEXC_CTL(me, arg2); | 2054 | error = GET_FPEXC_CTL(me, arg2); |
2053 | break; | 2055 | break; |
2054 | case PR_GET_TIMING: | 2056 | case PR_GET_TIMING: |
2055 | error = PR_TIMING_STATISTICAL; | 2057 | error = PR_TIMING_STATISTICAL; |
2056 | break; | 2058 | break; |
2057 | case PR_SET_TIMING: | 2059 | case PR_SET_TIMING: |
2058 | if (arg2 != PR_TIMING_STATISTICAL) | 2060 | if (arg2 != PR_TIMING_STATISTICAL) |
2059 | error = -EINVAL; | 2061 | error = -EINVAL; |
2060 | break; | 2062 | break; |
2061 | case PR_SET_NAME: | 2063 | case PR_SET_NAME: |
2062 | comm[sizeof(me->comm)-1] = 0; | 2064 | comm[sizeof(me->comm) - 1] = 0; |
2063 | if (strncpy_from_user(comm, (char __user *)arg2, | 2065 | if (strncpy_from_user(comm, (char __user *)arg2, |
2064 | sizeof(me->comm) - 1) < 0) | 2066 | sizeof(me->comm) - 1) < 0) |
2065 | return -EFAULT; | 2067 | return -EFAULT; |
2066 | set_task_comm(me, comm); | 2068 | set_task_comm(me, comm); |
2067 | proc_comm_connector(me); | 2069 | proc_comm_connector(me); |
2068 | break; | 2070 | break; |
2069 | case PR_GET_NAME: | 2071 | case PR_GET_NAME: |
2070 | get_task_comm(comm, me); | 2072 | get_task_comm(comm, me); |
2071 | if (copy_to_user((char __user *)arg2, comm, | 2073 | if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) |
2072 | sizeof(comm))) | 2074 | return -EFAULT; |
2073 | return -EFAULT; | 2075 | break; |
2074 | break; | 2076 | case PR_GET_ENDIAN: |
2075 | case PR_GET_ENDIAN: | 2077 | error = GET_ENDIAN(me, arg2); |
2076 | error = GET_ENDIAN(me, arg2); | 2078 | break; |
2077 | break; | 2079 | case PR_SET_ENDIAN: |
2078 | case PR_SET_ENDIAN: | 2080 | error = SET_ENDIAN(me, arg2); |
2079 | error = SET_ENDIAN(me, arg2); | 2081 | break; |
2080 | break; | 2082 | case PR_GET_SECCOMP: |
2081 | case PR_GET_SECCOMP: | 2083 | error = prctl_get_seccomp(); |
2082 | error = prctl_get_seccomp(); | 2084 | break; |
2083 | break; | 2085 | case PR_SET_SECCOMP: |
2084 | case PR_SET_SECCOMP: | 2086 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
2085 | error = prctl_set_seccomp(arg2, (char __user *)arg3); | 2087 | break; |
2086 | break; | 2088 | case PR_GET_TSC: |
2087 | case PR_GET_TSC: | 2089 | error = GET_TSC_CTL(arg2); |
2088 | error = GET_TSC_CTL(arg2); | 2090 | break; |
2089 | break; | 2091 | case PR_SET_TSC: |
2090 | case PR_SET_TSC: | 2092 | error = SET_TSC_CTL(arg2); |
2091 | error = SET_TSC_CTL(arg2); | 2093 | break; |
2092 | break; | 2094 | case PR_TASK_PERF_EVENTS_DISABLE: |
2093 | case PR_TASK_PERF_EVENTS_DISABLE: | 2095 | error = perf_event_task_disable(); |
2094 | error = perf_event_task_disable(); | 2096 | break; |
2095 | break; | 2097 | case PR_TASK_PERF_EVENTS_ENABLE: |
2096 | case PR_TASK_PERF_EVENTS_ENABLE: | 2098 | error = perf_event_task_enable(); |
2097 | error = perf_event_task_enable(); | 2099 | break; |
2098 | break; | 2100 | case PR_GET_TIMERSLACK: |
2099 | case PR_GET_TIMERSLACK: | 2101 | error = current->timer_slack_ns; |
2100 | error = current->timer_slack_ns; | 2102 | break; |
2101 | break; | 2103 | case PR_SET_TIMERSLACK: |
2102 | case PR_SET_TIMERSLACK: | 2104 | if (arg2 <= 0) |
2103 | if (arg2 <= 0) | 2105 | current->timer_slack_ns = |
2104 | current->timer_slack_ns = | ||
2105 | current->default_timer_slack_ns; | 2106 | current->default_timer_slack_ns; |
2106 | else | 2107 | else |
2107 | current->timer_slack_ns = arg2; | 2108 | current->timer_slack_ns = arg2; |
2108 | break; | 2109 | break; |
2109 | case PR_MCE_KILL: | 2110 | case PR_MCE_KILL: |
2110 | if (arg4 | arg5) | 2111 | if (arg4 | arg5) |
2111 | return -EINVAL; | 2112 | return -EINVAL; |
2112 | switch (arg2) { | 2113 | switch (arg2) { |
2113 | case PR_MCE_KILL_CLEAR: | 2114 | case PR_MCE_KILL_CLEAR: |
2114 | if (arg3 != 0) | 2115 | if (arg3 != 0) |
2115 | return -EINVAL; | ||
2116 | current->flags &= ~PF_MCE_PROCESS; | ||
2117 | break; | ||
2118 | case PR_MCE_KILL_SET: | ||
2119 | current->flags |= PF_MCE_PROCESS; | ||
2120 | if (arg3 == PR_MCE_KILL_EARLY) | ||
2121 | current->flags |= PF_MCE_EARLY; | ||
2122 | else if (arg3 == PR_MCE_KILL_LATE) | ||
2123 | current->flags &= ~PF_MCE_EARLY; | ||
2124 | else if (arg3 == PR_MCE_KILL_DEFAULT) | ||
2125 | current->flags &= | ||
2126 | ~(PF_MCE_EARLY|PF_MCE_PROCESS); | ||
2127 | else | ||
2128 | return -EINVAL; | ||
2129 | break; | ||
2130 | default: | ||
2131 | return -EINVAL; | 2116 | return -EINVAL; |
2132 | } | 2117 | current->flags &= ~PF_MCE_PROCESS; |
2133 | break; | 2118 | break; |
2134 | case PR_MCE_KILL_GET: | 2119 | case PR_MCE_KILL_SET: |
2135 | if (arg2 | arg3 | arg4 | arg5) | 2120 | current->flags |= PF_MCE_PROCESS; |
2136 | return -EINVAL; | 2121 | if (arg3 == PR_MCE_KILL_EARLY) |
2137 | if (current->flags & PF_MCE_PROCESS) | 2122 | current->flags |= PF_MCE_EARLY; |
2138 | error = (current->flags & PF_MCE_EARLY) ? | 2123 | else if (arg3 == PR_MCE_KILL_LATE) |
2139 | PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; | 2124 | current->flags &= ~PF_MCE_EARLY; |
2125 | else if (arg3 == PR_MCE_KILL_DEFAULT) | ||
2126 | current->flags &= | ||
2127 | ~(PF_MCE_EARLY|PF_MCE_PROCESS); | ||
2140 | else | 2128 | else |
2141 | error = PR_MCE_KILL_DEFAULT; | ||
2142 | break; | ||
2143 | case PR_SET_MM: | ||
2144 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
2145 | break; | ||
2146 | case PR_GET_TID_ADDRESS: | ||
2147 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2148 | break; | ||
2149 | case PR_SET_CHILD_SUBREAPER: | ||
2150 | me->signal->is_child_subreaper = !!arg2; | ||
2151 | break; | ||
2152 | case PR_GET_CHILD_SUBREAPER: | ||
2153 | error = put_user(me->signal->is_child_subreaper, | ||
2154 | (int __user *) arg2); | ||
2155 | break; | ||
2156 | case PR_SET_NO_NEW_PRIVS: | ||
2157 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2158 | return -EINVAL; | 2129 | return -EINVAL; |
2159 | |||
2160 | current->no_new_privs = 1; | ||
2161 | break; | 2130 | break; |
2162 | case PR_GET_NO_NEW_PRIVS: | ||
2163 | if (arg2 || arg3 || arg4 || arg5) | ||
2164 | return -EINVAL; | ||
2165 | return current->no_new_privs ? 1 : 0; | ||
2166 | default: | 2131 | default: |
2167 | error = -EINVAL; | 2132 | return -EINVAL; |
2168 | break; | 2133 | } |
2134 | break; | ||
2135 | case PR_MCE_KILL_GET: | ||
2136 | if (arg2 | arg3 | arg4 | arg5) | ||
2137 | return -EINVAL; | ||
2138 | if (current->flags & PF_MCE_PROCESS) | ||
2139 | error = (current->flags & PF_MCE_EARLY) ? | ||
2140 | PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; | ||
2141 | else | ||
2142 | error = PR_MCE_KILL_DEFAULT; | ||
2143 | break; | ||
2144 | case PR_SET_MM: | ||
2145 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
2146 | break; | ||
2147 | case PR_GET_TID_ADDRESS: | ||
2148 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2149 | break; | ||
2150 | case PR_SET_CHILD_SUBREAPER: | ||
2151 | me->signal->is_child_subreaper = !!arg2; | ||
2152 | break; | ||
2153 | case PR_GET_CHILD_SUBREAPER: | ||
2154 | error = put_user(me->signal->is_child_subreaper, | ||
2155 | (int __user *)arg2); | ||
2156 | break; | ||
2157 | case PR_SET_NO_NEW_PRIVS: | ||
2158 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2159 | return -EINVAL; | ||
2160 | |||
2161 | current->no_new_privs = 1; | ||
2162 | break; | ||
2163 | case PR_GET_NO_NEW_PRIVS: | ||
2164 | if (arg2 || arg3 || arg4 || arg5) | ||
2165 | return -EINVAL; | ||
2166 | return current->no_new_privs ? 1 : 0; | ||
2167 | default: | ||
2168 | error = -EINVAL; | ||
2169 | break; | ||
2169 | } | 2170 | } |
2170 | return error; | 2171 | return error; |
2171 | } | 2172 | } |
@@ -2184,11 +2185,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
2184 | 2185 | ||
2185 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 2186 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; |
2186 | 2187 | ||
2187 | static void argv_cleanup(struct subprocess_info *info) | ||
2188 | { | ||
2189 | argv_free(info->argv); | ||
2190 | } | ||
2191 | |||
2192 | static int __orderly_poweroff(void) | 2188 | static int __orderly_poweroff(void) |
2193 | { | 2189 | { |
2194 | int argc; | 2190 | int argc; |
@@ -2208,9 +2204,8 @@ static int __orderly_poweroff(void) | |||
2208 | } | 2204 | } |
2209 | 2205 | ||
2210 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, | 2206 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, |
2211 | NULL, argv_cleanup, NULL); | 2207 | NULL, NULL, NULL); |
2212 | if (ret == -ENOMEM) | 2208 | argv_free(argv); |
2213 | argv_free(argv); | ||
2214 | 2209 | ||
2215 | return ret; | 2210 | return ret; |
2216 | } | 2211 | } |
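
In the sys.c hunks above, __orderly_poweroff() no longer registers an argv_cleanup callback with call_usermodehelper_fns(); since UMH_WAIT_EXEC waits until the helper has been exec'ed, the argument vector is no longer referenced once the call returns, so the caller now frees argv unconditionally instead of only on -ENOMEM. A userspace sketch of that ownership rule, with a hypothetical run_helper_demo() standing in for the usermode-helper call:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical stand-in for call_usermodehelper_fns(..., UMH_WAIT_EXEC,
     * NULL, NULL, NULL): it may fail, and it never takes ownership of argv
     * now that no cleanup callback is passed. */
    static int run_helper_demo(char **argv)
    {
        printf("would exec %s\n", argv[0]);
        return 0;
    }

    int main(void)
    {
        char *argv[] = { strdup("/sbin/poweroff"), NULL };
        int ret;

        if (!argv[0])
            return 1;
        ret = run_helper_demo(argv);
        /* Unconditional cleanup: the old code freed argv only when
         * ret == -ENOMEM and relied on the callback otherwise. */
        free(argv[0]);
        return ret ? 1 : 0;
    }
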
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4fc9be955c71..afc1dc60f3f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,7 +105,6 @@ extern char core_pattern[]; | |||
105 | extern unsigned int core_pipe_limit; | 105 | extern unsigned int core_pipe_limit; |
106 | #endif | 106 | #endif |
107 | extern int pid_max; | 107 | extern int pid_max; |
108 | extern int min_free_kbytes; | ||
109 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
110 | extern int sysctl_drop_caches; | 109 | extern int sysctl_drop_caches; |
111 | extern int percpu_pagelist_fraction; | 110 | extern int percpu_pagelist_fraction; |
@@ -158,14 +157,20 @@ extern int sysctl_tsb_ratio; | |||
158 | 157 | ||
159 | #ifdef __hppa__ | 158 | #ifdef __hppa__ |
160 | extern int pwrsw_enabled; | 159 | extern int pwrsw_enabled; |
160 | #endif | ||
161 | |||
162 | #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW | ||
161 | extern int unaligned_enabled; | 163 | extern int unaligned_enabled; |
162 | #endif | 164 | #endif |
163 | 165 | ||
164 | #ifdef CONFIG_IA64 | 166 | #ifdef CONFIG_IA64 |
165 | extern int no_unaligned_warning; | ||
166 | extern int unaligned_dump_stack; | 167 | extern int unaligned_dump_stack; |
167 | #endif | 168 | #endif |
168 | 169 | ||
170 | #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN | ||
171 | extern int no_unaligned_warning; | ||
172 | #endif | ||
173 | |||
169 | #ifdef CONFIG_PROC_SYSCTL | 174 | #ifdef CONFIG_PROC_SYSCTL |
170 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 175 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
171 | void __user *buffer, size_t *lenp, loff_t *ppos); | 176 | void __user *buffer, size_t *lenp, loff_t *ppos); |
@@ -553,6 +558,8 @@ static struct ctl_table kern_table[] = { | |||
553 | .mode = 0644, | 558 | .mode = 0644, |
554 | .proc_handler = proc_dointvec, | 559 | .proc_handler = proc_dointvec, |
555 | }, | 560 | }, |
561 | #endif | ||
562 | #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW | ||
556 | { | 563 | { |
557 | .procname = "unaligned-trap", | 564 | .procname = "unaligned-trap", |
558 | .data = &unaligned_enabled, | 565 | .data = &unaligned_enabled, |
@@ -919,7 +926,7 @@ static struct ctl_table kern_table[] = { | |||
919 | .proc_handler = proc_doulongvec_minmax, | 926 | .proc_handler = proc_doulongvec_minmax, |
920 | }, | 927 | }, |
921 | #endif | 928 | #endif |
922 | #ifdef CONFIG_IA64 | 929 | #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN |
923 | { | 930 | { |
924 | .procname = "ignore-unaligned-usertrap", | 931 | .procname = "ignore-unaligned-usertrap", |
925 | .data = &no_unaligned_warning, | 932 | .data = &no_unaligned_warning, |
@@ -927,6 +934,8 @@ static struct ctl_table kern_table[] = { | |||
927 | .mode = 0644, | 934 | .mode = 0644, |
928 | .proc_handler = proc_dointvec, | 935 | .proc_handler = proc_dointvec, |
929 | }, | 936 | }, |
937 | #endif | ||
938 | #ifdef CONFIG_IA64 | ||
930 | { | 939 | { |
931 | .procname = "unaligned-dump-stack", | 940 | .procname = "unaligned-dump-stack", |
932 | .data = &unaligned_dump_stack, | 941 | .data = &unaligned_dump_stack, |
@@ -2014,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write, | |||
2014 | int i; | 2023 | int i; |
2015 | for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { | 2024 | for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { |
2016 | if ((tmptaint >> i) & 1) | 2025 | if ((tmptaint >> i) & 1) |
2017 | add_taint(i); | 2026 | add_taint(i, LOCKDEP_STILL_OK); |
2018 | } | 2027 | } |
2019 | } | 2028 | } |
2020 | 2029 | ||
@@ -2091,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
2091 | static void validate_coredump_safety(void) | 2100 | static void validate_coredump_safety(void) |
2092 | { | 2101 | { |
2093 | #ifdef CONFIG_COREDUMP | 2102 | #ifdef CONFIG_COREDUMP |
2094 | if (suid_dumpable == SUID_DUMPABLE_SAFE && | 2103 | if (suid_dumpable == SUID_DUMP_ROOT && |
2095 | core_pattern[0] != '/' && core_pattern[0] != '|') { | 2104 | core_pattern[0] != '/' && core_pattern[0] != '|') { |
2096 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | 2105 | printk(KERN_WARNING "Unsafe core_pattern used with "\ |
2097 | "suid_dumpable=2. Pipe handler or fully qualified "\ | 2106 | "suid_dumpable=2. Pipe handler or fully qualified "\ |
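
The sysctl.c hunks above stop keying the unaligned-access knobs off architecture symbols (__hppa__, CONFIG_IA64) and instead guard each ctl_table entry with its own CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW / CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN option, so any architecture can opt in. A tiny sketch of the pattern, using placeholder DEMO_* macros in place of the Kconfig symbols:

    #include <stdio.h>

    /* Build with e.g. cc -DDEMO_ARCH_UNALIGN_ALLOW to compile an entry in. */
    struct ctl_entry_demo { const char *procname; int *data; };

    static int unaligned_enabled_demo;
    static int no_unaligned_warning_demo;

    static struct ctl_entry_demo table_demo[] = {
    #ifdef DEMO_ARCH_UNALIGN_ALLOW
        { "unaligned-trap", &unaligned_enabled_demo },
    #endif
    #ifdef DEMO_ARCH_UNALIGN_NO_WARN
        { "ignore-unaligned-usertrap", &no_unaligned_warning_demo },
    #endif
        { NULL, NULL },                      /* table terminator */
    };

    int main(void)
    {
        for (struct ctl_entry_demo *e = table_demo; e->procname; e++)
            printf("registered: %s\n", e->procname);
        return 0;
    }
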
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a6384450501..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = { | |||
387 | { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, | 387 | { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, |
388 | { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, | 388 | { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, |
389 | { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, | 389 | { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, |
390 | { CTL_INT, NET_TCP_ABC, "tcp_abc" }, | ||
391 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, | 390 | { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, |
392 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, | 391 | { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, |
393 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, | 392 | { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, |
@@ -971,7 +970,6 @@ out: | |||
971 | static ssize_t bin_intvec(struct file *file, | 970 | static ssize_t bin_intvec(struct file *file, |
972 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 971 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
973 | { | 972 | { |
974 | mm_segment_t old_fs = get_fs(); | ||
975 | ssize_t copied = 0; | 973 | ssize_t copied = 0; |
976 | char *buffer; | 974 | char *buffer; |
977 | ssize_t result; | 975 | ssize_t result; |
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file, | |||
984 | if (oldval && oldlen) { | 982 | if (oldval && oldlen) { |
985 | unsigned __user *vec = oldval; | 983 | unsigned __user *vec = oldval; |
986 | size_t length = oldlen / sizeof(*vec); | 984 | size_t length = oldlen / sizeof(*vec); |
987 | loff_t pos = 0; | ||
988 | char *str, *end; | 985 | char *str, *end; |
989 | int i; | 986 | int i; |
990 | 987 | ||
991 | set_fs(KERNEL_DS); | 988 | result = kernel_read(file, 0, buffer, BUFSZ - 1); |
992 | result = vfs_read(file, buffer, BUFSZ - 1, &pos); | ||
993 | set_fs(old_fs); | ||
994 | if (result < 0) | 989 | if (result < 0) |
995 | goto out_kfree; | 990 | goto out_kfree; |
996 | 991 | ||
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file, | |||
1017 | if (newval && newlen) { | 1012 | if (newval && newlen) { |
1018 | unsigned __user *vec = newval; | 1013 | unsigned __user *vec = newval; |
1019 | size_t length = newlen / sizeof(*vec); | 1014 | size_t length = newlen / sizeof(*vec); |
1020 | loff_t pos = 0; | ||
1021 | char *str, *end; | 1015 | char *str, *end; |
1022 | int i; | 1016 | int i; |
1023 | 1017 | ||
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file, | |||
1033 | str += snprintf(str, end - str, "%lu\t", value); | 1027 | str += snprintf(str, end - str, "%lu\t", value); |
1034 | } | 1028 | } |
1035 | 1029 | ||
1036 | set_fs(KERNEL_DS); | 1030 | result = kernel_write(file, buffer, str - buffer, 0); |
1037 | result = vfs_write(file, buffer, str - buffer, &pos); | ||
1038 | set_fs(old_fs); | ||
1039 | if (result < 0) | 1031 | if (result < 0) |
1040 | goto out_kfree; | 1032 | goto out_kfree; |
1041 | } | 1033 | } |
@@ -1049,7 +1041,6 @@ out: | |||
1049 | static ssize_t bin_ulongvec(struct file *file, | 1041 | static ssize_t bin_ulongvec(struct file *file, |
1050 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1042 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
1051 | { | 1043 | { |
1052 | mm_segment_t old_fs = get_fs(); | ||
1053 | ssize_t copied = 0; | 1044 | ssize_t copied = 0; |
1054 | char *buffer; | 1045 | char *buffer; |
1055 | ssize_t result; | 1046 | ssize_t result; |
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file, | |||
1062 | if (oldval && oldlen) { | 1053 | if (oldval && oldlen) { |
1063 | unsigned long __user *vec = oldval; | 1054 | unsigned long __user *vec = oldval; |
1064 | size_t length = oldlen / sizeof(*vec); | 1055 | size_t length = oldlen / sizeof(*vec); |
1065 | loff_t pos = 0; | ||
1066 | char *str, *end; | 1056 | char *str, *end; |
1067 | int i; | 1057 | int i; |
1068 | 1058 | ||
1069 | set_fs(KERNEL_DS); | 1059 | result = kernel_read(file, 0, buffer, BUFSZ - 1); |
1070 | result = vfs_read(file, buffer, BUFSZ - 1, &pos); | ||
1071 | set_fs(old_fs); | ||
1072 | if (result < 0) | 1060 | if (result < 0) |
1073 | goto out_kfree; | 1061 | goto out_kfree; |
1074 | 1062 | ||
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file, | |||
1095 | if (newval && newlen) { | 1083 | if (newval && newlen) { |
1096 | unsigned long __user *vec = newval; | 1084 | unsigned long __user *vec = newval; |
1097 | size_t length = newlen / sizeof(*vec); | 1085 | size_t length = newlen / sizeof(*vec); |
1098 | loff_t pos = 0; | ||
1099 | char *str, *end; | 1086 | char *str, *end; |
1100 | int i; | 1087 | int i; |
1101 | 1088 | ||
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file, | |||
1111 | str += snprintf(str, end - str, "%lu\t", value); | 1098 | str += snprintf(str, end - str, "%lu\t", value); |
1112 | } | 1099 | } |
1113 | 1100 | ||
1114 | set_fs(KERNEL_DS); | 1101 | result = kernel_write(file, buffer, str - buffer, 0); |
1115 | result = vfs_write(file, buffer, str - buffer, &pos); | ||
1116 | set_fs(old_fs); | ||
1117 | if (result < 0) | 1102 | if (result < 0) |
1118 | goto out_kfree; | 1103 | goto out_kfree; |
1119 | } | 1104 | } |
@@ -1127,19 +1112,15 @@ out: | |||
1127 | static ssize_t bin_uuid(struct file *file, | 1112 | static ssize_t bin_uuid(struct file *file, |
1128 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1113 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
1129 | { | 1114 | { |
1130 | mm_segment_t old_fs = get_fs(); | ||
1131 | ssize_t result, copied = 0; | 1115 | ssize_t result, copied = 0; |
1132 | 1116 | ||
1133 | /* Only supports reads */ | 1117 | /* Only supports reads */ |
1134 | if (oldval && oldlen) { | 1118 | if (oldval && oldlen) { |
1135 | loff_t pos = 0; | ||
1136 | char buf[40], *str = buf; | 1119 | char buf[40], *str = buf; |
1137 | unsigned char uuid[16]; | 1120 | unsigned char uuid[16]; |
1138 | int i; | 1121 | int i; |
1139 | 1122 | ||
1140 | set_fs(KERNEL_DS); | 1123 | result = kernel_read(file, 0, buf, sizeof(buf) - 1); |
1141 | result = vfs_read(file, buf, sizeof(buf) - 1, &pos); | ||
1142 | set_fs(old_fs); | ||
1143 | if (result < 0) | 1124 | if (result < 0) |
1144 | goto out; | 1125 | goto out; |
1145 | 1126 | ||
@@ -1175,18 +1156,14 @@ out: | |||
1175 | static ssize_t bin_dn_node_address(struct file *file, | 1156 | static ssize_t bin_dn_node_address(struct file *file, |
1176 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) | 1157 | void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) |
1177 | { | 1158 | { |
1178 | mm_segment_t old_fs = get_fs(); | ||
1179 | ssize_t result, copied = 0; | 1159 | ssize_t result, copied = 0; |
1180 | 1160 | ||
1181 | if (oldval && oldlen) { | 1161 | if (oldval && oldlen) { |
1182 | loff_t pos = 0; | ||
1183 | char buf[15], *nodep; | 1162 | char buf[15], *nodep; |
1184 | unsigned long area, node; | 1163 | unsigned long area, node; |
1185 | __le16 dnaddr; | 1164 | __le16 dnaddr; |
1186 | 1165 | ||
1187 | set_fs(KERNEL_DS); | 1166 | result = kernel_read(file, 0, buf, sizeof(buf) - 1); |
1188 | result = vfs_read(file, buf, sizeof(buf) - 1, &pos); | ||
1189 | set_fs(old_fs); | ||
1190 | if (result < 0) | 1167 | if (result < 0) |
1191 | goto out; | 1168 | goto out; |
1192 | 1169 | ||
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1194 | 1171 | ||
1195 | /* Convert the decnet address to binary */ | 1172 | /* Convert the decnet address to binary */ |
1196 | result = -EIO; | 1173 | result = -EIO; |
1197 | nodep = strchr(buf, '.') + 1; | 1174 | nodep = strchr(buf, '.'); |
1198 | if (!nodep) | 1175 | if (!nodep) |
1199 | goto out; | 1176 | goto out; |
1177 | ++nodep; | ||
1200 | 1178 | ||
1201 | area = simple_strtoul(buf, NULL, 10); | 1179 | area = simple_strtoul(buf, NULL, 10); |
1202 | node = simple_strtoul(nodep, NULL, 10); | 1180 | node = simple_strtoul(nodep, NULL, 10); |
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1215 | } | 1193 | } |
1216 | 1194 | ||
1217 | if (newval && newlen) { | 1195 | if (newval && newlen) { |
1218 | loff_t pos = 0; | ||
1219 | __le16 dnaddr; | 1196 | __le16 dnaddr; |
1220 | char buf[15]; | 1197 | char buf[15]; |
1221 | int len; | 1198 | int len; |
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file, | |||
1232 | le16_to_cpu(dnaddr) >> 10, | 1209 | le16_to_cpu(dnaddr) >> 10, |
1233 | le16_to_cpu(dnaddr) & 0x3ff); | 1210 | le16_to_cpu(dnaddr) & 0x3ff); |
1234 | 1211 | ||
1235 | set_fs(KERNEL_DS); | 1212 | result = kernel_write(file, buf, len, 0); |
1236 | result = vfs_write(file, buf, len, &pos); | ||
1237 | set_fs(old_fs); | ||
1238 | if (result < 0) | 1213 | if (result < 0) |
1239 | goto out; | 1214 | goto out; |
1240 | } | 1215 | } |
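The hunks above all make the same conversion: instead of widening the address limit with set_fs(KERNEL_DS) so that vfs_read()/vfs_write() will accept a kernel buffer, the code calls kernel_read()/kernel_write(), which take the kernel buffer and the file offset directly. A minimal sketch of the resulting read pattern, using the same call shape as in the hunk (offset, buffer, length); this is an illustration only, with error handling reduced to the essentials:

	/* Sketch only: read a /proc/sys file into a kernel buffer and
	 * NUL-terminate it so the caller can parse it as text, much as the
	 * bin_*() helpers above go on to do. */
	static ssize_t read_sysctl_text(struct file *file, char *buf, size_t size)
	{
		ssize_t n;

		n = kernel_read(file, 0, buf, size - 1);  /* offset 0, kernel buffer */
		if (n < 0)
			return n;
		buf[n] = '\0';
		return n;
	}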
diff --git a/kernel/time.c b/kernel/time.c index c2a27dd93142..f8342a41efa6 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -240,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time); | |||
240 | * Avoid unnecessary multiplications/divisions in the | 240 | * Avoid unnecessary multiplications/divisions in the |
241 | * two most common HZ cases: | 241 | * two most common HZ cases: |
242 | */ | 242 | */ |
243 | inline unsigned int jiffies_to_msecs(const unsigned long j) | 243 | unsigned int jiffies_to_msecs(const unsigned long j) |
244 | { | 244 | { |
245 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 245 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
246 | return (MSEC_PER_SEC / HZ) * j; | 246 | return (MSEC_PER_SEC / HZ) * j; |
@@ -256,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j) | |||
256 | } | 256 | } |
257 | EXPORT_SYMBOL(jiffies_to_msecs); | 257 | EXPORT_SYMBOL(jiffies_to_msecs); |
258 | 258 | ||
259 | inline unsigned int jiffies_to_usecs(const unsigned long j) | 259 | unsigned int jiffies_to_usecs(const unsigned long j) |
260 | { | 260 | { |
261 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) | 261 | #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) |
262 | return (USEC_PER_SEC / HZ) * j; | 262 | return (USEC_PER_SEC / HZ) * j; |
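Dropping the inline keyword above does not change the arithmetic: when HZ divides evenly into MSEC_PER_SEC (HZ = 100, 250, 1000, ...) the conversion stays a single multiply. A small self-contained check of that fast path (illustrative only; HZ values that do not divide evenly fall back, outside this hunk, to the HZ_TO_MSEC_* constants generated by kernel/timeconst.bc further down):

	/* Illustrative only: the "MSEC_PER_SEC % HZ == 0" fast path above. */
	#include <assert.h>

	#define MSEC_PER_SEC 1000U

	static unsigned int j_to_ms(unsigned long j, unsigned int hz)
	{
		assert(MSEC_PER_SEC % hz == 0);      /* precondition of this branch */
		return (MSEC_PER_SEC / hz) * j;      /* HZ=1000 -> j, HZ=250 -> 4*j */
	}

	int main(void)
	{
		assert(j_to_ms(250, 250) == 1000);   /* 250 ticks at HZ=250  == 1 s    */
		assert(j_to_ms(100, 1000) == 100);   /* 100 ticks at HZ=1000 == 100 ms */
		return 0;
	}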
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 30b6de0d977c..c6d6400ee137 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev, | |||
339 | clockevents_config(dev, freq); | 339 | clockevents_config(dev, freq); |
340 | clockevents_register_device(dev); | 340 | clockevents_register_device(dev); |
341 | } | 341 | } |
342 | EXPORT_SYMBOL_GPL(clockevents_config_and_register); | ||
342 | 343 | ||
343 | /** | 344 | /** |
344 | * clockevents_update_freq - Update frequency and reprogram a clock event device. | 345 | * clockevents_update_freq - Update frequency and reprogram a clock event device. |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b10a42bb0165..072bb066bb7d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -23,7 +23,7 @@ | |||
23 | * NTP timekeeping variables: | 23 | * NTP timekeeping variables: |
24 | */ | 24 | */ |
25 | 25 | ||
26 | DEFINE_SPINLOCK(ntp_lock); | 26 | DEFINE_RAW_SPINLOCK(ntp_lock); |
27 | 27 | ||
28 | 28 | ||
29 | /* USER_HZ period (usecs): */ | 29 | /* USER_HZ period (usecs): */ |
@@ -348,7 +348,7 @@ void ntp_clear(void) | |||
348 | { | 348 | { |
349 | unsigned long flags; | 349 | unsigned long flags; |
350 | 350 | ||
351 | spin_lock_irqsave(&ntp_lock, flags); | 351 | raw_spin_lock_irqsave(&ntp_lock, flags); |
352 | 352 | ||
353 | time_adjust = 0; /* stop active adjtime() */ | 353 | time_adjust = 0; /* stop active adjtime() */ |
354 | time_status |= STA_UNSYNC; | 354 | time_status |= STA_UNSYNC; |
@@ -362,7 +362,7 @@ void ntp_clear(void) | |||
362 | 362 | ||
363 | /* Clear PPS state variables */ | 363 | /* Clear PPS state variables */ |
364 | pps_clear(); | 364 | pps_clear(); |
365 | spin_unlock_irqrestore(&ntp_lock, flags); | 365 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
366 | 366 | ||
367 | } | 367 | } |
368 | 368 | ||
@@ -372,9 +372,9 @@ u64 ntp_tick_length(void) | |||
372 | unsigned long flags; | 372 | unsigned long flags; |
373 | s64 ret; | 373 | s64 ret; |
374 | 374 | ||
375 | spin_lock_irqsave(&ntp_lock, flags); | 375 | raw_spin_lock_irqsave(&ntp_lock, flags); |
376 | ret = tick_length; | 376 | ret = tick_length; |
377 | spin_unlock_irqrestore(&ntp_lock, flags); | 377 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
378 | return ret; | 378 | return ret; |
379 | } | 379 | } |
380 | 380 | ||
@@ -395,7 +395,7 @@ int second_overflow(unsigned long secs) | |||
395 | int leap = 0; | 395 | int leap = 0; |
396 | unsigned long flags; | 396 | unsigned long flags; |
397 | 397 | ||
398 | spin_lock_irqsave(&ntp_lock, flags); | 398 | raw_spin_lock_irqsave(&ntp_lock, flags); |
399 | 399 | ||
400 | /* | 400 | /* |
401 | * Leap second processing. If in leap-insert state at the end of the | 401 | * Leap second processing. If in leap-insert state at the end of the |
@@ -479,7 +479,7 @@ int second_overflow(unsigned long secs) | |||
479 | time_adjust = 0; | 479 | time_adjust = 0; |
480 | 480 | ||
481 | out: | 481 | out: |
482 | spin_unlock_irqrestore(&ntp_lock, flags); | 482 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
483 | 483 | ||
484 | return leap; | 484 | return leap; |
485 | } | 485 | } |
@@ -672,7 +672,7 @@ int do_adjtimex(struct timex *txc) | |||
672 | 672 | ||
673 | getnstimeofday(&ts); | 673 | getnstimeofday(&ts); |
674 | 674 | ||
675 | spin_lock_irq(&ntp_lock); | 675 | raw_spin_lock_irq(&ntp_lock); |
676 | 676 | ||
677 | if (txc->modes & ADJ_ADJTIME) { | 677 | if (txc->modes & ADJ_ADJTIME) { |
678 | long save_adjust = time_adjust; | 678 | long save_adjust = time_adjust; |
@@ -714,7 +714,7 @@ int do_adjtimex(struct timex *txc) | |||
714 | /* fill PPS status fields */ | 714 | /* fill PPS status fields */ |
715 | pps_fill_timex(txc); | 715 | pps_fill_timex(txc); |
716 | 716 | ||
717 | spin_unlock_irq(&ntp_lock); | 717 | raw_spin_unlock_irq(&ntp_lock); |
718 | 718 | ||
719 | txc->time.tv_sec = ts.tv_sec; | 719 | txc->time.tv_sec = ts.tv_sec; |
720 | txc->time.tv_usec = ts.tv_nsec; | 720 | txc->time.tv_usec = ts.tv_nsec; |
@@ -912,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
912 | 912 | ||
913 | pts_norm = pps_normalize_ts(*phase_ts); | 913 | pts_norm = pps_normalize_ts(*phase_ts); |
914 | 914 | ||
915 | spin_lock_irqsave(&ntp_lock, flags); | 915 | raw_spin_lock_irqsave(&ntp_lock, flags); |
916 | 916 | ||
917 | /* clear the error bits, they will be set again if needed */ | 917 | /* clear the error bits, they will be set again if needed */ |
918 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); | 918 | time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); |
@@ -925,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
925 | * just start the frequency interval */ | 925 | * just start the frequency interval */ |
926 | if (unlikely(pps_fbase.tv_sec == 0)) { | 926 | if (unlikely(pps_fbase.tv_sec == 0)) { |
927 | pps_fbase = *raw_ts; | 927 | pps_fbase = *raw_ts; |
928 | spin_unlock_irqrestore(&ntp_lock, flags); | 928 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
929 | return; | 929 | return; |
930 | } | 930 | } |
931 | 931 | ||
@@ -940,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
940 | time_status |= STA_PPSJITTER; | 940 | time_status |= STA_PPSJITTER; |
941 | /* restart the frequency calibration interval */ | 941 | /* restart the frequency calibration interval */ |
942 | pps_fbase = *raw_ts; | 942 | pps_fbase = *raw_ts; |
943 | spin_unlock_irqrestore(&ntp_lock, flags); | 943 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
944 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 944 | pr_err("hardpps: PPSJITTER: bad pulse\n"); |
945 | return; | 945 | return; |
946 | } | 946 | } |
@@ -957,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
957 | 957 | ||
958 | hardpps_update_phase(pts_norm.nsec); | 958 | hardpps_update_phase(pts_norm.nsec); |
959 | 959 | ||
960 | spin_unlock_irqrestore(&ntp_lock, flags); | 960 | raw_spin_unlock_irqrestore(&ntp_lock, flags); |
961 | } | 961 | } |
962 | EXPORT_SYMBOL(hardpps); | 962 | EXPORT_SYMBOL(hardpps); |
963 | 963 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 314b9ee07edf..a19a39952c1b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -554,6 +554,7 @@ void tick_nohz_idle_enter(void) | |||
554 | 554 | ||
555 | local_irq_enable(); | 555 | local_irq_enable(); |
556 | } | 556 | } |
557 | EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); | ||
557 | 558 | ||
558 | /** | 559 | /** |
559 | * tick_nohz_irq_exit - update next tick event from interrupt exit | 560 | * tick_nohz_irq_exit - update next tick event from interrupt exit |
@@ -685,6 +686,7 @@ void tick_nohz_idle_exit(void) | |||
685 | 686 | ||
686 | local_irq_enable(); | 687 | local_irq_enable(); |
687 | } | 688 | } |
689 | EXPORT_SYMBOL_GPL(tick_nohz_idle_exit); | ||
688 | 690 | ||
689 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) | 691 | static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) |
690 | { | 692 | { |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e35515a875e..9a0bc98fbe1d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -138,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
138 | } | 138 | } |
139 | 139 | ||
140 | /* Timekeeper helper functions. */ | 140 | /* Timekeeper helper functions. */ |
141 | |||
142 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | ||
143 | u32 (*arch_gettimeoffset)(void); | ||
144 | |||
145 | u32 get_arch_timeoffset(void) | ||
146 | { | ||
147 | if (likely(arch_gettimeoffset)) | ||
148 | return arch_gettimeoffset(); | ||
149 | return 0; | ||
150 | } | ||
151 | #else | ||
152 | static inline u32 get_arch_timeoffset(void) { return 0; } | ||
153 | #endif | ||
154 | |||
141 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) | 155 | static inline s64 timekeeping_get_ns(struct timekeeper *tk) |
142 | { | 156 | { |
143 | cycle_t cycle_now, cycle_delta; | 157 | cycle_t cycle_now, cycle_delta; |
@@ -154,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk) | |||
154 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; | 168 | nsec = cycle_delta * tk->mult + tk->xtime_nsec; |
155 | nsec >>= tk->shift; | 169 | nsec >>= tk->shift; |
156 | 170 | ||
157 | /* If arch requires, add in gettimeoffset() */ | 171 | /* If arch requires, add in get_arch_timeoffset() */ |
158 | return nsec + arch_gettimeoffset(); | 172 | return nsec + get_arch_timeoffset(); |
159 | } | 173 | } |
160 | 174 | ||
161 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | 175 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) |
@@ -174,8 +188,8 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
174 | /* convert delta to nanoseconds. */ | 188 | /* convert delta to nanoseconds. */ |
175 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 189 | nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
176 | 190 | ||
177 | /* If arch requires, add in gettimeoffset() */ | 191 | /* If arch requires, add in get_arch_timeoffset() */ |
178 | return nsec + arch_gettimeoffset(); | 192 | return nsec + get_arch_timeoffset(); |
179 | } | 193 | } |
180 | 194 | ||
181 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | 195 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); |
@@ -257,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
257 | 271 | ||
258 | tk->xtime_nsec += cycle_delta * tk->mult; | 272 | tk->xtime_nsec += cycle_delta * tk->mult; |
259 | 273 | ||
260 | /* If arch requires, add in gettimeoffset() */ | 274 | /* If arch requires, add in get_arch_timeoffset() */ |
261 | tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; | 275 | tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; |
262 | 276 | ||
263 | tk_normalize_xtime(tk); | 277 | tk_normalize_xtime(tk); |
264 | 278 | ||
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc new file mode 100644 index 000000000000..511bdf2cafda --- /dev/null +++ b/kernel/timeconst.bc | |||
@@ -0,0 +1,108 @@ | |||
1 | scale=0 | ||
2 | |||
3 | define gcd(a,b) { | ||
4 | auto t; | ||
5 | while (b) { | ||
6 | t = b; | ||
7 | b = a % b; | ||
8 | a = t; | ||
9 | } | ||
10 | return a; | ||
11 | } | ||
12 | |||
13 | /* Division by reciprocal multiplication. */ | ||
14 | define fmul(b,n,d) { | ||
15 | return (2^b*n+d-1)/d; | ||
16 | } | ||
17 | |||
18 | /* Adjustment factor when a ceiling value is used. Use as: | ||
19 | (imul * n) + (fmulxx * n + fadjxx) >> xx) */ | ||
20 | define fadj(b,n,d) { | ||
21 | auto v; | ||
22 | d = d/gcd(n,d); | ||
23 | v = 2^b*(d-1)/d; | ||
24 | return v; | ||
25 | } | ||
26 | |||
27 | /* Compute the appropriate mul/adj values as well as a shift count, | ||
28 | which brings the mul value into the range 2^b-1 <= x < 2^b. Such | ||
29 | a shift value will be correct in the signed integer range and off | ||
30 | by at most one in the upper half of the unsigned range. */ | ||
31 | define fmuls(b,n,d) { | ||
32 | auto s, m; | ||
33 | for (s = 0; 1; s++) { | ||
34 | m = fmul(s,n,d); | ||
35 | if (m >= 2^(b-1)) | ||
36 | return s; | ||
37 | } | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | define timeconst(hz) { | ||
42 | print "/* Automatically generated by kernel/timeconst.bc */\n" | ||
43 | print "/* Time conversion constants for HZ == ", hz, " */\n" | ||
44 | print "\n" | ||
45 | |||
46 | print "#ifndef KERNEL_TIMECONST_H\n" | ||
47 | print "#define KERNEL_TIMECONST_H\n\n" | ||
48 | |||
49 | print "#include <linux/param.h>\n" | ||
50 | print "#include <linux/types.h>\n\n" | ||
51 | |||
52 | print "#if HZ != ", hz, "\n" | ||
53 | print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" | ||
54 | print "#endif\n\n" | ||
55 | |||
56 | if (hz < 2) { | ||
57 | print "#error Totally bogus HZ value!\n" | ||
58 | } else { | ||
59 | s=fmuls(32,1000,hz) | ||
60 | obase=16 | ||
61 | print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" | ||
62 | print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" | ||
63 | obase=10 | ||
64 | print "#define HZ_TO_MSEC_SHR32\t", s, "\n" | ||
65 | |||
66 | s=fmuls(32,hz,1000) | ||
67 | obase=16 | ||
68 | print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" | ||
69 | print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" | ||
70 | obase=10 | ||
71 | print "#define MSEC_TO_HZ_SHR32\t", s, "\n" | ||
72 | |||
73 | obase=10 | ||
74 | cd=gcd(hz,1000) | ||
75 | print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" | ||
76 | print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" | ||
77 | print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" | ||
78 | print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" | ||
79 | print "\n" | ||
80 | |||
81 | s=fmuls(32,1000000,hz) | ||
82 | obase=16 | ||
83 | print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" | ||
84 | print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" | ||
85 | obase=10 | ||
86 | print "#define HZ_TO_USEC_SHR32\t", s, "\n" | ||
87 | |||
88 | s=fmuls(32,hz,1000000) | ||
89 | obase=16 | ||
90 | print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" | ||
91 | print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" | ||
92 | obase=10 | ||
93 | print "#define USEC_TO_HZ_SHR32\t", s, "\n" | ||
94 | |||
95 | obase=10 | ||
96 | cd=gcd(hz,1000000) | ||
97 | print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" | ||
98 | print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" | ||
99 | print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" | ||
100 | print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" | ||
101 | print "\n" | ||
102 | |||
103 | print "#endif /* KERNEL_TIMECONST_H */\n" | ||
104 | } | ||
105 | halt | ||
106 | } | ||
107 | |||
108 | timeconst(hz) | ||
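The script above is the whole trick behind constant-HZ time conversion: for a conversion factor n/d it emits a multiplier MUL32 = ceil(2^s * n / d), a rounding term ADJ32, and a shift s chosen so that (x * MUL32 + ADJ32) >> s matches exact ceiling division of x * n / d, which per the fmuls() comment holds throughout the signed integer range. Below is a standalone C check of that identity for HZ=300, with fmul/fadj/fmuls transcribed from the bc functions above; this is an illustration only, not part of the patch, but for HZ=300 it reproduces the 0x9999999a / 0x1cccccccc / 33 triple visible in the canned table of the deleted perl script further down.

	/* Recompute MSEC_TO_HZ_{MUL32,ADJ32,SHR32} for HZ=300 and verify the
	 * reciprocal multiplication against exact ceiling division.
	 * (gcc/clang: uses the __uint128_t extension for the wide multiply.) */
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t fmul(int s, uint64_t n, uint64_t d)   /* ceil(2^s*n/d) */
	{
		return (uint64_t)((((__uint128_t)n << s) + d - 1) / d);
	}

	static uint64_t fadj(int s, uint64_t n, uint64_t d)   /* rounding term */
	{
		uint64_t g = d, a = n, t;

		while (a) { t = a; a = g % a; g = t; }        /* g = gcd(n, d) */
		d /= g;
		return (uint64_t)((((__uint128_t)(d - 1)) << s) / d);
	}

	int main(void)
	{
		const uint64_t hz = 300, msec = 1000;
		int s = 0;

		while (fmul(s, hz, msec) < (1ULL << 31))      /* fmuls(32, hz, 1000) */
			s++;

		uint64_t mul = fmul(s, hz, msec);
		uint64_t adj = fadj(s, hz, msec);

		for (uint64_t m = 0; m <= 1000000; m++) {
			uint64_t fast  = (uint64_t)(((__uint128_t)m * mul + adj) >> s);
			uint64_t exact = (m * hz + msec - 1) / msec;  /* ceil(m*HZ/1000) */
			if (fast != exact)
				printf("mismatch at m=%llu\n", (unsigned long long)m);
		}
		printf("MUL32=%#llx ADJ32=%#llx SHR32=%d\n",
		       (unsigned long long)mul, (unsigned long long)adj, s);
		return 0;
	}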
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl deleted file mode 100644 index 3f42652a6a37..000000000000 --- a/kernel/timeconst.pl +++ /dev/null | |||
@@ -1,376 +0,0 @@ | |||
1 | #!/usr/bin/perl | ||
2 | # ----------------------------------------------------------------------- | ||
3 | # | ||
4 | # Copyright 2007-2008 rPath, Inc. - All Rights Reserved | ||
5 | # | ||
6 | # This file is part of the Linux kernel, and is made available under | ||
7 | # the terms of the GNU General Public License version 2 or (at your | ||
8 | # option) any later version; incorporated herein by reference. | ||
9 | # | ||
10 | # ----------------------------------------------------------------------- | ||
11 | # | ||
12 | |||
13 | # | ||
14 | # Usage: timeconst.pl HZ > timeconst.h | ||
15 | # | ||
16 | |||
17 | # Precomputed values for systems without Math::BigInt | ||
18 | # Generated by: | ||
19 | # timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 | ||
20 | %canned_values = ( | ||
21 | 24 => [ | ||
22 | '0xa6aaaaab','0x2aaaaaa',26, | ||
23 | 125,3, | ||
24 | '0xc49ba5e4','0x1fbe76c8b4',37, | ||
25 | 3,125, | ||
26 | '0xa2c2aaab','0xaaaa',16, | ||
27 | 125000,3, | ||
28 | '0xc9539b89','0x7fffbce4217d',47, | ||
29 | 3,125000, | ||
30 | ], 32 => [ | ||
31 | '0xfa000000','0x6000000',27, | ||
32 | 125,4, | ||
33 | '0x83126e98','0xfdf3b645a',36, | ||
34 | 4,125, | ||
35 | '0xf4240000','0x0',17, | ||
36 | 31250,1, | ||
37 | '0x8637bd06','0x3fff79c842fa',46, | ||
38 | 1,31250, | ||
39 | ], 48 => [ | ||
40 | '0xa6aaaaab','0x6aaaaaa',27, | ||
41 | 125,6, | ||
42 | '0xc49ba5e4','0xfdf3b645a',36, | ||
43 | 6,125, | ||
44 | '0xa2c2aaab','0x15555',17, | ||
45 | 62500,3, | ||
46 | '0xc9539b89','0x3fffbce4217d',46, | ||
47 | 3,62500, | ||
48 | ], 64 => [ | ||
49 | '0xfa000000','0xe000000',28, | ||
50 | 125,8, | ||
51 | '0x83126e98','0x7ef9db22d',35, | ||
52 | 8,125, | ||
53 | '0xf4240000','0x0',18, | ||
54 | 15625,1, | ||
55 | '0x8637bd06','0x1fff79c842fa',45, | ||
56 | 1,15625, | ||
57 | ], 100 => [ | ||
58 | '0xa0000000','0x0',28, | ||
59 | 10,1, | ||
60 | '0xcccccccd','0x733333333',35, | ||
61 | 1,10, | ||
62 | '0x9c400000','0x0',18, | ||
63 | 10000,1, | ||
64 | '0xd1b71759','0x1fff2e48e8a7',45, | ||
65 | 1,10000, | ||
66 | ], 122 => [ | ||
67 | '0x8325c53f','0xfbcda3a',28, | ||
68 | 500,61, | ||
69 | '0xf9db22d1','0x7fbe76c8b',35, | ||
70 | 61,500, | ||
71 | '0x8012e2a0','0x3ef36',18, | ||
72 | 500000,61, | ||
73 | '0xffda4053','0x1ffffbce4217',45, | ||
74 | 61,500000, | ||
75 | ], 128 => [ | ||
76 | '0xfa000000','0x1e000000',29, | ||
77 | 125,16, | ||
78 | '0x83126e98','0x3f7ced916',34, | ||
79 | 16,125, | ||
80 | '0xf4240000','0x40000',19, | ||
81 | 15625,2, | ||
82 | '0x8637bd06','0xfffbce4217d',44, | ||
83 | 2,15625, | ||
84 | ], 200 => [ | ||
85 | '0xa0000000','0x0',29, | ||
86 | 5,1, | ||
87 | '0xcccccccd','0x333333333',34, | ||
88 | 1,5, | ||
89 | '0x9c400000','0x0',19, | ||
90 | 5000,1, | ||
91 | '0xd1b71759','0xfff2e48e8a7',44, | ||
92 | 1,5000, | ||
93 | ], 250 => [ | ||
94 | '0x80000000','0x0',29, | ||
95 | 4,1, | ||
96 | '0x80000000','0x180000000',33, | ||
97 | 1,4, | ||
98 | '0xfa000000','0x0',20, | ||
99 | 4000,1, | ||
100 | '0x83126e98','0x7ff7ced9168',43, | ||
101 | 1,4000, | ||
102 | ], 256 => [ | ||
103 | '0xfa000000','0x3e000000',30, | ||
104 | 125,32, | ||
105 | '0x83126e98','0x1fbe76c8b',33, | ||
106 | 32,125, | ||
107 | '0xf4240000','0xc0000',20, | ||
108 | 15625,4, | ||
109 | '0x8637bd06','0x7ffde7210be',43, | ||
110 | 4,15625, | ||
111 | ], 300 => [ | ||
112 | '0xd5555556','0x2aaaaaaa',30, | ||
113 | 10,3, | ||
114 | '0x9999999a','0x1cccccccc',33, | ||
115 | 3,10, | ||
116 | '0xd0555556','0xaaaaa',20, | ||
117 | 10000,3, | ||
118 | '0x9d495183','0x7ffcb923a29',43, | ||
119 | 3,10000, | ||
120 | ], 512 => [ | ||
121 | '0xfa000000','0x7e000000',31, | ||
122 | 125,64, | ||
123 | '0x83126e98','0xfdf3b645',32, | ||
124 | 64,125, | ||
125 | '0xf4240000','0x1c0000',21, | ||
126 | 15625,8, | ||
127 | '0x8637bd06','0x3ffef39085f',42, | ||
128 | 8,15625, | ||
129 | ], 1000 => [ | ||
130 | '0x80000000','0x0',31, | ||
131 | 1,1, | ||
132 | '0x80000000','0x0',31, | ||
133 | 1,1, | ||
134 | '0xfa000000','0x0',22, | ||
135 | 1000,1, | ||
136 | '0x83126e98','0x1ff7ced9168',41, | ||
137 | 1,1000, | ||
138 | ], 1024 => [ | ||
139 | '0xfa000000','0xfe000000',32, | ||
140 | 125,128, | ||
141 | '0x83126e98','0x7ef9db22',31, | ||
142 | 128,125, | ||
143 | '0xf4240000','0x3c0000',22, | ||
144 | 15625,16, | ||
145 | '0x8637bd06','0x1fff79c842f',41, | ||
146 | 16,15625, | ||
147 | ], 1200 => [ | ||
148 | '0xd5555556','0xd5555555',32, | ||
149 | 5,6, | ||
150 | '0x9999999a','0x66666666',31, | ||
151 | 6,5, | ||
152 | '0xd0555556','0x2aaaaa',22, | ||
153 | 2500,3, | ||
154 | '0x9d495183','0x1ffcb923a29',41, | ||
155 | 3,2500, | ||
156 | ] | ||
157 | ); | ||
158 | |||
159 | $has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; | ||
160 | |||
161 | sub bint($) | ||
162 | { | ||
163 | my($x) = @_; | ||
164 | return Math::BigInt->new($x); | ||
165 | } | ||
166 | |||
167 | # | ||
168 | # Constants for division by reciprocal multiplication. | ||
169 | # (bits, numerator, denominator) | ||
170 | # | ||
171 | sub fmul($$$) | ||
172 | { | ||
173 | my ($b,$n,$d) = @_; | ||
174 | |||
175 | $n = bint($n); | ||
176 | $d = bint($d); | ||
177 | |||
178 | return scalar (($n << $b)+$d-bint(1))/$d; | ||
179 | } | ||
180 | |||
181 | sub fadj($$$) | ||
182 | { | ||
183 | my($b,$n,$d) = @_; | ||
184 | |||
185 | $n = bint($n); | ||
186 | $d = bint($d); | ||
187 | |||
188 | $d = $d/bgcd($n, $d); | ||
189 | return scalar (($d-bint(1)) << $b)/$d; | ||
190 | } | ||
191 | |||
192 | sub fmuls($$$) { | ||
193 | my($b,$n,$d) = @_; | ||
194 | my($s,$m); | ||
195 | my($thres) = bint(1) << ($b-1); | ||
196 | |||
197 | $n = bint($n); | ||
198 | $d = bint($d); | ||
199 | |||
200 | for ($s = 0; 1; $s++) { | ||
201 | $m = fmul($s,$n,$d); | ||
202 | return $s if ($m >= $thres); | ||
203 | } | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | # Generate a hex value if the result fits in 64 bits; | ||
208 | # otherwise skip. | ||
209 | sub bignum_hex($) { | ||
210 | my($x) = @_; | ||
211 | my $s = $x->as_hex(); | ||
212 | |||
213 | return (length($s) > 18) ? undef : $s; | ||
214 | } | ||
215 | |||
216 | # Provides mul, adj, and shr factors for a specific | ||
217 | # (bit, time, hz) combination | ||
218 | sub muladj($$$) { | ||
219 | my($b, $t, $hz) = @_; | ||
220 | my $s = fmuls($b, $t, $hz); | ||
221 | my $m = fmul($s, $t, $hz); | ||
222 | my $a = fadj($s, $t, $hz); | ||
223 | return (bignum_hex($m), bignum_hex($a), $s); | ||
224 | } | ||
225 | |||
226 | # Provides numerator, denominator values | ||
227 | sub numden($$) { | ||
228 | my($n, $d) = @_; | ||
229 | my $g = bgcd($n, $d); | ||
230 | return ($n/$g, $d/$g); | ||
231 | } | ||
232 | |||
233 | # All values for a specific (time, hz) combo | ||
234 | sub conversions($$) { | ||
235 | my ($t, $hz) = @_; | ||
236 | my @val = (); | ||
237 | |||
238 | # HZ_TO_xx | ||
239 | push(@val, muladj(32, $t, $hz)); | ||
240 | push(@val, numden($t, $hz)); | ||
241 | |||
242 | # xx_TO_HZ | ||
243 | push(@val, muladj(32, $hz, $t)); | ||
244 | push(@val, numden($hz, $t)); | ||
245 | |||
246 | return @val; | ||
247 | } | ||
248 | |||
249 | sub compute_values($) { | ||
250 | my($hz) = @_; | ||
251 | my @val = (); | ||
252 | my $s, $m, $a, $g; | ||
253 | |||
254 | if (!$has_bigint) { | ||
255 | die "$0: HZ == $hz not canned and ". | ||
256 | "Math::BigInt not available\n"; | ||
257 | } | ||
258 | |||
259 | # MSEC conversions | ||
260 | push(@val, conversions(1000, $hz)); | ||
261 | |||
262 | # USEC conversions | ||
263 | push(@val, conversions(1000000, $hz)); | ||
264 | |||
265 | return @val; | ||
266 | } | ||
267 | |||
268 | sub outputval($$) | ||
269 | { | ||
270 | my($name, $val) = @_; | ||
271 | my $csuf; | ||
272 | |||
273 | if (defined($val)) { | ||
274 | if ($name !~ /SHR/) { | ||
275 | $val = "U64_C($val)"; | ||
276 | } | ||
277 | printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; | ||
278 | } | ||
279 | } | ||
280 | |||
281 | sub output($@) | ||
282 | { | ||
283 | my($hz, @val) = @_; | ||
284 | my $pfx, $bit, $suf, $s, $m, $a; | ||
285 | |||
286 | print "/* Automatically generated by kernel/timeconst.pl */\n"; | ||
287 | print "/* Conversion constants for HZ == $hz */\n"; | ||
288 | print "\n"; | ||
289 | print "#ifndef KERNEL_TIMECONST_H\n"; | ||
290 | print "#define KERNEL_TIMECONST_H\n"; | ||
291 | print "\n"; | ||
292 | |||
293 | print "#include <linux/param.h>\n"; | ||
294 | print "#include <linux/types.h>\n"; | ||
295 | |||
296 | print "\n"; | ||
297 | print "#if HZ != $hz\n"; | ||
298 | print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; | ||
299 | print "#endif\n"; | ||
300 | print "\n"; | ||
301 | |||
302 | foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', | ||
303 | 'HZ_TO_USEC','USEC_TO_HZ') { | ||
304 | foreach $bit (32) { | ||
305 | foreach $suf ('MUL', 'ADJ', 'SHR') { | ||
306 | outputval("${pfx}_$suf$bit", shift(@val)); | ||
307 | } | ||
308 | } | ||
309 | foreach $suf ('NUM', 'DEN') { | ||
310 | outputval("${pfx}_$suf", shift(@val)); | ||
311 | } | ||
312 | } | ||
313 | |||
314 | print "\n"; | ||
315 | print "#endif /* KERNEL_TIMECONST_H */\n"; | ||
316 | } | ||
317 | |||
318 | # Pretty-print Perl values | ||
319 | sub perlvals(@) { | ||
320 | my $v; | ||
321 | my @l = (); | ||
322 | |||
323 | foreach $v (@_) { | ||
324 | if (!defined($v)) { | ||
325 | push(@l, 'undef'); | ||
326 | } elsif ($v =~ /^0x/) { | ||
327 | push(@l, "\'".$v."\'"); | ||
328 | } else { | ||
329 | push(@l, $v.''); | ||
330 | } | ||
331 | } | ||
332 | return join(',', @l); | ||
333 | } | ||
334 | |||
335 | ($hz) = @ARGV; | ||
336 | |||
337 | # Use this to generate the %canned_values structure | ||
338 | if ($hz eq '--can') { | ||
339 | shift(@ARGV); | ||
340 | @hzlist = sort {$a <=> $b} (@ARGV); | ||
341 | |||
342 | print "# Precomputed values for systems without Math::BigInt\n"; | ||
343 | print "# Generated by:\n"; | ||
344 | print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; | ||
345 | print "\%canned_values = (\n"; | ||
346 | my $pf = "\t"; | ||
347 | foreach $hz (@hzlist) { | ||
348 | my @values = compute_values($hz); | ||
349 | print "$pf$hz => [\n"; | ||
350 | while (scalar(@values)) { | ||
351 | my $bit; | ||
352 | foreach $bit (32) { | ||
353 | my $m = shift(@values); | ||
354 | my $a = shift(@values); | ||
355 | my $s = shift(@values); | ||
356 | print "\t\t", perlvals($m,$a,$s), ",\n"; | ||
357 | } | ||
358 | my $n = shift(@values); | ||
359 | my $d = shift(@values); | ||
360 | print "\t\t", perlvals($n,$d), ",\n"; | ||
361 | } | ||
362 | print "\t]"; | ||
363 | $pf = ', '; | ||
364 | } | ||
365 | print "\n);\n"; | ||
366 | } else { | ||
367 | $hz += 0; # Force to number | ||
368 | if ($hz < 1) { | ||
369 | die "Usage: $0 HZ\n"; | ||
370 | } | ||
371 | |||
372 | $cv = $canned_values{$hz}; | ||
373 | @val = defined($cv) ? @$cv : compute_values($hz); | ||
374 | output($hz, @val); | ||
375 | } | ||
376 | exit 0; | ||
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b516a8e19d51..fc382d6e2765 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -81,21 +81,6 @@ config EVENT_TRACING | |||
81 | select CONTEXT_SWITCH_TRACER | 81 | select CONTEXT_SWITCH_TRACER |
82 | bool | 82 | bool |
83 | 83 | ||
84 | config EVENT_POWER_TRACING_DEPRECATED | ||
85 | depends on EVENT_TRACING | ||
86 | bool "Deprecated power event trace API, to be removed" | ||
87 | default y | ||
88 | help | ||
89 | Provides old power event types: | ||
90 | C-state/idle accounting events: | ||
91 | power:power_start | ||
92 | power:power_end | ||
93 | and old cpufreq accounting event: | ||
94 | power:power_frequency | ||
95 | This is for userspace compatibility | ||
96 | and will vanish after 5 kernel iterations, | ||
97 | namely 3.1. | ||
98 | |||
99 | config CONTEXT_SWITCH_TRACER | 84 | config CONTEXT_SWITCH_TRACER |
100 | bool | 85 | bool |
101 | 86 | ||
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 71259e2b6b61..9e5b8c272eec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
739 | struct request_queue *q, | 739 | struct request_queue *q, |
740 | struct request *rq) | 740 | struct request *rq) |
741 | { | 741 | { |
742 | struct blk_trace *bt = q->blk_trace; | ||
743 | |||
744 | /* if control ever passes through here, it's a request based driver */ | ||
745 | if (unlikely(bt && !bt->rq_based)) | ||
746 | bt->rq_based = true; | ||
747 | |||
742 | blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); | 748 | blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); |
743 | } | 749 | } |
744 | 750 | ||
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore, | |||
774 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); | 780 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
775 | } | 781 | } |
776 | 782 | ||
777 | static void blk_add_trace_bio_complete(void *ignore, | 783 | static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) |
778 | struct request_queue *q, struct bio *bio, | ||
779 | int error) | ||
780 | { | 784 | { |
785 | struct request_queue *q; | ||
786 | struct blk_trace *bt; | ||
787 | |||
788 | if (!bio->bi_bdev) | ||
789 | return; | ||
790 | |||
791 | q = bdev_get_queue(bio->bi_bdev); | ||
792 | bt = q->blk_trace; | ||
793 | |||
794 | /* | ||
795 | * Request based drivers will generate both rq and bio completions. | ||
796 | * Ignore bio ones. | ||
797 | */ | ||
798 | if (likely(!bt) || bt->rq_based) | ||
799 | return; | ||
800 | |||
781 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); | 801 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
782 | } | 802 | } |
783 | 803 | ||
784 | static void blk_add_trace_bio_backmerge(void *ignore, | 804 | static void blk_add_trace_bio_backmerge(void *ignore, |
785 | struct request_queue *q, | 805 | struct request_queue *q, |
806 | struct request *rq, | ||
786 | struct bio *bio) | 807 | struct bio *bio) |
787 | { | 808 | { |
788 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); | 809 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); |
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, | |||
790 | 811 | ||
791 | static void blk_add_trace_bio_frontmerge(void *ignore, | 812 | static void blk_add_trace_bio_frontmerge(void *ignore, |
792 | struct request_queue *q, | 813 | struct request_queue *q, |
814 | struct request *rq, | ||
793 | struct bio *bio) | 815 | struct bio *bio) |
794 | { | 816 | { |
795 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); | 817 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); |
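The comment in blk_add_trace_bio_complete() above carries the key reasoning: a request-based driver reports completion through the request hook, so the first request completion seen on a queue latches it as rq_based and its bio completions are ignored from then on, keeping each completion from appearing twice in the trace. Reduced to the bare decision (a sketch, not the kernel code; bt->rq_based is the flag set in the rq-complete hook above):

	/* Sketch only: should this bio completion be emitted into the trace? */
	static bool trace_bio_complete(const struct blk_trace *bt)
	{
		if (!bt)
			return false;        /* tracing not active on this queue  */
		return !bt->rq_based;        /* rq-based queues already traced it */
	}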
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e6effd0c40a9..6893d5a2bf08 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -762,7 +762,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) | |||
762 | { | 762 | { |
763 | struct ftrace_profile *rec; | 763 | struct ftrace_profile *rec; |
764 | struct hlist_head *hhd; | 764 | struct hlist_head *hhd; |
765 | struct hlist_node *n; | ||
766 | unsigned long key; | 765 | unsigned long key; |
767 | 766 | ||
768 | key = hash_long(ip, ftrace_profile_bits); | 767 | key = hash_long(ip, ftrace_profile_bits); |
@@ -771,7 +770,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) | |||
771 | if (hlist_empty(hhd)) | 770 | if (hlist_empty(hhd)) |
772 | return NULL; | 771 | return NULL; |
773 | 772 | ||
774 | hlist_for_each_entry_rcu(rec, n, hhd, node) { | 773 | hlist_for_each_entry_rcu(rec, hhd, node) { |
775 | if (rec->ip == ip) | 774 | if (rec->ip == ip) |
776 | return rec; | 775 | return rec; |
777 | } | 776 | } |
@@ -1133,7 +1132,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
1133 | unsigned long key; | 1132 | unsigned long key; |
1134 | struct ftrace_func_entry *entry; | 1133 | struct ftrace_func_entry *entry; |
1135 | struct hlist_head *hhd; | 1134 | struct hlist_head *hhd; |
1136 | struct hlist_node *n; | ||
1137 | 1135 | ||
1138 | if (ftrace_hash_empty(hash)) | 1136 | if (ftrace_hash_empty(hash)) |
1139 | return NULL; | 1137 | return NULL; |
@@ -1145,7 +1143,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | |||
1145 | 1143 | ||
1146 | hhd = &hash->buckets[key]; | 1144 | hhd = &hash->buckets[key]; |
1147 | 1145 | ||
1148 | hlist_for_each_entry_rcu(entry, n, hhd, hlist) { | 1146 | hlist_for_each_entry_rcu(entry, hhd, hlist) { |
1149 | if (entry->ip == ip) | 1147 | if (entry->ip == ip) |
1150 | return entry; | 1148 | return entry; |
1151 | } | 1149 | } |
@@ -1202,7 +1200,7 @@ remove_hash_entry(struct ftrace_hash *hash, | |||
1202 | static void ftrace_hash_clear(struct ftrace_hash *hash) | 1200 | static void ftrace_hash_clear(struct ftrace_hash *hash) |
1203 | { | 1201 | { |
1204 | struct hlist_head *hhd; | 1202 | struct hlist_head *hhd; |
1205 | struct hlist_node *tp, *tn; | 1203 | struct hlist_node *tn; |
1206 | struct ftrace_func_entry *entry; | 1204 | struct ftrace_func_entry *entry; |
1207 | int size = 1 << hash->size_bits; | 1205 | int size = 1 << hash->size_bits; |
1208 | int i; | 1206 | int i; |
@@ -1212,7 +1210,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) | |||
1212 | 1210 | ||
1213 | for (i = 0; i < size; i++) { | 1211 | for (i = 0; i < size; i++) { |
1214 | hhd = &hash->buckets[i]; | 1212 | hhd = &hash->buckets[i]; |
1215 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) | 1213 | hlist_for_each_entry_safe(entry, tn, hhd, hlist) |
1216 | free_hash_entry(hash, entry); | 1214 | free_hash_entry(hash, entry); |
1217 | } | 1215 | } |
1218 | FTRACE_WARN_ON(hash->count); | 1216 | FTRACE_WARN_ON(hash->count); |
@@ -1275,7 +1273,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
1275 | { | 1273 | { |
1276 | struct ftrace_func_entry *entry; | 1274 | struct ftrace_func_entry *entry; |
1277 | struct ftrace_hash *new_hash; | 1275 | struct ftrace_hash *new_hash; |
1278 | struct hlist_node *tp; | ||
1279 | int size; | 1276 | int size; |
1280 | int ret; | 1277 | int ret; |
1281 | int i; | 1278 | int i; |
@@ -1290,7 +1287,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
1290 | 1287 | ||
1291 | size = 1 << hash->size_bits; | 1288 | size = 1 << hash->size_bits; |
1292 | for (i = 0; i < size; i++) { | 1289 | for (i = 0; i < size; i++) { |
1293 | hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { | 1290 | hlist_for_each_entry(entry, &hash->buckets[i], hlist) { |
1294 | ret = add_hash_entry(new_hash, entry->ip); | 1291 | ret = add_hash_entry(new_hash, entry->ip); |
1295 | if (ret < 0) | 1292 | if (ret < 0) |
1296 | goto free_hash; | 1293 | goto free_hash; |
@@ -1316,7 +1313,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1316 | struct ftrace_hash **dst, struct ftrace_hash *src) | 1313 | struct ftrace_hash **dst, struct ftrace_hash *src) |
1317 | { | 1314 | { |
1318 | struct ftrace_func_entry *entry; | 1315 | struct ftrace_func_entry *entry; |
1319 | struct hlist_node *tp, *tn; | 1316 | struct hlist_node *tn; |
1320 | struct hlist_head *hhd; | 1317 | struct hlist_head *hhd; |
1321 | struct ftrace_hash *old_hash; | 1318 | struct ftrace_hash *old_hash; |
1322 | struct ftrace_hash *new_hash; | 1319 | struct ftrace_hash *new_hash; |
@@ -1362,7 +1359,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1362 | size = 1 << src->size_bits; | 1359 | size = 1 << src->size_bits; |
1363 | for (i = 0; i < size; i++) { | 1360 | for (i = 0; i < size; i++) { |
1364 | hhd = &src->buckets[i]; | 1361 | hhd = &src->buckets[i]; |
1365 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { | 1362 | hlist_for_each_entry_safe(entry, tn, hhd, hlist) { |
1366 | if (bits > 0) | 1363 | if (bits > 0) |
1367 | key = hash_long(entry->ip, bits); | 1364 | key = hash_long(entry->ip, bits); |
1368 | else | 1365 | else |
@@ -2901,7 +2898,6 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
2901 | { | 2898 | { |
2902 | struct ftrace_func_probe *entry; | 2899 | struct ftrace_func_probe *entry; |
2903 | struct hlist_head *hhd; | 2900 | struct hlist_head *hhd; |
2904 | struct hlist_node *n; | ||
2905 | unsigned long key; | 2901 | unsigned long key; |
2906 | 2902 | ||
2907 | key = hash_long(ip, FTRACE_HASH_BITS); | 2903 | key = hash_long(ip, FTRACE_HASH_BITS); |
@@ -2917,7 +2913,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, | |||
2917 | * on the hash. rcu_read_lock is too dangerous here. | 2913 | * on the hash. rcu_read_lock is too dangerous here. |
2918 | */ | 2914 | */ |
2919 | preempt_disable_notrace(); | 2915 | preempt_disable_notrace(); |
2920 | hlist_for_each_entry_rcu(entry, n, hhd, node) { | 2916 | hlist_for_each_entry_rcu(entry, hhd, node) { |
2921 | if (entry->ip == ip) | 2917 | if (entry->ip == ip) |
2922 | entry->ops->func(ip, parent_ip, &entry->data); | 2918 | entry->ops->func(ip, parent_ip, &entry->data); |
2923 | } | 2919 | } |
@@ -3068,7 +3064,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3068 | void *data, int flags) | 3064 | void *data, int flags) |
3069 | { | 3065 | { |
3070 | struct ftrace_func_probe *entry; | 3066 | struct ftrace_func_probe *entry; |
3071 | struct hlist_node *n, *tmp; | 3067 | struct hlist_node *tmp; |
3072 | char str[KSYM_SYMBOL_LEN]; | 3068 | char str[KSYM_SYMBOL_LEN]; |
3073 | int type = MATCH_FULL; | 3069 | int type = MATCH_FULL; |
3074 | int i, len = 0; | 3070 | int i, len = 0; |
@@ -3091,7 +3087,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
3091 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { | 3087 | for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { |
3092 | struct hlist_head *hhd = &ftrace_func_hash[i]; | 3088 | struct hlist_head *hhd = &ftrace_func_hash[i]; |
3093 | 3089 | ||
3094 | hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { | 3090 | hlist_for_each_entry_safe(entry, tmp, hhd, node) { |
3095 | 3091 | ||
3096 | /* break up if statements for readability */ | 3092 | /* break up if statements for readability */ |
3097 | if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) | 3093 | if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) |
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index f55fcf61b223..1c71382b283d 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
@@ -13,8 +13,5 @@ | |||
13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
15 | 15 | ||
16 | #ifdef EVENT_POWER_TRACING_DEPRECATED | ||
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); | ||
18 | #endif | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); |
20 | 17 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7244acde77b0..6989df2ba194 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -178,7 +178,7 @@ void tracing_off_permanent(void) | |||
178 | #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) | 178 | #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) |
179 | #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ | 179 | #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ |
180 | 180 | ||
181 | #if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 181 | #ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS |
182 | # define RB_FORCE_8BYTE_ALIGNMENT 0 | 182 | # define RB_FORCE_8BYTE_ALIGNMENT 0 |
183 | # define RB_ARCH_ALIGNMENT RB_ALIGNMENT | 183 | # define RB_ARCH_ALIGNMENT RB_ALIGNMENT |
184 | #else | 184 | #else |
@@ -186,6 +186,8 @@ void tracing_off_permanent(void) | |||
186 | # define RB_ARCH_ALIGNMENT 8U | 186 | # define RB_ARCH_ALIGNMENT 8U |
187 | #endif | 187 | #endif |
188 | 188 | ||
189 | #define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) | ||
190 | |||
189 | /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ | 191 | /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ |
190 | #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX | 192 | #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX |
191 | 193 | ||
@@ -334,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); | |||
334 | struct buffer_data_page { | 336 | struct buffer_data_page { |
335 | u64 time_stamp; /* page time stamp */ | 337 | u64 time_stamp; /* page time stamp */ |
336 | local_t commit; /* write committed index */ | 338 | local_t commit; /* write committed index */ |
337 | unsigned char data[]; /* data of buffer page */ | 339 | unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ |
338 | }; | 340 | }; |
339 | 341 | ||
340 | /* | 342 | /* |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d79602dc7..697e88d13907 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -739,12 +739,11 @@ static int task_state_char(unsigned long state) | |||
739 | struct trace_event *ftrace_find_event(int type) | 739 | struct trace_event *ftrace_find_event(int type) |
740 | { | 740 | { |
741 | struct trace_event *event; | 741 | struct trace_event *event; |
742 | struct hlist_node *n; | ||
743 | unsigned key; | 742 | unsigned key; |
744 | 743 | ||
745 | key = type & (EVENT_HASHSIZE - 1); | 744 | key = type & (EVENT_HASHSIZE - 1); |
746 | 745 | ||
747 | hlist_for_each_entry(event, n, &event_hash[key], node) { | 746 | hlist_for_each_entry(event, &event_hash[key], node) { |
748 | if (event->type == type) | 747 | if (event->type == type) |
749 | return event; | 748 | return event; |
750 | } | 749 | } |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d96ba22dabfa..0c05a4592047 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, | |||
192 | static struct tracepoint_entry *get_tracepoint(const char *name) | 192 | static struct tracepoint_entry *get_tracepoint(const char *name) |
193 | { | 193 | { |
194 | struct hlist_head *head; | 194 | struct hlist_head *head; |
195 | struct hlist_node *node; | ||
196 | struct tracepoint_entry *e; | 195 | struct tracepoint_entry *e; |
197 | u32 hash = jhash(name, strlen(name), 0); | 196 | u32 hash = jhash(name, strlen(name), 0); |
198 | 197 | ||
199 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; | 198 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; |
200 | hlist_for_each_entry(e, node, head, hlist) { | 199 | hlist_for_each_entry(e, head, hlist) { |
201 | if (!strcmp(name, e->name)) | 200 | if (!strcmp(name, e->name)) |
202 | return e; | 201 | return e; |
203 | } | 202 | } |
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name) | |||
211 | static struct tracepoint_entry *add_tracepoint(const char *name) | 210 | static struct tracepoint_entry *add_tracepoint(const char *name) |
212 | { | 211 | { |
213 | struct hlist_head *head; | 212 | struct hlist_head *head; |
214 | struct hlist_node *node; | ||
215 | struct tracepoint_entry *e; | 213 | struct tracepoint_entry *e; |
216 | size_t name_len = strlen(name) + 1; | 214 | size_t name_len = strlen(name) + 1; |
217 | u32 hash = jhash(name, name_len-1, 0); | 215 | u32 hash = jhash(name, name_len-1, 0); |
218 | 216 | ||
219 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; | 217 | head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; |
220 | hlist_for_each_entry(e, node, head, hlist) { | 218 | hlist_for_each_entry(e, head, hlist) { |
221 | if (!strcmp(name, e->name)) { | 219 | if (!strcmp(name, e->name)) { |
222 | printk(KERN_NOTICE | 220 | printk(KERN_NOTICE |
223 | "tracepoint %s busy\n", name); | 221 | "tracepoint %s busy\n", name); |
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 1744bb80f1fb..394f70b17162 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister); | |||
34 | void fire_user_return_notifiers(void) | 34 | void fire_user_return_notifiers(void) |
35 | { | 35 | { |
36 | struct user_return_notifier *urn; | 36 | struct user_return_notifier *urn; |
37 | struct hlist_node *tmp1, *tmp2; | 37 | struct hlist_node *tmp2; |
38 | struct hlist_head *head; | 38 | struct hlist_head *head; |
39 | 39 | ||
40 | head = &get_cpu_var(return_notifier_list); | 40 | head = &get_cpu_var(return_notifier_list); |
41 | hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) | 41 | hlist_for_each_entry_safe(urn, tmp2, head, link) |
42 | urn->on_user_return(urn); | 42 | urn->on_user_return(urn); |
43 | put_cpu_var(return_notifier_list); | 43 | put_cpu_var(return_notifier_list); |
44 | } | 44 | } |
diff --git a/kernel/user.c b/kernel/user.c index 33acb5e53a5f..e81978e8c03b 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -47,9 +47,7 @@ struct user_namespace init_user_ns = { | |||
47 | .count = 4294967295U, | 47 | .count = 4294967295U, |
48 | }, | 48 | }, |
49 | }, | 49 | }, |
50 | .kref = { | 50 | .count = ATOMIC_INIT(3), |
51 | .refcount = ATOMIC_INIT(3), | ||
52 | }, | ||
53 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
54 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
55 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
@@ -107,9 +105,8 @@ static void uid_hash_remove(struct user_struct *up) | |||
107 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) | 105 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) |
108 | { | 106 | { |
109 | struct user_struct *user; | 107 | struct user_struct *user; |
110 | struct hlist_node *h; | ||
111 | 108 | ||
112 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 109 | hlist_for_each_entry(user, hashent, uidhash_node) { |
113 | if (uid_eq(user->uid, uid)) { | 110 | if (uid_eq(user->uid, uid)) { |
114 | atomic_inc(&user->__count); | 111 | atomic_inc(&user->__count); |
115 | return user; | 112 | return user; |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2b042c42fbc4..b14f4d342043 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/ctype.h> | 22 | #include <linux/ctype.h> |
23 | #include <linux/projid.h> | 23 | #include <linux/projid.h> |
24 | #include <linux/fs_struct.h> | ||
24 | 25 | ||
25 | static struct kmem_cache *user_ns_cachep __read_mostly; | 26 | static struct kmem_cache *user_ns_cachep __read_mostly; |
26 | 27 | ||
@@ -78,7 +79,7 @@ int create_user_ns(struct cred *new) | |||
78 | return ret; | 79 | return ret; |
79 | } | 80 | } |
80 | 81 | ||
81 | kref_init(&ns->kref); | 82 | atomic_set(&ns->count, 1); |
82 | /* Leave the new->user_ns reference with the new user namespace. */ | 83 | /* Leave the new->user_ns reference with the new user namespace. */ |
83 | ns->parent = parent_ns; | 84 | ns->parent = parent_ns; |
84 | ns->owner = owner; | 85 | ns->owner = owner; |
@@ -104,15 +105,16 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | |||
104 | return create_user_ns(cred); | 105 | return create_user_ns(cred); |
105 | } | 106 | } |
106 | 107 | ||
107 | void free_user_ns(struct kref *kref) | 108 | void free_user_ns(struct user_namespace *ns) |
108 | { | 109 | { |
109 | struct user_namespace *parent, *ns = | 110 | struct user_namespace *parent; |
110 | container_of(kref, struct user_namespace, kref); | ||
111 | 111 | ||
112 | parent = ns->parent; | 112 | do { |
113 | proc_free_inum(ns->proc_inum); | 113 | parent = ns->parent; |
114 | kmem_cache_free(user_ns_cachep, ns); | 114 | proc_free_inum(ns->proc_inum); |
115 | put_user_ns(parent); | 115 | kmem_cache_free(user_ns_cachep, ns); |
116 | ns = parent; | ||
117 | } while (atomic_dec_and_test(&parent->count)); | ||
116 | } | 118 | } |
117 | EXPORT_SYMBOL(free_user_ns); | 119 | EXPORT_SYMBOL(free_user_ns); |
118 | 120 | ||
@@ -519,6 +521,42 @@ struct seq_operations proc_projid_seq_operations = { | |||
519 | .show = projid_m_show, | 521 | .show = projid_m_show, |
520 | }; | 522 | }; |
521 | 523 | ||
524 | static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) | ||
525 | { | ||
526 | u32 upper_first, lower_first, upper_last, lower_last; | ||
527 | unsigned idx; | ||
528 | |||
529 | upper_first = extent->first; | ||
530 | lower_first = extent->lower_first; | ||
531 | upper_last = upper_first + extent->count - 1; | ||
532 | lower_last = lower_first + extent->count - 1; | ||
533 | |||
534 | for (idx = 0; idx < new_map->nr_extents; idx++) { | ||
535 | u32 prev_upper_first, prev_lower_first; | ||
536 | u32 prev_upper_last, prev_lower_last; | ||
537 | struct uid_gid_extent *prev; | ||
538 | |||
539 | prev = &new_map->extent[idx]; | ||
540 | |||
541 | prev_upper_first = prev->first; | ||
542 | prev_lower_first = prev->lower_first; | ||
543 | prev_upper_last = prev_upper_first + prev->count - 1; | ||
544 | prev_lower_last = prev_lower_first + prev->count - 1; | ||
545 | |||
546 | /* Does the upper range intersect a previous extent? */ | ||
547 | if ((prev_upper_first <= upper_last) && | ||
548 | (prev_upper_last >= upper_first)) | ||
549 | return true; | ||
550 | |||
551 | /* Does the lower range intersect a previous extent? */ | ||
552 | if ((prev_lower_first <= lower_last) && | ||
553 | (prev_lower_last >= lower_first)) | ||
554 | return true; | ||
555 | } | ||
556 | return false; | ||
557 | } | ||
558 | |||
559 | |||
522 | static DEFINE_MUTEX(id_map_mutex); | 560 | static DEFINE_MUTEX(id_map_mutex); |
523 | 561 | ||
524 | static ssize_t map_write(struct file *file, const char __user *buf, | 562 | static ssize_t map_write(struct file *file, const char __user *buf, |
@@ -531,7 +569,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
531 | struct user_namespace *ns = seq->private; | 569 | struct user_namespace *ns = seq->private; |
532 | struct uid_gid_map new_map; | 570 | struct uid_gid_map new_map; |
533 | unsigned idx; | 571 | unsigned idx; |
534 | struct uid_gid_extent *extent, *last = NULL; | 572 | struct uid_gid_extent *extent = NULL; |
535 | unsigned long page = 0; | 573 | unsigned long page = 0; |
536 | char *kbuf, *pos, *next_line; | 574 | char *kbuf, *pos, *next_line; |
537 | ssize_t ret = -EINVAL; | 575 | ssize_t ret = -EINVAL; |
@@ -634,14 +672,11 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
634 | if ((extent->lower_first + extent->count) <= extent->lower_first) | 672 | if ((extent->lower_first + extent->count) <= extent->lower_first) |
635 | goto out; | 673 | goto out; |
636 | 674 | ||
637 | /* For now only accept extents that are strictly in order */ | 675 | /* Do the ranges in extent overlap any previous extents? */ |
638 | if (last && | 676 | if (mappings_overlap(&new_map, extent)) |
639 | (((last->first + last->count) > extent->first) || | ||
640 | ((last->lower_first + last->count) > extent->lower_first))) | ||
641 | goto out; | 677 | goto out; |
642 | 678 | ||
643 | new_map.nr_extents++; | 679 | new_map.nr_extents++; |
644 | last = extent; | ||
645 | 680 | ||
646 | /* Fail if the file contains too many extents */ | 681 | /* Fail if the file contains too many extents */ |
647 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && | 682 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && |
@@ -803,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) | |||
803 | if (atomic_read(¤t->mm->mm_users) > 1) | 838 | if (atomic_read(¤t->mm->mm_users) > 1) |
804 | return -EINVAL; | 839 | return -EINVAL; |
805 | 840 | ||
841 | if (current->fs->users != 1) | ||
842 | return -EINVAL; | ||
843 | |||
806 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | 844 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) |
807 | return -EPERM; | 845 | return -EPERM; |
808 | 846 | ||
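mappings_overlap() above replaces the old "strictly in order" restriction with an explicit pairwise check, so extents may now arrive in any order as long as neither their upper (namespace-side) nor their lower (parent-side) ranges intersect an earlier extent. The test itself is the standard closed-interval intersection predicate; a minimal illustration follows (not kernel code, and the sample map lines are invented for the example; the rejection mirrors the goto out path in map_write() above):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* Closed ranges [a_first, a_last] and [b_first, b_last] intersect
	 * exactly when each one starts no later than the other one ends. */
	static bool ranges_intersect(uint32_t a_first, uint32_t a_last,
				     uint32_t b_first, uint32_t b_last)
	{
		return a_first <= b_last && a_last >= b_first;
	}

	int main(void)
	{
		/* "0 100000 1000" then "1000 200000 1000": disjoint, accepted.  */
		assert(!ranges_intersect(0, 999, 1000, 1999));
		/* "0 100000 1000" then "500 300000 1000": upper ranges overlap,
		 * so the second extent would be rejected.                       */
		assert(ranges_intersect(0, 999, 500, 1499));
		return 0;
	}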
diff --git a/kernel/utsname.c b/kernel/utsname.c index 08b197e8c485..a47fc5de3113 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -30,7 +30,7 @@ static struct uts_namespace *create_uts_ns(void) | |||
30 | /* | 30 | /* |
31 | * Clone a new ns copying an original utsname, setting refcount to 1 | 31 | * Clone a new ns copying an original utsname, setting refcount to 1 |
32 | * @old_ns: namespace to clone | 32 | * @old_ns: namespace to clone |
33 | * Return NULL on error (failure to kmalloc), new ns otherwise | 33 | * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise |
34 | */ | 34 | */ |
35 | static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, | 35 | static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, |
36 | struct uts_namespace *old_ns) | 36 | struct uts_namespace *old_ns) |
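The corrected comment matches what the function actually returns; under the ERR_PTR() convention callers test the result with IS_ERR() and extract the code with PTR_ERR() instead of comparing against NULL. A minimal sketch of the caller side (the wrapper function here is illustrative, not the real call site):

    #include <linux/err.h>
    #include <linux/utsname.h>

    /* Illustrative only: how a caller consumes an ERR_PTR()-style return. */
    static long use_clone_result(struct uts_namespace *ns)
    {
            if (IS_ERR(ns))
                    return PTR_ERR(ns);     /* e.g. -ENOMEM */
            /* ... use ns ... */
            return 0;
    }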
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 63da38c2d820..4f69f9a5e221 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/wait.h> | 16 | #include <linux/wait.h> |
17 | 17 | ||
18 | #ifdef CONFIG_PROC_SYSCTL | ||
19 | |||
18 | static void *get_uts(ctl_table *table, int write) | 20 | static void *get_uts(ctl_table *table, int write) |
19 | { | 21 | { |
20 | char *which = table->data; | 22 | char *which = table->data; |
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which) | |||
38 | up_write(&uts_sem); | 40 | up_write(&uts_sem); |
39 | } | 41 | } |
40 | 42 | ||
41 | #ifdef CONFIG_PROC_SYSCTL | ||
42 | /* | 43 | /* |
43 | * Special case of dostring for the UTS structure. This has locks | 44 | * Special case of dostring for the UTS structure. This has locks |
44 | * to observe. Should this be in kernel/sys.c ???? | 45 | * to observe. Should this be in kernel/sys.c ???? |
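Moving the #ifdef up means get_uts()/put_uts() are only compiled when their sole user, the proc handler, is; otherwise a CONFIG_PROC_SYSCTL=n build warns about defined-but-unused statics. A schematic sketch of the pattern (names illustrative):

    #ifdef CONFIG_PROC_SYSCTL

    /* Helpers that only the proc handler needs sit inside the guard,
     * so they disappear together with their caller when the option is off. */
    static int helper_only_proc_needs(void)
    {
            return 0;
    }

    static int example_proc_handler(void)
    {
            return helper_only_proc_needs();
    }

    #endif /* CONFIG_PROC_SYSCTL */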
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 27689422aa92..4a944676358e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -113,9 +113,9 @@ static int get_softlockup_thresh(void) | |||
113 | * resolution, and we don't need to waste time with a big divide when | 113 | * resolution, and we don't need to waste time with a big divide when |
114 | * 2^30ns == 1.074s. | 114 | * 2^30ns == 1.074s. |
115 | */ | 115 | */ |
116 | static unsigned long get_timestamp(int this_cpu) | 116 | static unsigned long get_timestamp(void) |
117 | { | 117 | { |
118 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | 118 | return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ |
119 | } | 119 | } |
120 | 120 | ||
121 | static void set_sample_period(void) | 121 | static void set_sample_period(void) |
@@ -133,9 +133,7 @@ static void set_sample_period(void) | |||
133 | /* Commands for resetting the watchdog */ | 133 | /* Commands for resetting the watchdog */ |
134 | static void __touch_watchdog(void) | 134 | static void __touch_watchdog(void) |
135 | { | 135 | { |
136 | int this_cpu = smp_processor_id(); | 136 | __this_cpu_write(watchdog_touch_ts, get_timestamp()); |
137 | |||
138 | __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); | ||
139 | } | 137 | } |
140 | 138 | ||
141 | void touch_softlockup_watchdog(void) | 139 | void touch_softlockup_watchdog(void) |
@@ -196,7 +194,7 @@ static int is_hardlockup(void) | |||
196 | 194 | ||
197 | static int is_softlockup(unsigned long touch_ts) | 195 | static int is_softlockup(unsigned long touch_ts) |
198 | { | 196 | { |
199 | unsigned long now = get_timestamp(smp_processor_id()); | 197 | unsigned long now = get_timestamp(); |
200 | 198 | ||
201 | /* Warn about unreasonable delays: */ | 199 | /* Warn about unreasonable delays: */ |
202 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 200 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
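local_clock() already reads the clock of the executing CPU, so the explicit cpu argument became redundant; the right shift by 30 keeps the cheap nanoseconds-to-roughly-seconds conversion. A minimal sketch of just that conversion (the function name is illustrative, watchdog state is omitted):

    #include <linux/sched.h>        /* local_clock() */

    /* ~seconds since boot: 2^30 ns ~= 1.074 s, close enough for a
     * lockup-threshold comparison and it avoids a 64-bit divide. */
    static unsigned long coarse_seconds(void)
    {
            return local_clock() >> 30;
    }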
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f4feacad3812..55fac5b991b7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -251,8 +251,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); | |||
251 | for ((pool) = &std_worker_pools(cpu)[0]; \ | 251 | for ((pool) = &std_worker_pools(cpu)[0]; \ |
252 | (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) | 252 | (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) |
253 | 253 | ||
254 | #define for_each_busy_worker(worker, i, pos, pool) \ | 254 | #define for_each_busy_worker(worker, i, pool) \ |
255 | hash_for_each(pool->busy_hash, i, pos, worker, hentry) | 255 | hash_for_each(pool->busy_hash, i, worker, hentry) |
256 | 256 | ||
257 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | 257 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, |
258 | unsigned int sw) | 258 | unsigned int sw) |
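This tracks the hashtable API change in which hash_for_each() and friends dropped the separate struct hlist_node cursor and hand back the containing object directly. A generic sketch of the new iteration style (struct item, items and the key 42 are illustrative, not workqueue identifiers):

    #include <linux/hashtable.h>
    #include <linux/kernel.h>

    struct item {
            int key;
            struct hlist_node hentry;
    };

    static DEFINE_HASHTABLE(items, 6);      /* 2^6 buckets */

    static void walk_items(void)
    {
            struct item *it;
            int bkt;

            /* full walk: loop variable is the object itself, no cursor */
            hash_for_each(items, bkt, it, hentry)
                    pr_info("key=%d\n", it->key);

            /* keyed lookup: only the bucket that 42 hashes to is scanned */
            hash_for_each_possible(items, it, hentry, 42)
                    if (it->key == 42)
                            pr_info("found 42\n");
    }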
@@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool) | |||
457 | int ret; | 457 | int ret; |
458 | 458 | ||
459 | mutex_lock(&worker_pool_idr_mutex); | 459 | mutex_lock(&worker_pool_idr_mutex); |
460 | idr_pre_get(&worker_pool_idr, GFP_KERNEL); | 460 | ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); |
461 | ret = idr_get_new(&worker_pool_idr, pool, &pool->id); | 461 | if (ret >= 0) |
462 | pool->id = ret; | ||
462 | mutex_unlock(&worker_pool_idr_mutex); | 463 | mutex_unlock(&worker_pool_idr_mutex); |
463 | 464 | ||
464 | return ret; | 465 | return ret < 0 ? ret : 0; |
465 | } | 466 | } |
466 | 467 | ||
467 | /* | 468 | /* |
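idr_alloc() folds the old idr_pre_get()/idr_get_new() two-step into one call that returns the new id (>= 0) or a negative errno, which is why the function now copies the result into pool->id on success and reduces its return value to 0 or the error. A hedged sketch of the same pattern with illustrative names (my_idr, assign_id):

    #include <linux/idr.h>
    #include <linux/gfp.h>

    static DEFINE_IDR(my_idr);

    static int assign_id(void *obj, int *out_id)
    {
            int id;

            /* allocates an id and binds obj to it; end == 0 means no upper limit */
            id = idr_alloc(&my_idr, obj, 0, 0, GFP_KERNEL);
            if (id < 0)
                    return id;              /* -ENOMEM, -ENOSPC, ... */
            *out_id = id;
            return 0;
    }

When the allocation has to happen under a spinlock, the same API generation also offers idr_preload()/idr_preload_end(); here the call sits under a mutex, so passing GFP_KERNEL directly is fine.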
@@ -909,9 +910,8 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, | |||
909 | struct work_struct *work) | 910 | struct work_struct *work) |
910 | { | 911 | { |
911 | struct worker *worker; | 912 | struct worker *worker; |
912 | struct hlist_node *tmp; | ||
913 | 913 | ||
914 | hash_for_each_possible(pool->busy_hash, worker, tmp, hentry, | 914 | hash_for_each_possible(pool->busy_hash, worker, hentry, |
915 | (unsigned long)work) | 915 | (unsigned long)work) |
916 | if (worker->current_work == work && | 916 | if (worker->current_work == work && |
917 | worker->current_func == work->func) | 917 | worker->current_func == work->func) |
@@ -1626,7 +1626,6 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
1626 | static void rebind_workers(struct worker_pool *pool) | 1626 | static void rebind_workers(struct worker_pool *pool) |
1627 | { | 1627 | { |
1628 | struct worker *worker, *n; | 1628 | struct worker *worker, *n; |
1629 | struct hlist_node *pos; | ||
1630 | int i; | 1629 | int i; |
1631 | 1630 | ||
1632 | lockdep_assert_held(&pool->assoc_mutex); | 1631 | lockdep_assert_held(&pool->assoc_mutex); |
@@ -1648,7 +1647,7 @@ static void rebind_workers(struct worker_pool *pool) | |||
1648 | } | 1647 | } |
1649 | 1648 | ||
1650 | /* rebind busy workers */ | 1649 | /* rebind busy workers */ |
1651 | for_each_busy_worker(worker, i, pos, pool) { | 1650 | for_each_busy_worker(worker, i, pool) { |
1652 | struct work_struct *rebind_work = &worker->rebind_work; | 1651 | struct work_struct *rebind_work = &worker->rebind_work; |
1653 | struct workqueue_struct *wq; | 1652 | struct workqueue_struct *wq; |
1654 | 1653 | ||
@@ -3423,7 +3422,6 @@ static void wq_unbind_fn(struct work_struct *work) | |||
3423 | int cpu = smp_processor_id(); | 3422 | int cpu = smp_processor_id(); |
3424 | struct worker_pool *pool; | 3423 | struct worker_pool *pool; |
3425 | struct worker *worker; | 3424 | struct worker *worker; |
3426 | struct hlist_node *pos; | ||
3427 | int i; | 3425 | int i; |
3428 | 3426 | ||
3429 | for_each_std_worker_pool(pool, cpu) { | 3427 | for_each_std_worker_pool(pool, cpu) { |
@@ -3442,7 +3440,7 @@ static void wq_unbind_fn(struct work_struct *work) | |||
3442 | list_for_each_entry(worker, &pool->idle_list, entry) | 3440 | list_for_each_entry(worker, &pool->idle_list, entry) |
3443 | worker->flags |= WORKER_UNBOUND; | 3441 | worker->flags |= WORKER_UNBOUND; |
3444 | 3442 | ||
3445 | for_each_busy_worker(worker, i, pos, pool) | 3443 | for_each_busy_worker(worker, i, pool) |
3446 | worker->flags |= WORKER_UNBOUND; | 3444 | worker->flags |= WORKER_UNBOUND; |
3447 | 3445 | ||
3448 | pool->flags |= POOL_DISASSOCIATED; | 3446 | pool->flags |= POOL_DISASSOCIATED; |