39 files changed, 819 insertions, 599 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c5f081132a4..188c43223f52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o
-obj-$(CONFIG_SYSCTL) += sysctl_check.o
+obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 28fef6bf8534..13430176b3c9 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -272,7 +272,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
                return -EINVAL;
        watch = audit_init_watch(path);
-        if (unlikely(IS_ERR(watch)))
+        if (IS_ERR(watch))
                return PTR_ERR(watch);
        audit_get_watch(watch);
@@ -848,7 +848,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
                return ERR_PTR(-ENOMEM);
        new = audit_init_watch(path);
-        if (unlikely(IS_ERR(new))) {
+        if (IS_ERR(new)) {
                kfree(path);
                goto out;
        }
@@ -989,7 +989,7 @@ static void audit_update_watch(struct audit_parent *parent,
                        audit_set_auditable(current->audit_context);
                nwatch = audit_dupe_watch(owatch);
-                if (unlikely(IS_ERR(nwatch))) {
+                if (IS_ERR(nwatch)) {
                        mutex_unlock(&audit_filter_mutex);
                        audit_panic("error updating watch, skipping");
                        return;
@@ -1004,7 +1004,7 @@ static void audit_update_watch(struct audit_parent *parent,
                        list_del_rcu(&oentry->list);
                        nentry = audit_dupe_rule(&oentry->rule, nwatch);
-                        if (unlikely(IS_ERR(nentry)))
+                        if (IS_ERR(nentry))
                                audit_panic("error updating watch, removing");
                        else {
                                int h = audit_hash_ino((u32)ino);
@@ -1785,7 +1785,7 @@ int audit_update_lsm_rules(void)
                        watch = entry->rule.watch;
                        tree = entry->rule.tree;
                        nentry = audit_dupe_rule(&entry->rule, watch);
-                        if (unlikely(IS_ERR(nentry))) {
+                        if (IS_ERR(nentry)) {
                                /* save the first error encountered for the
                                 * return value */
                                if (!err)
diff --git a/kernel/bounds.c b/kernel/bounds.c
index c3c55544db2f..3c5301381837 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -8,11 +8,7 @@
 /* Include headers that define the enum constants of interest */
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
+#include <linux/kbuild.h>
-#define DEFINE(sym, val) \
-        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-#define BLANK() asm volatile("\n->" : : )
 void foo(void)
 {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d8de051382b..b9d467d83fc1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
 #include <linux/kmod.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
+#include <linux/hash.h>
 #include <asm/atomic.h>
@@ -118,17 +119,7 @@ static int root_count;
 * be called.
 */
 static int need_forkexit_callback;
+static int need_mm_owner_callback __read_mostly;
-/* bits in struct cgroup flags field */
-enum {
-        /* Control Group is dead */
-        CGRP_REMOVED,
-        /* Control Group has previously had a child cgroup or a task,
-         * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
-        CGRP_RELEASABLE,
-        /* Control Group requires release notifications to userspace */
-        CGRP_NOTIFY_ON_RELEASE,
-};
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link;
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
+/* Hash table of css_sets (cgroup groups), hashed on their subsystem
+ * state pointers. This speeds up finding an existing css_set */
+#define CSS_SET_HASH_BITS       7
+#define CSS_SET_TABLE_SIZE      (1 << CSS_SET_HASH_BITS)
+static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+/*
+ * css_set_hash - select the css_set_table bucket for a set of subsystem states
+ * @css: array of CGROUP_SUBSYS_COUNT subsystem state pointers
+ *
+ * Only the pointer values are hashed, so arrays holding the same
+ * pointers always land in the same bucket.  Returns the bucket head.
+ */
+static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+{
+        int i;
+        int index;
+        unsigned long tmp = 0UL;
+        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+                tmp += (unsigned long)css[i];
+        /* XOR in a copy shifted down by 16 bits before hashing */
+        tmp = (tmp >> 16) ^ tmp;
+        index = hash_long(tmp, CSS_SET_HASH_BITS);
+        return &css_set_table[index];
+}
 /* We don't maintain the lists running through each css_set to its
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
@@ -230,7 +242,7 @@ static int use_task_css_set_links;
 static void unlink_css_set(struct css_set *cg)
 {
        write_lock(&css_set_lock);
-        list_del(&cg->list);
+        hlist_del(&cg->hlist);
        css_set_count--;
        while (!list_empty(&cg->cg_links)) {
                struct cg_cgroup_link *link;
@@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
 /*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
- * css_set is suitable. This currently walks a linked-list for
+ * css_set is suitable.
- * simplicity; a later patch will use a hash table for better
- * performance
 *
 * oldcg: the cgroup group that we're using before the cgroup
 * transition
@@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set(
 {
        int i;
        struct cgroupfs_root *root = cgrp->root;
-        struct list_head *l = &init_css_set.list;
+        struct hlist_head *hhead;
+        struct hlist_node *node;
+        struct css_set *cg;
        /* Built the set of subsystem state objects that we want to
         * see in the new css_set */
@@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set(
                }
        }
-        /* Look through existing cgroup groups to find one to reuse */
+        hhead = css_set_hash(template);
-        do {
+        hlist_for_each_entry(cg, node, hhead, hlist) {
-                struct css_set *cg =
-                        list_entry(l, struct css_set, list);
                if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
                        /* All subsystems matched */
                        return cg;
                }
-                /* Try the next cgroup group */
+        }
-                l = l->next;
-        } while (l != &init_css_set.list);
        /* No existing cgroup group matched */
        return NULL;
@@ -404,6 +411,8 @@ static struct css_set *find_css_set(
        struct list_head tmp_cg_links;
        struct cg_cgroup_link *link;
+        struct hlist_head *hhead;
        /* First see if we already have a cgroup group that matches
         * the desired set */
        write_lock(&css_set_lock);
@@ -428,6 +437,7 @@ static struct css_set *find_css_set(
        kref_init(&res->ref);
        INIT_LIST_HEAD(&res->cg_links);
        INIT_LIST_HEAD(&res->tasks);
+        INIT_HLIST_NODE(&res->hlist);
        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
@@ -467,9 +477,12 @@ static struct css_set *find_css_set(
        BUG_ON(!list_empty(&tmp_cg_links));
-        /* Link this cgroup group into the list */
-        list_add(&res->list, &init_css_set.list);
        css_set_count++;
+        /* Add this cgroup group to the hash table */
+        hhead = css_set_hash(res->subsys);
+        hlist_add_head(&res->hlist, hhead);
        write_unlock(&css_set_lock);
        return res;
@@ -948,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        int ret = 0;
        struct super_block *sb;
        struct cgroupfs_root *root;
-        struct list_head tmp_cg_links, *l;
+        struct list_head tmp_cg_links;
        INIT_LIST_HEAD(&tmp_cg_links);
        /* First find the desired set of subsystems */
@@ -990,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                /* New superblock */
                struct cgroup *cgrp = &root->top_cgroup;
                struct inode *inode;
+                int i;
                BUG_ON(sb->s_root != NULL);
@@ -1034,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                /* Link the top cgroup in this hierarchy into all
                 * the css_set objects */
                write_lock(&css_set_lock);
-                l = &init_css_set.list;
+                for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
-                do {
+                        struct hlist_head *hhead = &css_set_table[i];
+                        struct hlist_node *node;
                        struct css_set *cg;
-                        struct cg_cgroup_link *link;
-                        cg = list_entry(l, struct css_set, list);
+                        hlist_for_each_entry(cg, node, hhead, hlist) {
-                        BUG_ON(list_empty(&tmp_cg_links));
+                                struct cg_cgroup_link *link;
-                        link = list_entry(tmp_cg_links.next,
-                                          struct cg_cgroup_link,
+                                BUG_ON(list_empty(&tmp_cg_links));
-                                          cgrp_link_list);
+                                link = list_entry(tmp_cg_links.next,
-                        list_del(&link->cgrp_link_list);
+                                                  struct cg_cgroup_link,
-                        link->cg = cg;
+                                                  cgrp_link_list);
-                        list_add(&link->cgrp_link_list,
+                                list_del(&link->cgrp_link_list);
-                                 &root->top_cgroup.css_sets);
+                                link->cg = cg;
-                        list_add(&link->cg_link_list, &cg->cg_links);
+                                list_add(&link->cgrp_link_list,
-                        l = l->next;
+                                         &root->top_cgroup.css_sets);
-                } while (l != &init_css_set.list);
+                                list_add(&link->cg_link_list, &cg->cg_links);
+                        }
+                }
                write_unlock(&css_set_lock);
                free_cg_links(&tmp_cg_links);
@@ -1307,18 +1324,16 @@ enum cgroup_filetype {
        FILE_DIR,
        FILE_TASKLIST,
        FILE_NOTIFY_ON_RELEASE,
-        FILE_RELEASABLE,
        FILE_RELEASE_AGENT,
 };
-static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
-                                 struct file *file,
+                                struct file *file,
-                                 const char __user *userbuf,
+                                const char __user *userbuf,
-                                 size_t nbytes, loff_t *unused_ppos)
+                                size_t nbytes, loff_t *unused_ppos)
 {
        char buffer[64];
        int retval = 0;
-        u64 val;
        char *end;
        if (!nbytes)
@@ -1329,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
                return -EFAULT;
        buffer[nbytes] = 0;     /* nul-terminate */
+        strstrip(buffer);
-        /* strip newline if necessary */
+        if (cft->write_u64) {
-        if (nbytes && (buffer[nbytes-1] == '\n'))
+                u64 val = simple_strtoull(buffer, &end, 0);
-                buffer[nbytes-1] = 0;
+                if (*end)
-        val = simple_strtoull(buffer, &end, 0);
+                        return -EINVAL;
-        if (*end)
+                retval = cft->write_u64(cgrp, cft, val);
-                return -EINVAL;
+        } else {
+                s64 val = simple_strtoll(buffer, &end, 0);
-        /* Pass to subsystem */
+                if (*end)
-        retval = cft->write_uint(cgrp, cft, val);
+                        return -EINVAL;
+                retval = cft->write_s64(cgrp, cft, val);
+        }
        if (!retval)
                retval = nbytes;
        return retval;
@@ -1419,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
                return -ENODEV;
        if (cft->write)
                return cft->write(cgrp, cft, file, buf, nbytes, ppos);
-        if (cft->write_uint)
+        if (cft->write_u64 || cft->write_s64)
-                return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
+                return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+        if (cft->trigger) {
+                int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+                return ret ? ret : nbytes;
+        }
        return -EINVAL;
 }
-static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
+static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
-                                   struct file *file,
+                               struct file *file,
-                                   char __user *buf, size_t nbytes,
+                               char __user *buf, size_t nbytes,
-                                   loff_t *ppos)
+                               loff_t *ppos)
 {
        char tmp[64];
-        u64 val = cft->read_uint(cgrp, cft);
+        u64 val = cft->read_u64(cgrp, cft);
        int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
+/*
+ * Read handler for control files that provide ->read_s64: fetch the
+ * value from the handler, format it as a signed decimal followed by a
+ * newline, and hand the part selected by @ppos/@nbytes to the user
+ * buffer via simple_read_from_buffer().  @file is unused.
+ */
+static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
+                               struct file *file,
+                               char __user *buf, size_t nbytes,
+                               loff_t *ppos)
+{
+        char tmp[64];
+        s64 val = cft->read_s64(cgrp, cft);
+        int len = sprintf(tmp, "%lld\n", (long long) val);
+        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
 static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
                                          struct cftype *cft,
                                          struct file *file,
@@ -1490,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
        if (cft->read)
                return cft->read(cgrp, cft, file, buf, nbytes, ppos);
-        if (cft->read_uint)
+        if (cft->read_u64)
-                return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
+                return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+        if (cft->read_s64)
+                return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
        return -EINVAL;
 }
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+struct cgroup_seqfile_state {
+        struct cftype *cft;             /* control file being read */
+        struct cgroup *cgroup;          /* cgroup taken from the file's parent dentry */
+};
+/*
+ * ->fill callback that cgroup_seqfile_show() hands to ->read_map
+ * handlers: emit one "key value" line into the seq_file carried in
+ * cb->state.  Returns whatever seq_printf() returns.
+ */
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+        struct seq_file *sf = cb->state;
+        return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+}
+/*
+ * seq_file show routine installed by cgroup_file_open() through
+ * single_open().  Files with ->read_map get a cgroup_map_cb that
+ * appends to @m; all others are rendered by ->read_seq_string.
+ * cgroup_file_open() only takes this path when one of the two
+ * handlers is set.  @arg is unused.
+ */
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+        struct cgroup_seqfile_state *state = m->private;
+        struct cftype *cft = state->cft;
+        if (cft->read_map) {
+                struct cgroup_map_cb cb = {
+                        .fill = cgroup_map_add,
+                        .state = m,
+                };
+                return cft->read_map(state->cgroup, cft, &cb);
+        }
+        return cft->read_seq_string(state->cgroup, cft, m);
+}
+/*
+ * Release routine for seqfile-backed control files: frees the
+ * cgroup_seqfile_state allocated in cgroup_file_open() and then lets
+ * single_release() undo the single_open().
+ */
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        /* Free our state before single_release() tears down the seq_file */
+        kfree(seq->private);
+        return single_release(inode, file);
+}
+/*
+ * File operations swapped into file->f_op by cgroup_file_open() for
+ * control files that supply ->read_map or ->read_seq_string.
+ */
+static struct file_operations cgroup_seqfile_operations = {
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = cgroup_seqfile_release,
+};
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
        int err;
@@ -1507,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
        cft = __d_cft(file->f_dentry);
        if (!cft)
                return -ENODEV;
-        if (cft->open)
+        if (cft->read_map || cft->read_seq_string) {
+                struct cgroup_seqfile_state *state =
+                        kzalloc(sizeof(*state), GFP_USER);
+                if (!state)
+                        return -ENOMEM;
+                state->cft = cft;
+                state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+                file->f_op = &cgroup_seqfile_operations;
+                err = single_open(file, cgroup_seqfile_show, state);
+                if (err < 0)
+                        kfree(state);
+        } else if (cft->open)
                err = cft->open(inode, file);
        else
                err = 0;
@@ -1715,7 +1804,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
 * The tasklist_lock is not held here, as do_each_thread() and
 * while_each_thread() are protected by RCU.
 */
-void cgroup_enable_task_cg_lists(void)
+static void cgroup_enable_task_cg_lists(void)
 {
        struct task_struct *p, *g;
        write_lock(&css_set_lock);
@@ -1913,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
        if (heap->size) {
                for (i = 0; i < heap->size; i++) {
-                        struct task_struct *p = heap->ptrs[i];
+                        struct task_struct *q = heap->ptrs[i];
                        if (i == 0) {
-                                latest_time = p->start_time;
+                                latest_time = q->start_time;
-                                latest_task = p;
+                                latest_task = q;
                        }
                        /* Process the task per the caller's callback */
-                        scan->process_task(p, scan);
+                        scan->process_task(q, scan);
-                        put_task_struct(p);
+                        put_task_struct(q);
                }
                /*
                 * If we had to process any tasks at all, scan again
@@ -2138,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
        return notify_on_release(cgrp);
 }
-static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
-{
-        return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
 /*
 * for the common functions, 'private' gives the type of file
 */
@@ -2158,16 +2242,10 @@ static struct cftype files[] = {
        {
                .name = "notify_on_release",
-                .read_uint = cgroup_read_notify_on_release,
+                .read_u64 = cgroup_read_notify_on_release,
                .write = cgroup_common_file_write,
                .private = FILE_NOTIFY_ON_RELEASE,
        },
-        {
-                .name = "releasable",
-                .read_uint = cgroup_read_releasable,
-                .private = FILE_RELEASABLE,
-        }
 };
 static struct cftype cft_release_agent = {
@@ -2401,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        return 0;
 }
-static void cgroup_init_subsys(struct cgroup_subsys *ss)
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
        struct cgroup_subsys_state *css;
-        struct list_head *l;
        printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
@@ -2415,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
        BUG_ON(IS_ERR(css));
        init_cgroup_css(css, ss, dummytop);
-        /* Update all cgroup groups to contain a subsys
+        /* Update the init_css_set to contain a subsys
         * pointer to this state - since the subsystem is
-         * newly registered, all tasks and hence all cgroup
+         * newly registered, all tasks and hence the
-         * groups are in the subsystem's top cgroup. */
+         * init_css_set are in the subsystem's top cgroup. */
-        write_lock(&css_set_lock);
+        init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
-        l = &init_css_set.list;
-        do {
-                struct css_set *cg =
-                        list_entry(l, struct css_set, list);
-                cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
-                l = l->next;
-        } while (l != &init_css_set.list);
-        write_unlock(&css_set_lock);
-        /* If this subsystem requested that it be notified with fork
-         * events, we should send it one now for every process in the
-         * system */
-        if (ss->fork) {
-                struct task_struct *g, *p;
-                read_lock(&tasklist_lock);
-                do_each_thread(g, p) {
-                        ss->fork(ss, p);
-                } while_each_thread(g, p);
-                read_unlock(&tasklist_lock);
-        }
        need_forkexit_callback |= ss->fork || ss->exit;
+        need_mm_owner_callback |= !!ss->mm_owner_changed;
+        /* At system boot, before all subsystems have been
+         * registered, no tasks have been forked, so we don't
+         * need to invoke fork callbacks here. */
+        BUG_ON(!list_empty(&init_task.tasks));
        ss->active = 1;
 }
@@ -2458,9 +2520,9 @@ int __init cgroup_init_early(void)
        int i;
        kref_init(&init_css_set.ref);
        kref_get(&init_css_set.ref);
-        INIT_LIST_HEAD(&init_css_set.list);
        INIT_LIST_HEAD(&init_css_set.cg_links);
        INIT_LIST_HEAD(&init_css_set.tasks);
+        INIT_HLIST_NODE(&init_css_set.hlist);
        css_set_count = 1;
        init_cgroup_root(&rootnode);
        list_add(&rootnode.root_list, &roots);
@@ -2473,6 +2535,9 @@ int __init cgroup_init_early(void)
        list_add(&init_css_set_link.cg_link_list,
                 &init_css_set.cg_links);
+        for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
+                INIT_HLIST_HEAD(&css_set_table[i]);
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
@@ -2502,7 +2567,7 @@ int __init cgroup_init(void)
 {
        int err;
        int i;
-        struct proc_dir_entry *entry;
+        struct hlist_head *hhead;
        err = bdi_init(&cgroup_backing_dev_info);
        if (err)
@@ -2514,13 +2579,15 @@ int __init cgroup_init(void)
                        cgroup_init_subsys(ss);
        }
+        /* Add init_css_set to the hash table */
+        hhead = css_set_hash(init_css_set.subsys);
+        hlist_add_head(&init_css_set.hlist, hhead);
        err = register_filesystem(&cgroup_fs_type);
        if (err < 0)
                goto out;
-        entry = create_proc_entry("cgroups", 0, NULL);
+        proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
-        if (entry)
-                entry->proc_fops = &proc_cgroupstats_operations;
 out:
        if (err)
@@ -2683,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
        }
 }
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @old: the previous owner
+ * @new: the new owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not
+ * invoke this routine, since it assigns the mm->owner the first time
+ * and does not change it.
+ *
+ * A subsystem's ->mm_owner_changed is invoked only when @old and @new
+ * are in different cgroups of that subsystem.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+        struct cgroup *oldcgrp, *newcgrp;
+        /* Set in cgroup_init_subsys() once any subsystem has ->mm_owner_changed */
+        if (need_mm_owner_callback) {
+                int i;
+                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                        struct cgroup_subsys *ss = subsys[i];
+                        oldcgrp = task_cgroup(old, ss->subsys_id);
+                        newcgrp = task_cgroup(new, ss->subsys_id);
+                        /* Same cgroup on both sides: nothing to report */
+                        if (oldcgrp == newcgrp)
+                                continue;
+                        if (ss->mm_owner_changed)
+                                ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+                }
+        }
+}
+#endif /* CONFIG_MM_OWNER */
 /**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
 /*
- * kernel/ccontainer_debug.c - Example cgroup subsystem that
+ * kernel/cgroup_debug.c - Example cgroup subsystem that
 * exposes debug info
 *
 * Copyright (C) Google Inc, 2007
@@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
        return count;
 }
+/*
+ * Backs the "releasable" debug file: reports whether CGRP_RELEASABLE
+ * is set in the cgroup's flags.  @cft is unused.
+ */
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+        return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
 static struct cftype files[] =  {
        {
                .name = "cgroup_refcount",
-                .read_uint = cgroup_refcount_read,
+                .read_u64 = cgroup_refcount_read,
        },
        {
                .name = "taskcount",
-                .read_uint = taskcount_read,
+                .read_u64 = taskcount_read,
        },
        {
                .name = "current_css_set",
-                .read_uint = current_css_set_read,
+                .read_u64 = current_css_set_read,
        },
        {
                .name = "current_css_set_refcount",
-                .read_uint = current_css_set_refcount_read,
+                .read_u64 = current_css_set_refcount_read,
        },
+        {
+                .name = "releasable",
+                .read_u64 = releasable_read,
+        }
 };
 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,12 +79,11 @@ static int __init ikconfig_init(void)
        struct proc_dir_entry *entry;
        /* create the current config file */
-        entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
+        entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
-                                  &proc_root);
+                            &ikconfig_file_ops);
        if (!entry)
                return -ENOMEM;
-        entry->proc_fops = &ikconfig_file_ops;
        entry->size = kernel_config_data_size;
        return 0;
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
 static void __exit ikconfig_cleanup(void)
 {
-        remove_proc_entry("config.gz", &proc_root);
+        remove_proc_entry("config.gz", NULL);
 }
 module_init(ikconfig_init);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2011ad8d2697..a98f6ab16ecd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
         * an ongoing cpu hotplug operation.
         */
        int refcount;
-        wait_queue_head_t writer_queue;
 } cpu_hotplug;
-#define writer_exists() (cpu_hotplug.active_writer != NULL)
 void __init cpu_hotplug_init(void)
 {
        cpu_hotplug.active_writer = NULL;
        mutex_init(&cpu_hotplug.lock);
        cpu_hotplug.refcount = 0;
-        init_waitqueue_head(&cpu_hotplug.writer_queue);
 }
 #ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
        if (cpu_hotplug.active_writer == current)
                return;
        mutex_lock(&cpu_hotplug.lock);
-        cpu_hotplug.refcount--;
+        if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
+                wake_up_process(cpu_hotplug.active_writer);
-        if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
-                wake_up(&cpu_hotplug.writer_queue);
        mutex_unlock(&cpu_hotplug.lock);
 }
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
 * Note that during a cpu-hotplug operation, the new readers, if any,
 * will be blocked by the cpu_hotplug.lock
 *
- * Since cpu_maps_update_begin is always called after invoking
+ * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin, we can be sure that only one writer is active.
+ * cpu_maps_update_begin(), we can be sure that only one writer is active.
 *
 * Note that theoretically, there is a possibility of a livelock:
 * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
 */
 static void cpu_hotplug_begin(void)
 {
-        DECLARE_WAITQUEUE(wait, current);
-        mutex_lock(&cpu_hotplug.lock);
        cpu_hotplug.active_writer = current;
-        add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
-        while (cpu_hotplug.refcount) {
+        for (;;) {
-                set_current_state(TASK_UNINTERRUPTIBLE);
+                mutex_lock(&cpu_hotplug.lock);
+                if (likely(!cpu_hotplug.refcount))
+                        break;
+                __set_current_state(TASK_UNINTERRUPTIBLE);
                mutex_unlock(&cpu_hotplug.lock);
                schedule();
-                mutex_lock(&cpu_hotplug.lock);
        }
-        remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
 }
 static void cpu_hotplug_done(void)
@@ -136,7 +126,7 @@ static void cpu_hotplug_done(void)
        mutex_unlock(&cpu_hotplug.lock);
 }
 /* Need to know about CPUs going up/down? */
-int __cpuinit register_cpu_notifier(struct notifier_block *nb)
+int __ref register_cpu_notifier(struct notifier_block *nb)
 {
        int ret;
        cpu_maps_update_begin();
@@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL(register_cpu_notifier);
-void unregister_cpu_notifier(struct notifier_block *nb)
+void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {
        cpu_maps_update_begin();
        raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -180,7 +170,7 @@ struct take_cpu_down_param {
 };
 /* Take this CPU down. */
-static int take_cpu_down(void *_param)
+static int __ref take_cpu_down(void *_param)
 {
        struct take_cpu_down_param *param = _param;
        int err;
@@ -199,7 +189,7 @@ static int take_cpu_down(void *_param)
 }
 /* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu, int tasks_frozen)
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
        int err, nr_calls = 0;
        struct task_struct *p;
@@ -274,7 +264,7 @@ out_release:
        return err;
 }
-int cpu_down(unsigned int cpu)
+int __ref cpu_down(unsigned int cpu)
 {
        int err = 0;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 48a976c52cf5..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
 typedef enum {
        CS_CPU_EXCLUSIVE,
        CS_MEM_EXCLUSIVE,
+        CS_MEM_HARDWALL,
        CS_MEMORY_MIGRATE,
        CS_SCHED_LOAD_BALANCE,
        CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
        return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
+/* Nonzero if the CS_MEM_HARDWALL flag bit is set on @cs */
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+        return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
 static inline int is_sched_load_balance(const struct cpuset *cs)
 {
        return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -735,7 +741,8 @@ static inline int started_after(void *p1, void *p2)
 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
 * words, if its mask is not equal to its cpuset's mask).
 */
-int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+static int cpuset_test_cpumask(struct task_struct *tsk,
+                               struct cgroup_scanner *scan)
 {
        return !cpus_equal(tsk->cpus_allowed,
                        (cgroup_cs(scan->cg))->cpus_allowed);
@@ -752,7 +759,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
-void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+static void cpuset_change_cpumask(struct task_struct *tsk,
+                                  struct cgroup_scanner *scan)
 {
        set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
 }
@@ -1023,19 +1031,6 @@ int current_cpuset_is_being_rebound(void)
        return task_cs(current) == cpuset_being_rebound;
 }
-/*
- * Call with cgroup_mutex held.
- */
-static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
-{
-        if (simple_strtoul(buf, NULL, 10) != 0)
-                cpuset_memory_pressure_enabled = 1;
-        else
-                cpuset_memory_pressure_enabled = 0;
-        return 0;
-}
 static int update_relax_domain_level(struct cpuset *cs, char *buf)
 {
        int val = simple_strtol(buf, NULL, 10);
@@ -1053,25 +1048,20 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
 /*
 * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ * bit:         the bit to update (see cpuset_flagbits_t)
- *                              CS_SCHED_LOAD_BALANCE,
+ * cs:          the cpuset to update
- *                              CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ * turning_on:  whether the flag is being set or cleared
- *                              CS_SPREAD_PAGE, CS_SPREAD_SLAB)
- * cs:  the cpuset to update
- * buf: the buffer where we read the 0 or 1
 *
 * Call with cgroup_mutex held.
 */
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+                       int turning_on)
 {
-        int turning_on;
        struct cpuset trialcs;
        int err;
        int cpus_nonempty, balance_flag_changed;
-        turning_on = (simple_strtoul(buf, NULL, 10) != 0);
        trialcs = *cs;
        if (turning_on)
                set_bit(bit, &trialcs.flags);
@@ -1241,6 +1231,7 @@ typedef enum {
        FILE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
+        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
@@ -1289,46 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
        case FILE_MEMLIST:
                retval = update_nodemask(cs, buffer);
                break;
+        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+                retval = update_relax_domain_level(cs, buffer);
+                break;
+        default:
+                retval = -EINVAL;
+                goto out2;
+        }
+        if (retval == 0)
+                retval = nbytes;
+out2:
+        cgroup_unlock();
+out1:
+        kfree(buffer);
+        return retval;
+}
+static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+        int retval = 0;
+        struct cpuset *cs = cgroup_cs(cgrp);
+        cpuset_filetype_t type = cft->private;
+        cgroup_lock();
+        if (cgroup_is_removed(cgrp)) {
+                cgroup_unlock();
+                return -ENODEV;
+        }
+        switch (type) {
        case FILE_CPU_EXCLUSIVE:
-                retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+                retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
                break;
        case FILE_MEM_EXCLUSIVE:
-                retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+                retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
                break;
-        case FILE_SCHED_LOAD_BALANCE:
+        case FILE_MEM_HARDWALL:
-                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+                retval = update_flag(CS_MEM_HARDWALL, cs, val);
                break;
-        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+        case FILE_SCHED_LOAD_BALANCE:
-                retval = update_relax_domain_level(cs, buffer);
+                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
                break;
        case FILE_MEMORY_MIGRATE:
-                retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+                retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
-                retval = update_memory_pressure_enabled(cs, buffer);
+                cpuset_memory_pressure_enabled = !!val;
                break;
        case FILE_MEMORY_PRESSURE:
                retval = -EACCES;
                break;
        case FILE_SPREAD_PAGE:
-                retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+                retval = update_flag(CS_SPREAD_PAGE, cs, val);
                cs->mems_generation = cpuset_mems_generation++;
                break;
        case FILE_SPREAD_SLAB:
-                retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+                retval = update_flag(CS_SPREAD_SLAB, cs, val);
                cs->mems_generation = cpuset_mems_generation++;
                break;
        default:
                retval = -EINVAL;
-                goto out2;
+                break;
        }
-        if (retval == 0)
-                retval = nbytes;
-out2:
        cgroup_unlock();
-out1:
-        kfree(buffer);
        return retval;
 }
@@ -1390,33 +1406,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
        case FILE_MEMLIST:
                s += cpuset_sprintf_memlist(s, cs);
                break;
-        case FILE_CPU_EXCLUSIVE:
-                *s++ = is_cpu_exclusive(cs) ? '1' : '0';
-                break;
-        case FILE_MEM_EXCLUSIVE:
-                *s++ = is_mem_exclusive(cs) ? '1' : '0';
-                break;
-        case FILE_SCHED_LOAD_BALANCE:
-                *s++ = is_sched_load_balance(cs) ? '1' : '0';
-                break;
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                s += sprintf(s, "%d", cs->relax_domain_level);
                break;
-        case FILE_MEMORY_MIGRATE:
-                *s++ = is_memory_migrate(cs) ? '1' : '0';
-                break;
-        case FILE_MEMORY_PRESSURE_ENABLED:
-                *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
-                break;
-        case FILE_MEMORY_PRESSURE:
-                s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
-                break;
-        case FILE_SPREAD_PAGE:
-                *s++ = is_spread_page(cs) ? '1' : '0';
-                break;
-        case FILE_SPREAD_SLAB:
-                *s++ = is_spread_slab(cs) ? '1' : '0';
-                break;
        default:
                retval = -EINVAL;
                goto out;
@@ -1429,121 +1421,137 @@ out:
        return retval;
 }
+/*
+ * cpuset_read_u64 - read handler for cpuset control files whose value is
+ * reported as a single number.
+ *
+ * cft->private selects the file (a cpuset_filetype_t).  The flag files
+ * return the result of the matching is_*() test on the cpuset,
+ * memory_pressure_enabled returns the global
+ * cpuset_memory_pressure_enabled, and memory_pressure returns
+ * fmeter_getrate() for the cpuset's fmeter.
+ *
+ * Any other file type reaches BUG(), so a cftype may only point its
+ * .read_u64 here if its type has a case below.
+ */
+static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+        struct cpuset *cs = cgroup_cs(cont);
+        cpuset_filetype_t type = cft->private;
+        switch (type) {
+        case FILE_CPU_EXCLUSIVE:
+                return is_cpu_exclusive(cs);
+        case FILE_MEM_EXCLUSIVE:
+                return is_mem_exclusive(cs);
+        case FILE_MEM_HARDWALL:
+                return is_mem_hardwall(cs);
+        case FILE_SCHED_LOAD_BALANCE:
+                return is_sched_load_balance(cs);
+        case FILE_MEMORY_MIGRATE:
+                return is_memory_migrate(cs);
+        case FILE_MEMORY_PRESSURE_ENABLED:
+                return cpuset_memory_pressure_enabled;
+        case FILE_MEMORY_PRESSURE:
+                return fmeter_getrate(&cs->fmeter);
+        case FILE_SPREAD_PAGE:
+                return is_spread_page(cs);
+        case FILE_SPREAD_SLAB:
+                return is_spread_slab(cs);
+        default:
+                BUG();
+        }
+}
 /*
 * for the common functions, 'private' gives the type of file
 */
-static struct cftype cft_cpus = {
+static struct cftype files[] = {
-        .name = "cpus",
+        {
-        .read = cpuset_common_file_read,
+                .name = "cpus",
-        .write = cpuset_common_file_write,
+                .read = cpuset_common_file_read,
-        .private = FILE_CPULIST,
+                .write = cpuset_common_file_write,
-};
+                .private = FILE_CPULIST,
+        },
-static struct cftype cft_mems = {
-        .name = "mems",
+        {
-        .read = cpuset_common_file_read,
+                .name = "mems",
-        .write = cpuset_common_file_write,
+                .read = cpuset_common_file_read,
-        .private = FILE_MEMLIST,
+                .write = cpuset_common_file_write,
-};
+                .private = FILE_MEMLIST,
+        },
-static struct cftype cft_cpu_exclusive = {
-        .name = "cpu_exclusive",
+        {
-        .read = cpuset_common_file_read,
+                .name = "cpu_exclusive",
-        .write = cpuset_common_file_write,
+                .read_u64 = cpuset_read_u64,
-        .private = FILE_CPU_EXCLUSIVE,
+                .write_u64 = cpuset_write_u64,
-};
+                .private = FILE_CPU_EXCLUSIVE,
+        },
-static struct cftype cft_mem_exclusive = {
-        .name = "mem_exclusive",
+        {
-        .read = cpuset_common_file_read,
+                .name = "mem_exclusive",
-        .write = cpuset_common_file_write,
+                .read_u64 = cpuset_read_u64,
-        .private = FILE_MEM_EXCLUSIVE,
+                .write_u64 = cpuset_write_u64,
-};
+                .private = FILE_MEM_EXCLUSIVE,
+        },
-static struct cftype cft_sched_load_balance = {
-        .name = "sched_load_balance",
+        {
-        .read = cpuset_common_file_read,
+                .name = "mem_hardwall",
-        .write = cpuset_common_file_write,
+                .read_u64 = cpuset_read_u64,
-        .private = FILE_SCHED_LOAD_BALANCE,
+                .write_u64 = cpuset_write_u64,
-};
+                .private = FILE_MEM_HARDWALL,
+        },
-static struct cftype cft_sched_relax_domain_level = {
-        .name = "sched_relax_domain_level",
+        {
-        .read = cpuset_common_file_read,
+                .name = "sched_load_balance",
-        .write = cpuset_common_file_write,
+                .read_u64 = cpuset_read_u64,
-        .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+                .write_u64 = cpuset_write_u64,
-};
+                .private = FILE_SCHED_LOAD_BALANCE,
+        },
-static struct cftype cft_memory_migrate = {
-        .name = "memory_migrate",
+        {
-        .read = cpuset_common_file_read,
+                .name = "sched_relax_domain_level",
+                /*
+                 * Handled by the common text read/write paths: the u64
+                 * handlers have no FILE_SCHED_RELAX_DOMAIN_LEVEL case
+                 * (read would hit BUG(), write would return -EINVAL).
+                 */
-        .write = cpuset_common_file_write,
+                .read = cpuset_common_file_read,
-        .private = FILE_MEMORY_MIGRATE,
+                .write = cpuset_common_file_write,
+                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+        },
+        {
+                .name = "memory_migrate",
+                .read_u64 = cpuset_read_u64,
+                .write_u64 = cpuset_write_u64,
+                .private = FILE_MEMORY_MIGRATE,
+        },
+        {
+                .name = "memory_pressure",
+                .read_u64 = cpuset_read_u64,
+                .write_u64 = cpuset_write_u64,
+                .private = FILE_MEMORY_PRESSURE,
+        },
+        {
+                .name = "memory_spread_page",
+                .read_u64 = cpuset_read_u64,
+                .write_u64 = cpuset_write_u64,
+                .private = FILE_SPREAD_PAGE,
+        },
+        {
+                .name = "memory_spread_slab",
+                .read_u64 = cpuset_read_u64,
+                .write_u64 = cpuset_write_u64,
+                .private = FILE_SPREAD_SLAB,
+        },
 };
 static struct cftype cft_memory_pressure_enabled = {
        .name = "memory_pressure_enabled",
-        .read = cpuset_common_file_read,
+        .read_u64 = cpuset_read_u64,
-        .write = cpuset_common_file_write,
+        .write_u64 = cpuset_write_u64,
        .private = FILE_MEMORY_PRESSURE_ENABLED,
 };
-static struct cftype cft_memory_pressure = {
-        .name = "memory_pressure",
-        .read = cpuset_common_file_read,
-        .write = cpuset_common_file_write,
-        .private = FILE_MEMORY_PRESSURE,
-};
-static struct cftype cft_spread_page = {
-        .name = "memory_spread_page",
-        .read = cpuset_common_file_read,
-        .write = cpuset_common_file_write,
-        .private = FILE_SPREAD_PAGE,
-};
-static struct cftype cft_spread_slab = {
-        .name = "memory_spread_slab",
-        .read = cpuset_common_file_read,
-        .write = cpuset_common_file_write,
-        .private = FILE_SPREAD_SLAB,
-};
 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
        int err;
-        if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
+        err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-                return err;
+        if (err)
-        if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss,
-                                        &cft_sched_relax_domain_level)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
-                return err;
-        if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
                return err;
        /* memory_pressure_enabled is in root cpuset only */
-        if (err == 0 && !cont->parent)
+        if (!cont->parent)
                err = cgroup_add_file(cont, ss,
-                                         &cft_memory_pressure_enabled);
+                                      &cft_memory_pressure_enabled);
-        return 0;
+        return err;
 }
 /*
@@ -1643,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
        cpuset_update_task_memory_state();
        if (is_sched_load_balance(cs))
-                update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
+                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
        number_of_cpusets--;
        kfree(cs);
@@ -1708,7 +1716,8 @@ int __init cpuset_init(void)
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 * Return nonzero to stop the walk through the tasks.
 */
-void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+static void cpuset_do_move_task(struct task_struct *tsk,
+                                struct cgroup_scanner *scan)
 {
        struct cpuset_hotplug_scanner *chsp;
@@ -1970,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 }
 /*
- * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
- * ancestor to the specified cpuset.  Call holding callback_mutex.
+ * mem_hardwall ancestor to the specified cpuset.  Call holding
- * If no ancestor is mem_exclusive (an unusual configuration), then
+ * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
- * returns the root cpuset.
+ * (an unusual configuration), then returns the root cpuset.
 */
-static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-        while (!is_mem_exclusive(cs) && cs->parent)
+        while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
                cs = cs->parent;
        return cs;
 }
@@ -1991,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
 * __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If it's not a
 * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * hardwalled cpuset ancestor to this tasks cpuset, yes.
 * If the task has been OOM killed and has access to memory reserves
 * as specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
@@ -2014,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed as is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest enclosing mem_exclusive ancestor cpuset.
+ * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2037,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
 *      in_interrupt - any node ok (current task context irrelevant)
 *      GFP_ATOMIC   - any node ok
 *      TIF_MEMDIE   - any node ok
- *      GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *      GFP_USER     - only nodes in current tasks mems allowed ok.
 *
 * Rule:
@@ -2074,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
        mutex_lock(&callback_mutex);
        task_lock(current);
-        cs = nearest_exclusive_ancestor(task_cs(current));
+        cs = nearest_hardwall_ancestor(task_cs(current));
        task_unlock(current);
        allowed = node_isset(node, cs->mems_allowed);
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
 static int __init proc_dma_init(void)
 {
-        struct proc_dir_entry *e;
+        proc_create("dma", 0, NULL, &proc_dma_operations);
-        e = create_proc_entry("dma", 0, NULL);
-        if (e)
-                e->proc_fops = &proc_dma_operations;
        return 0;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..ae0f2c4e452b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(exit_fs);
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting.  Decide whether mm needs a new owner: returns 1 only
+ * if mm is non-NULL, has more than one user and is currently owned by p;
+ * returns 0 otherwise.
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+        /*
+         * If there are other users of the mm and the owner (us) is exiting
+         * we need to find a new owner to take on the responsibility.
+         */
+        if (!mm)
+                return 0;
+        if (atomic_read(&mm->mm_users) <= 1)
+                return 0;
+        if (mm->owner != p)
+                return 0;
+        return 1;
+}
+/*
+ * mm_update_next_owner - hand mm->owner over to another task that uses mm
+ *
+ * Does nothing unless mm_need_new_owner() reports that current owns mm
+ * and other users remain.  Under tasklist_lock, looks for a task whose
+ * ->mm equals mm among current's children, then among its parent's
+ * children, then among every thread.  If none is found, returns with
+ * mm->owner unchanged.
+ *
+ * Once a candidate has been found and task_lock()ed, its ->mm is checked
+ * again; if it no longer points at mm the candidate is dropped and the
+ * whole search is retried.  Otherwise cgroup_mm_owner_callbacks() is
+ * called with the old and the new owner before mm->owner is updated.
+ */
+void mm_update_next_owner(struct mm_struct *mm)
+{
+        struct task_struct *c, *g, *p = current;
+retry:
+        if (!mm_need_new_owner(mm, p))
+                return;
+        read_lock(&tasklist_lock);
+        /*
+         * Search in the children
+         */
+        list_for_each_entry(c, &p->children, sibling) {
+                if (c->mm == mm)
+                        goto assign_new_owner;
+        }
+        /*
+         * Search in the siblings
+         */
+        list_for_each_entry(c, &p->parent->children, sibling) {
+                if (c->mm == mm)
+                        goto assign_new_owner;
+        }
+        /*
+         * Search through everything else. We should not get
+         * here often
+         */
+        do_each_thread(g, c) {
+                if (c->mm == mm)
+                        goto assign_new_owner;
+        } while_each_thread(g, c);
+        read_unlock(&tasklist_lock);
+        return;
+assign_new_owner:
+        BUG_ON(c == p);
+        get_task_struct(c);
+        /*
+         * The task_lock protects c->mm from changing.
+         * We always want mm->owner->mm == mm
+         */
+        task_lock(c);
+        /*
+         * Delay read_unlock() till we have the task_lock()
+         * to ensure that c does not slip away underneath us
+         */
+        read_unlock(&tasklist_lock);
+        if (c->mm != mm) {
+                task_unlock(c);
+                put_task_struct(c);
+                goto retry;
+        }
+        cgroup_mm_owner_callbacks(mm->owner, c);
+        mm->owner = c;
+        task_unlock(c);
+        put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
 /*
 * Turn us into a lazy TLB process if we
 * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
        /* We don't want this task to be frozen prematurely */
        clear_freeze_flag(tsk);
        task_unlock(tsk);
+        mm_update_next_owner(mm);
        mmput(mm);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..068ffe007529 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        mm->ioctx_list = NULL;
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
-        mm_init_cgroup(mm, p);
+        mm_init_owner(mm, p);
        if (likely(!mm_alloc_pgd(mm))) {
                mm->def_flags = 0;
                return mm;
        }
-        mm_free_cgroup(mm);
        free_mm(mm);
        return NULL;
 }
@@ -432,13 +431,13 @@ void mmput(struct mm_struct *mm)
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_aio(mm);
                exit_mmap(mm);
+                set_mm_exe_file(mm, NULL);
                if (!list_empty(&mm->mmlist)) {
                        spin_lock(&mmlist_lock);
                        list_del(&mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                put_swap_token(mm);
-                mm_free_cgroup(mm);
                mmdrop(mm);
        }
 }
@@ -545,6 +544,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        if (init_new_context(tsk, mm))
                goto fail_nocontext;
+        dup_mm_exe_file(oldmm, mm);
        err = dup_mmap(mm, oldmm);
        if (err)
                goto free_pt;
@@ -982,6 +983,13 @@ static void rt_mutex_init_task(struct task_struct *p)
 #endif
 }
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+        mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
 /*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
@@ -1664,18 +1672,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
 }
 /*
- * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
- * supported yet
- */
-static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
-{
-        if (unshare_flags & CLONE_SYSVSEM)
-                return -EINVAL;
-        return 0;
-}
-/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by do_fork() cannot be used here directly
@@ -1690,8 +1686,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
        struct sighand_struct *new_sigh = NULL;
        struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
        struct files_struct *fd, *new_fd = NULL;
-        struct sem_undo_list *new_ulist = NULL;
        struct nsproxy *new_nsproxy = NULL;
+        int do_sysvsem = 0;
        check_unshare_flags(&unshare_flags);
@@ -1703,6 +1699,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
                                CLONE_NEWNET))
                goto bad_unshare_out;
+        /*
+         * CLONE_NEWIPC must also detach from the undolist: after switching
+         * to a new ipc namespace, the semaphore arrays from the old
+         * namespace are unreachable.
+         */
+        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
+                do_sysvsem = 1;
        if ((err = unshare_thread(unshare_flags)))
                goto bad_unshare_out;
        if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1713,13 +1716,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
                goto bad_unshare_cleanup_sigh;
        if ((err = unshare_fd(unshare_flags, &new_fd)))
                goto bad_unshare_cleanup_vm;
-        if ((err = unshare_semundo(unshare_flags, &new_ulist)))
-                goto bad_unshare_cleanup_fd;
        if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                        new_fs)))
-                goto bad_unshare_cleanup_semundo;
+                goto bad_unshare_cleanup_fd;
-        if (new_fs ||  new_mm || new_fd || new_ulist || new_nsproxy) {
+        if (new_fs ||  new_mm || new_fd || do_sysvsem || new_nsproxy) {
+                if (do_sysvsem) {
+                        /*
+                         * CLONE_SYSVSEM is equivalent to sys_exit().
+                         */
+                        exit_sem(current);
+                }
                if (new_nsproxy) {
                        switch_task_namespaces(current, new_nsproxy);
@@ -1755,7 +1762,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
        if (new_nsproxy)
                put_nsproxy(new_nsproxy);
-bad_unshare_cleanup_semundo:
 bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/device.h>
+#include <linux/gfp.h>
 /*
 * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46e4ad1723f0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
+#include <linux/slab.h>
 #include "internals.h"
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
 static int __init kallsyms_init(void)
 {
-        struct proc_dir_entry *entry;
+        proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
-        entry = create_proc_entry("kallsyms", 0444, NULL);
-        if (entry)
-                entry->proc_fops = &kallsyms_operations;
        return 0;
 }
 __initcall(kallsyms_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 92cf6930ab51..ac72eea48339 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
        spin_lock(&kthread_create_lock);
        list_add_tail(&create.list, &kthread_create_list);
-        wake_up_process(kthreadd_task);
        spin_unlock(&kthread_create_lock);
+        wake_up_process(kthreadd_task);
        wait_for_completion(&create.done);
        if (!IS_ERR(create.result)) {
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7c74dab0d21b..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -233,14 +233,7 @@ static struct file_operations lstats_fops = {
 static int __init init_lstats_procfs(void)
 {
-        struct proc_dir_entry *pe;
+        proc_create("latency_stats", 0644, NULL, &lstats_fops);
-        pe = create_proc_entry("latency_stats", 0644, NULL);
-        if (!pe)
-                return -ENOMEM;
-        pe->proc_fops = &lstats_fops;
        return 0;
 }
 __initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
 static int __init lockdep_proc_init(void)
 {
-        struct proc_dir_entry *entry;
+        proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+        proc_create("lockdep_stats", S_IRUSR, NULL,
-        entry = create_proc_entry("lockdep", S_IRUSR, NULL);
+                    &proc_lockdep_stats_operations);
-        if (entry)
-                entry->proc_fops = &proc_lockdep_operations;
-        entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
-        if (entry)
-                entry->proc_fops = &proc_lockdep_stats_operations;
 #ifdef CONFIG_LOCK_STAT
-        entry = create_proc_entry("lock_stat", S_IRUSR, NULL);
+        proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
-        if (entry)
-                entry->proc_fops = &proc_lock_stat_operations;
 #endif
        return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
index 005b95954593..139260e5460c 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/marker.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 extern struct marker __start___markers[];
 extern struct marker __stop___markers[];
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
        return 0;
 }
+/*
+ * Insert @n into the chain at @nl unless it is already there.
+ *
+ * Walks the list from the head.  If an entry equal to @n is met, returns
+ * without changing the list.  The walk stops at the first entry whose
+ * priority is lower than @n's (or at the end of the list) and @n is
+ * linked in at that position.  Always returns 0.
+ */
+static int notifier_chain_cond_register(struct notifier_block **nl,
+                struct notifier_block *n)
+{
+        while ((*nl) != NULL) {
+                if ((*nl) == n)
+                        return 0;
+                if (n->priority > (*nl)->priority)
+                        break;
+                nl = &((*nl)->next);
+        }
+        n->next = *nl;
+        rcu_assign_pointer(*nl, n);
+        return 0;
+}
 static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
 {
@@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
 EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
 /**
+ *      blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
+ *      @nh: Pointer to head of the blocking notifier chain
+ *      @n: New entry in notifier chain
+ *
+ *      Adds a notifier to a blocking notifier chain, only if not already
+ *      present in the chain.
+ *      Must be called in process context.
+ *
+ *      Currently always returns zero.
+ */
+int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
+                struct notifier_block *n)
+{
+        int ret;
+        down_write(&nh->rwsem);
+        ret = notifier_chain_cond_register(&nh->head, n);
+        up_write(&nh->rwsem);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
+/**
 *      blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
 *      @nh: Pointer to head of the blocking notifier chain
 *      @n: Entry to remove from notifier chain
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,8 @@
 #include <linux/module.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
 struct ns_cgroup {
        struct cgroup_subsys_state css;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
                goto out;
        }
+        /*
+         * CLONE_NEWIPC must detach from the undolist: after switching
+         * to a new ipc namespace, the semaphore arrays from the old
+         * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
+         * means share undolist with parent, so we must forbid using
+         * it along with CLONE_NEWIPC.
+         */
+        if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
+                err = -EINVAL;
+                goto out;
+        }
        new_ns = create_new_namespaces(flags, tsk, tsk->fs);
        if (IS_ERR(new_ns)) {
                err = PTR_ERR(new_ns);
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
 *  'M' - System experienced a machine check exception.
 *  'B' - System has hit bad_page.
 *  'U' - Userspace-defined naughtiness.
+ *  'A' - ACPI table overridden.
+ *  'W' - Taint on warning.
 *
 *      The string is overwritten by the next call to print_taint().
 */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
 {
        static char buf[20];
        if (tainted) {
-                snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c",
+                snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
                        tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
                        tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
                        tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
                        tainted & TAINT_BAD_PAGE ? 'B' : ' ',
                        tainted & TAINT_USER ? 'U' : ' ',
                        tainted & TAINT_DIE ? 'D' : ' ',
-                        tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ');
+                        tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
+                        tainted & TAINT_WARN ? 'W' : ' ');
        }
        else
                snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
        print_modules();
        dump_stack();
        print_oops_end_marker();
+        add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_on_slowpath);
 #endif
diff --git a/kernel/printk.c b/kernel/printk.c
index bdd4ea8c3f2b..d3f9c0f788bf 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1287,31 +1287,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 */
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
-        static DEFINE_SPINLOCK(ratelimit_lock);
+        return __ratelimit(ratelimit_jiffies, ratelimit_burst);
-        static unsigned toks = 10 * 5 * HZ;
-        static unsigned long last_msg;
-        static int missed;
-        unsigned long flags;
-        unsigned long now = jiffies;
-        spin_lock_irqsave(&ratelimit_lock, flags);
-        toks += now - last_msg;
-        last_msg = now;
-        if (toks > (ratelimit_burst * ratelimit_jiffies))
-                toks = ratelimit_burst * ratelimit_jiffies;
-        if (toks >= ratelimit_jiffies) {
-                int lost = missed;
-                missed = 0;
-                toks -= ratelimit_jiffies;
-                spin_unlock_irqrestore(&ratelimit_lock, flags);
-                if (lost)
-                        printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
-                return 1;
-        }
-        missed++;
-        spin_unlock_irqrestore(&ratelimit_lock, flags);
-        return 0;
 }
 EXPORT_SYMBOL(__printk_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
index 606d7387265c..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -587,10 +587,10 @@ static int __init create_proc_profile(void)
                return 0;
        if (create_hash_tables())
                return -1;
-        entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
+        entry = proc_create("profile", S_IWUSR | S_IRUGO,
+                            NULL, &proc_profile_operations);
        if (!entry)
                return 0;
-        entry->proc_fops = &proc_profile_operations;
        entry->size = (1+prof_len) * sizeof(atomic_t);
        hotcpu_notifier(profile_cpu_callback, 0);
        return 0;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 47894f919d4e..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
 #include <linux/byteorder/swabb.h>
 #include <linux/stat.h>
 #include <linux/srcu.h>
+#include <linux/slab.h>
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index dc873fba90d2..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
        .close = relay_file_mmap_close,
 };
+/*
+ * Allocate a zero-filled array of @n_pages struct page pointers.
+ *
+ * Arrays that fit within PAGE_SIZE bytes come from kzalloc(); larger
+ * ones come from vmalloc() and are cleared with memset().  The
+ * allocator's result is returned unchanged, so a failed allocation
+ * yields NULL.
+ */
+static struct page **relay_alloc_page_array(unsigned int n_pages)
+{
+        size_t bytes = n_pages * sizeof(struct page *);
+        struct page **pages;
+        if (bytes <= PAGE_SIZE)
+                return kzalloc(bytes, GFP_KERNEL);
+        pages = vmalloc(bytes);
+        if (pages)
+                memset(pages, 0, bytes);
+        return pages;
+}
+/*
+ * Release a page-pointer array, using vfree() when is_vmalloc_addr()
+ * reports a vmalloc address and kfree() otherwise.
+ */
+static void relay_free_page_array(struct page **array)
+{
+        if (!is_vmalloc_addr(array)) {
+                kfree(array);
+                return;
+        }
+        vfree(array);
+}
 /**
 *      relay_mmap_buf: - mmap channel buffer to process address space
 *      @buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
        *size = PAGE_ALIGN(*size);
        n_pages = *size >> PAGE_SHIFT;
-        buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+        buf->page_array = relay_alloc_page_array(n_pages);
        if (!buf->page_array)
                return NULL;
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 depopulate:
        for (j = 0; j < i; j++)
                __free_page(buf->page_array[j]);
-        kfree(buf->page_array);
+        relay_free_page_array(buf->page_array);
        return NULL;
 }
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
                vunmap(buf->start);
                for (i = 0; i < buf->page_count; i++)
                        __free_page(buf->page_array[i]);
-                kfree(buf->page_array);
+                relay_free_page_array(buf->page_array);
        }
        chan->buf[buf->cpu] = NULL;
        kfree(buf->padding);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
@@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
        }
        counter->usage += val;
+        if (counter->usage > counter->max_usage)
+                counter->max_usage = counter->usage;
        return 0;
 }
@@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
        switch (member) {
        case RES_USAGE:
                return &counter->usage;
+        case RES_MAX_USAGE:
+                return &counter->max_usage;
        case RES_LIMIT:
                return &counter->limit;
        case RES_FAILCNT:
@@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
                        pos, buf, s - buf);
 }
+/* Return the current value of the counter field selected by @member. */
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+        u64 value = *res_counter_member(counter, member);
+        return value;
+}
 ssize_t res_counter_write(struct res_counter *counter, int member,
                const char __user *userbuf, size_t nbytes, loff_t *pos,
                int (*write_strategy)(char *st_buf, unsigned long long *val))
diff --git a/kernel/resource.c b/kernel/resource.c
index cee12cc47cab..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
 static int __init ioresources_init(void)
 {
-        struct proc_dir_entry *entry;
+        proc_create("ioports", 0, NULL, &proc_ioports_operations);
+        proc_create("iomem", 0, NULL, &proc_iomem_operations);
-        entry = create_proc_entry("ioports", 0, NULL);
-        if (entry)
-                entry->proc_fops = &proc_ioports_operations;
-        entry = create_proc_entry("iomem", 0, NULL);
-        if (entry)
-                entry->proc_fops = &proc_iomem_operations;
        return 0;
 }
 __initcall(ioresources_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index 740fb409e5bb..e2f7f5acc807 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9057,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
                                u64 shareval)
 {
        return sched_group_set_shares(cgroup_tg(cgrp), shareval);
 }
-static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
        struct task_group *tg = cgroup_tg(cgrp);
@@ -9073,48 +9073,14 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 #ifdef CONFIG_RT_GROUP_SCHED
 static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-                                struct file *file,
+                                s64 val)
-                                const char __user *userbuf,
-                                size_t nbytes, loff_t *unused_ppos)
 {
-        char buffer[64];
+        return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
-        int retval = 0;
-        s64 val;
-        char *end;
-        if (!nbytes)
-                return -EINVAL;
-        if (nbytes >= sizeof(buffer))
-                return -E2BIG;
-        if (copy_from_user(buffer, userbuf, nbytes))
-                return -EFAULT;
-        buffer[nbytes] = 0;     /* nul-terminate */
-        /* strip newline if necessary */
-        if (nbytes && (buffer[nbytes-1] == '\n'))
-                buffer[nbytes-1] = 0;
-        val = simple_strtoll(buffer, &end, 0);
-        if (*end)
-                return -EINVAL;
-        /* Pass to subsystem */
-        retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
-        if (!retval)
-                retval = nbytes;
-        return retval;
 }
-static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
-                                   struct file *file,
-                                   char __user *buf, size_t nbytes,
-                                   loff_t *ppos)
 {
-        char tmp[64];
+        return sched_group_rt_runtime(cgroup_tg(cgrp));
-        long val = sched_group_rt_runtime(cgroup_tg(cgrp));
-        int len = sprintf(tmp, "%ld\n", val);
-        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -9133,20 +9099,20 @@ static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
        {
                .name = "shares",
-                .read_uint = cpu_shares_read_uint,
+                .read_u64 = cpu_shares_read_u64,
-                .write_uint = cpu_shares_write_uint,
+                .write_u64 = cpu_shares_write_u64,
        },
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
        {
                .name = "rt_runtime_us",
-                .read = cpu_rt_runtime_read,
+                .read_s64 = cpu_rt_runtime_read,
-                .write = cpu_rt_runtime_write,
+                .write_s64 = cpu_rt_runtime_write,
        },
        {
                .name = "rt_period_us",
-                .read_uint = cpu_rt_period_read_uint,
+                .read_u64 = cpu_rt_period_read_uint,
-                .write_uint = cpu_rt_period_write_uint,
+                .write_u64 = cpu_rt_period_write_uint,
        },
 #endif
 };
@@ -9277,8 +9243,8 @@ out:
 static struct cftype files[] = {
        {
                .name = "usage",
-                .read_uint = cpuusage_read,
+                .read_u64 = cpuusage_read,
-                .write_uint = cpuusage_write,
+                .write_u64 = cpuusage_write,
        },
 };
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index f3f4af4b8b0f..8a9498e7c831 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -277,12 +277,9 @@ static int __init init_sched_debug_procfs(void)
 {
        struct proc_dir_entry *pe;
-        pe = create_proc_entry("sched_debug", 0644, NULL);
+        pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
        if (!pe)
                return -ENOMEM;
-        pe->proc_fops = &sched_debug_fops;
        return 0;
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index f2a451366953..e423d0d9e6ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1545,6 +1545,19 @@ out:
 *
 */
+/*
+ * Fold one thread's counters into the running totals: CPU times are
+ * added to *@utimep and *@stimep, every other counter goes straight
+ * into the matching field of @r.
+ */
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
+                                     cputime_t *utimep, cputime_t *stimep)
+{
+        /* minor and major page faults */
+        r->ru_minflt += t->min_flt;
+        r->ru_majflt += t->maj_flt;
+        /* context-switch counts */
+        r->ru_nvcsw += t->nvcsw;
+        r->ru_nivcsw += t->nivcsw;
+        /* block I/O counts as reported by the task_io helpers */
+        r->ru_inblock += task_io_get_inblock(t);
+        r->ru_oublock += task_io_get_oublock(t);
+        /* user and system CPU time stay in cputime_t until the caller converts */
+        *utimep = cputime_add(*utimep, t->utime);
+        *stimep = cputime_add(*stimep, t->stime);
+}
 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
        struct task_struct *t;
@@ -1554,6 +1567,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
+        if (who == RUSAGE_THREAD) {
+                accumulate_thread_rusage(p, r, &utime, &stime);
+                goto out;
+        }
        rcu_read_lock();
        if (!lock_task_sighand(p, &flags)) {
                rcu_read_unlock();
@@ -1586,14 +1604,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
                        r->ru_oublock += p->signal->oublock;
                        t = p;
                        do {
-                                utime = cputime_add(utime, t->utime);
+                                accumulate_thread_rusage(t, r, &utime, &stime);
-                                stime = cputime_add(stime, t->stime);
-                                r->ru_nvcsw += t->nvcsw;
-                                r->ru_nivcsw += t->nivcsw;
-                                r->ru_minflt += t->min_flt;
-                                r->ru_majflt += t->maj_flt;
-                                r->ru_inblock += task_io_get_inblock(t);
-                                r->ru_oublock += task_io_get_oublock(t);
                                t = next_thread(t);
                        } while (t != p);
                        break;
@@ -1605,6 +1616,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
        unlock_task_sighand(p, &flags);
        rcu_read_unlock();
+out:
        cputime_to_timeval(utime, &r->ru_utime);
        cputime_to_timeval(stime, &r->ru_stime);
 }
@@ -1618,7 +1630,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
 {
-        if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
+        if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+            who != RUSAGE_THREAD)
                return -EINVAL;
        return getrusage(current, who, ru);
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fd3364827ccf..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
 #include <linux/writeback.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
+#include <linux/key.h>
 #include <linux/times.h>
 #include <linux/limits.h>
 #include <linux/dcache.h>
@@ -144,12 +145,6 @@ extern int no_unaligned_warning;
 extern int max_lock_depth;
 #endif
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *, int, void __user *, size_t __user *,
-                void __user *, size_t, struct ctl_table *);
-#endif
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -809,6 +804,14 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dostring,
                .strategy       = &sysctl_string,
        },
+#ifdef CONFIG_KEYS
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "keys",
+                .mode           = 0555,
+                .child          = key_sysctls,
+        },
+#endif
 /*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1430,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root)
 }
 #ifdef CONFIG_SYSCTL_SYSCALL
+/*
+ * Carry out the read and/or write requested on one sysctl table entry.
+ *
+ * Permission is checked first; then the entry's strategy routine, if
+ * any, gets the request.  A negative strategy result is returned as-is,
+ * a positive one ends the call with 0, and zero (or no strategy at all)
+ * falls through to sysctl_data() when the entry has data and maxlen set.
+ */
+static int do_sysctl_strategy(struct ctl_table_root *root,
+                        struct ctl_table *table,
+                        int __user *name, int nlen,
+                        void __user *oldval, size_t __user *oldlenp,
+                        void __user *newval, size_t newlen)
+{
+        /* 004 when an old-value buffer was passed, 002 when a new value was */
+        int op = (oldval ? 004 : 0) | (newval ? 002 : 0);
+        int rc;
+        if (sysctl_perm(root, table, op))
+                return -EPERM;
+        if (table->strategy) {
+                rc = table->strategy(table, name, nlen, oldval, oldlenp,
+                                     newval, newlen);
+                if (rc != 0)
+                        return rc < 0 ? rc : 0;
+        }
+        /* Automatic r/w needs both a data pointer and a non-zero maxlen */
+        if (!table->data || !table->maxlen)
+                return 0;
+        rc = sysctl_data(table, name, nlen, oldval, oldlenp,
+                         newval, newlen);
+        return rc < 0 ? rc : 0;
+}
+/*
+ * Walk @table looking for the entry whose ctl_name equals the current
+ * component of the user-supplied @name array.  A match with a child
+ * table descends into it and repeats with the next component; a match
+ * without a child is handed to do_sysctl_strategy().  Returns -ENOTDIR
+ * when the name runs out or nothing matches, -EFAULT when a component
+ * cannot be read, -EPERM when sysctl_perm() refuses the descent, and
+ * otherwise whatever do_sysctl_strategy() returns.
+ */
+static int parse_table(int __user *name, int nlen,
+                       void __user *oldval, size_t __user *oldlenp,
+                       void __user *newval, size_t newlen,
+                       struct ctl_table_root *root,
+                       struct ctl_table *table)
+{
+        int n;
+repeat:
+        if (!nlen)      /* no name components left to match */
+                return -ENOTDIR;
+        if (get_user(n, name))  /* read the current name component */
+                return -EFAULT;
+        /* the scan stops at the first entry with neither ctl_name nor procname */
+        for ( ; table->ctl_name || table->procname; table++) {
+                if (!table->ctl_name)   /* zero ctl_name: nothing to compare n against */
+                        continue;
+                if (n == table->ctl_name) {
+                        int error;
+                        if (table->child) {
+                                /* op 001 must be permitted on this entry before descending */
+                                if (sysctl_perm(root, table, 001))
+                                        return -EPERM;
+                                name++;
+                                nlen--;
+                                /* restart the scan at the top of the child table */
+                                table = table->child;
+                                goto repeat;
+                        }
+                        error = do_sysctl_strategy(root, table, name, nlen,
+                                                   oldval, oldlenp,
+                                                   newval, newlen);
+                        return error;
+                }
+        }
+        return -ENOTDIR;        /* no entry in this table matched n */
+}
 int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
               void __user *newval, size_t newlen)
 {
@@ -1447,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
        for (head = sysctl_head_next(NULL); head;
                        head = sysctl_head_next(head)) {
                error = parse_table(name, nlen, oldval, oldlenp, 
-                                        newval, newlen, head->ctl_table);
+                                        newval, newlen,
+                                        head->root, head->ctl_table);
                if (error != -ENOTDIR) {
                        sysctl_head_finish(head);
                        break;
@@ -1493,84 +1567,22 @@ static int test_perm(int mode, int op)
        return -EACCES;
 }
-int sysctl_perm(struct ctl_table *table, int op)
+int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 {
        int error;
+        int mode;
        error = security_sysctl(table, op);
        if (error)
                return error;
-        return test_perm(table->mode, op);
-}
-#ifdef CONFIG_SYSCTL_SYSCALL
-static int parse_table(int __user *name, int nlen,
-                       void __user *oldval, size_t __user *oldlenp,
-                       void __user *newval, size_t newlen,
-                       struct ctl_table *table)
-{
-        int n;
-repeat:
-        if (!nlen)
-                return -ENOTDIR;
-        if (get_user(n, name))
-                return -EFAULT;
-        for ( ; table->ctl_name || table->procname; table++) {
-                if (!table->ctl_name)
-                        continue;
-                if (n == table->ctl_name) {
-                        int error;
-                        if (table->child) {
-                                if (sysctl_perm(table, 001))
-                                        return -EPERM;
-                                name++;
-                                nlen--;
-                                table = table->child;
-                                goto repeat;
-                        }
-                        error = do_sysctl_strategy(table, name, nlen,
-                                                   oldval, oldlenp,
-                                                   newval, newlen);
-                        return error;
-                }
-        }
-        return -ENOTDIR;
-}
-/* Perform the actual read/write of a sysctl table entry. */
+        if (root->permissions)
-int do_sysctl_strategy (struct ctl_table *table,
+                mode = root->permissions(root, current->nsproxy, table);
-                        int __user *name, int nlen,
+        else
-                        void __user *oldval, size_t __user *oldlenp,
+                mode = table->mode;
-                        void __user *newval, size_t newlen)
-{
-        int op = 0, rc;
-        if (oldval)
-                op |= 004;
-        if (newval) 
-                op |= 002;
-        if (sysctl_perm(table, op))
-                return -EPERM;
-        if (table->strategy) {
+        return test_perm(mode, op);
-                rc = table->strategy(table, name, nlen, oldval, oldlenp,
-                                     newval, newlen);
-                if (rc < 0)
-                        return rc;
-                if (rc > 0)
-                        return 0;
-        }
-        /* If there is no strategy routine, or if the strategy returns
-         * zero, proceed with automatic r/w */
-        if (table->data && table->maxlen) {
-                rc = sysctl_data(table, name, nlen, oldval, oldlenp,
-                                 newval, newlen);
-                if (rc < 0)
-                        return rc;
-        }
-        return 0;
 }
-#endif /* CONFIG_SYSCTL_SYSCALL */
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
@@ -1583,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 static __init int sysctl_init(void)
 {
-        int err;
        sysctl_set_parent(NULL, root_table);
-        err = sysctl_check_table(current->nsproxy, root_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
+        {
+                int err;
+                err = sysctl_check_table(current->nsproxy, root_table);
+        }
+#endif
        return 0;
 }
@@ -1712,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths(
        header->unregistering = NULL;
        header->root = root;
        sysctl_set_parent(NULL, header->ctl_table);
+#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
        if (sysctl_check_table(namespaces, header->ctl_table)) {
                kfree(header);
                return NULL;
        }
+#endif
        spin_lock(&sysctl_lock);
        header_list = lookup_header_list(root, namespaces);
        list_add_tail(&header->ctl_entry, header_list);
diff --git a/kernel/time.c b/kernel/time.c
index 35d373a98782..86729042e4cd 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
 {
        struct proc_dir_entry *pe;
-        pe = create_proc_entry("timer_list", 0644, NULL);
+        pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
        if (!pe)
                return -ENOMEM;
-        pe->proc_fops = &timer_list_fops;
        return 0;
 }
 __initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
 {
        struct proc_dir_entry *pe;
-        pe = create_proc_entry("timer_stats", 0644, NULL);
+        pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
        if (!pe)
                return -ENOMEM;
-        pe->proc_fops = &tstats_fops;
        return 0;
 }
 __initcall(init_tstats_procfs);
diff --git a/kernel/user.c b/kernel/user.c
index debce602bfdd..aefbbfa3159f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
        .files          = ATOMIC_INIT(0),
        .sigpending     = ATOMIC_INIT(0),
        .locked_shm     = 0,
-#ifdef CONFIG_KEYS
-        .uid_keyring    = &root_user_keyring,
-        .session_keyring = &root_session_keyring,
-#endif
 #ifdef CONFIG_USER_SCHED
        .tg             = &init_task_group,
 #endif
@@ -420,12 +416,12 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
                new->mq_bytes = 0;
 #endif
                new->locked_shm = 0;
+#ifdef CONFIG_KEYS
-                if (alloc_uid_keyring(new, current) < 0)
+                new->uid_keyring = new->session_keyring = NULL;
-                        goto out_free_user;
+#endif
                if (sched_create_user(new) < 0)
-                        goto out_put_keys;
+                        goto out_free_user;
                if (uids_user_create(new))
                        goto out_destoy_sched;
@@ -459,9 +455,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 out_destoy_sched:
        sched_destroy_user(new);
-out_put_keys:
-        key_put(new->uid_keyring);
-        key_put(new->session_keyring);
 out_free_user:
        kmem_cache_free(uid_cachep, new);
 out_unlock:
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/version.h>
 #include <linux/nsproxy.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 /*
@@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref)
        release_uids(ns);
        kfree(ns);
 }
+EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/version.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 /*
 * Clone a new ns copying an original utsname, setting refcount to 1
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00ff4d08e370..7db251a959c5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 *
- * We queue the work to the CPU it was submitted, but there is no
+ * We queue the work to the CPU on which it was submitted, but if the CPU dies
- * guarantee that it will be processed by that CPU.
+ * it can be processed by another CPU.
 */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
@@ -772,7 +772,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
-static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
        /*
         * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -808,19 +808,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 void destroy_workqueue(struct workqueue_struct *wq)
 {
        const cpumask_t *cpu_map = wq_cpu_map(wq);
-        struct cpu_workqueue_struct *cwq;
        int cpu;
        get_online_cpus();
        spin_lock(&workqueue_lock);
        list_del(&wq->list);
        spin_unlock(&workqueue_lock);
-        put_online_cpus();
-        for_each_cpu_mask(cpu, *cpu_map) {
+        for_each_cpu_mask(cpu, *cpu_map)
-                cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+                cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
-                cleanup_workqueue_thread(cwq, cpu);
+        put_online_cpus();
-        }
        free_percpu(wq->cpu_wq);
        kfree(wq);
@@ -838,7 +835,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        action &= ~CPU_TASKS_FROZEN;
        switch (action) {
        case CPU_UP_PREPARE:
                cpu_set(cpu, cpu_populated_map);
        }
@@ -861,11 +857,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                case CPU_UP_CANCELED:
                        start_workqueue_thread(cwq, -1);
                case CPU_DEAD:
-                        cleanup_workqueue_thread(cwq, cpu);
+                        cleanup_workqueue_thread(cwq);
                        break;
                }
        }
+        switch (action) {
+        case CPU_UP_CANCELED:
+        case CPU_DEAD:
+                cpu_clear(cpu, cpu_populated_map);
+        }
        return NOTIFY_OK;
 }