Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |   3
-rw-r--r--  kernel/acct.c                | 222
-rw-r--r--  kernel/cgroup.c              | 309
-rw-r--r--  kernel/cpu.c                 |   5
-rw-r--r--  kernel/cpuset.c              | 357
-rw-r--r--  kernel/delayacct.c           |  16
-rw-r--r--  kernel/exit.c                |  61
-rw-r--r--  kernel/fork.c                |  35
-rw-r--r--  kernel/irq/manage.c          |   4
-rw-r--r--  kernel/kallsyms.c            |   2
-rw-r--r--  kernel/kmod.c                |   9
-rw-r--r--  kernel/kprobes.c             | 132
-rw-r--r--  kernel/marker.c              |  25
-rw-r--r--  kernel/ns_cgroup.c           |   8
-rw-r--r--  kernel/nsproxy.c             |   8
-rw-r--r--  kernel/panic.c               |  22
-rw-r--r--  kernel/pid.c                 |  10
-rw-r--r--  kernel/pid_namespace.c       |  10
-rw-r--r--  kernel/posix-timers.c        |  21
-rw-r--r--  kernel/printk.c              |  17
-rw-r--r--  kernel/profile.c             |   4
-rw-r--r--  kernel/res_counter.c         |  48
-rw-r--r--  kernel/sched.c               |   2
-rw-r--r--  kernel/signal.c              |  80
-rw-r--r--  kernel/sys.c                 |   4
-rw-r--r--  kernel/sys_ni.c              |   2
-rw-r--r--  kernel/sysctl.c              |   4
-rw-r--r--  kernel/sysctl_check.c        |   2
-rw-r--r--  kernel/taskstats.c           |   2
-rw-r--r--  kernel/trace/trace_sysprof.c |   2
-rw-r--r--  kernel/tsacct.c              |  25
-rw-r--r--  kernel/workqueue.c           | 112
32 files changed, 871 insertions(+), 692 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 15ab63ffe64d..54f69837d35a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
	    sysctl.o capability.o ptrace.o timer.o user.o \
	    signal.o sys.o kmod.o workqueue.o pid.o \
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..dd68b9059418 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
75/* 75/*
76 * External references and all of the globals. 76 * External references and all of the globals.
77 */ 77 */
78static void do_acct_process(struct pid_namespace *ns, struct file *); 78static void do_acct_process(struct bsd_acct_struct *acct,
79 struct pid_namespace *ns, struct file *);
79 80
80/* 81/*
81 * This structure is used so that all the data protected by lock 82 * This structure is used so that all the data protected by lock
82 * can be placed in the same cache line as the lock. This primes 83 * can be placed in the same cache line as the lock. This primes
83 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
84 */ 85 */
85struct acct_glbs { 86struct bsd_acct_struct {
86 spinlock_t lock;
87 volatile int active; 87 volatile int active;
88 volatile int needcheck; 88 volatile int needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer; 91 struct timer_list timer;
92 struct list_head list;
92}; 93};
93 94
94static struct acct_glbs acct_globals __cacheline_aligned = 95static DEFINE_SPINLOCK(acct_lock);
95 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; 96static LIST_HEAD(acct_list);
96 97
97/* 98/*
98 * Called whenever the timer says to check the free space. 99 * Called whenever the timer says to check the free space.
99 */ 100 */
100static void acct_timeout(unsigned long unused) 101static void acct_timeout(unsigned long x)
101{ 102{
102 acct_globals.needcheck = 1; 103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
103} 105}
104 106
105/* 107/*
106 * Check the amount of free space and suspend/resume accordingly. 108 * Check the amount of free space and suspend/resume accordingly.
107 */ 109 */
108static int check_free_space(struct file *file) 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
109{ 111{
110 struct kstatfs sbuf; 112 struct kstatfs sbuf;
111 int res; 113 int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
113 sector_t resume; 115 sector_t resume;
114 sector_t suspend; 116 sector_t suspend;
115 117
116 spin_lock(&acct_globals.lock); 118 spin_lock(&acct_lock);
117 res = acct_globals.active; 119 res = acct->active;
118 if (!file || !acct_globals.needcheck) 120 if (!file || !acct->needcheck)
119 goto out; 121 goto out;
120 spin_unlock(&acct_globals.lock); 122 spin_unlock(&acct_lock);
121 123
122 /* May block */ 124 /* May block */
123 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
136 act = 0; 138 act = 0;
137 139
138 /* 140 /*
139 * If some joker switched acct_globals.file under us we'ld better be 141 * If some joker switched acct->file under us we'ld better be
140 * silent and _not_ touch anything. 142 * silent and _not_ touch anything.
141 */ 143 */
142 spin_lock(&acct_globals.lock); 144 spin_lock(&acct_lock);
143 if (file != acct_globals.file) { 145 if (file != acct->file) {
144 if (act) 146 if (act)
145 res = act>0; 147 res = act>0;
146 goto out; 148 goto out;
147 } 149 }
148 150
149 if (acct_globals.active) { 151 if (acct->active) {
150 if (act < 0) { 152 if (act < 0) {
151 acct_globals.active = 0; 153 acct->active = 0;
152 printk(KERN_INFO "Process accounting paused\n"); 154 printk(KERN_INFO "Process accounting paused\n");
153 } 155 }
154 } else { 156 } else {
155 if (act > 0) { 157 if (act > 0) {
156 acct_globals.active = 1; 158 acct->active = 1;
157 printk(KERN_INFO "Process accounting resumed\n"); 159 printk(KERN_INFO "Process accounting resumed\n");
158 } 160 }
159 } 161 }
160 162
161 del_timer(&acct_globals.timer); 163 del_timer(&acct->timer);
162 acct_globals.needcheck = 0; 164 acct->needcheck = 0;
163 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
164 add_timer(&acct_globals.timer); 166 add_timer(&acct->timer);
165 res = acct_globals.active; 167 res = acct->active;
166out: 168out:
167 spin_unlock(&acct_globals.lock); 169 spin_unlock(&acct_lock);
168 return res; 170 return res;
169} 171}
170 172
@@ -172,39 +174,41 @@ out:
172 * Close the old accounting file (if currently open) and then replace 174 * Close the old accounting file (if currently open) and then replace
173 * it with file (if non-NULL). 175 * it with file (if non-NULL).
174 * 176 *
175 * NOTE: acct_globals.lock MUST be held on entry and exit. 177 * NOTE: acct_lock MUST be held on entry and exit.
176 */ 178 */
177static void acct_file_reopen(struct file *file) 179static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
180 struct pid_namespace *ns)
178{ 181{
179 struct file *old_acct = NULL; 182 struct file *old_acct = NULL;
180 struct pid_namespace *old_ns = NULL; 183 struct pid_namespace *old_ns = NULL;
181 184
182 if (acct_globals.file) { 185 if (acct->file) {
183 old_acct = acct_globals.file; 186 old_acct = acct->file;
184 old_ns = acct_globals.ns; 187 old_ns = acct->ns;
185 del_timer(&acct_globals.timer); 188 del_timer(&acct->timer);
186 acct_globals.active = 0; 189 acct->active = 0;
187 acct_globals.needcheck = 0; 190 acct->needcheck = 0;
188 acct_globals.file = NULL; 191 acct->file = NULL;
192 acct->ns = NULL;
193 list_del(&acct->list);
189 } 194 }
190 if (file) { 195 if (file) {
191 acct_globals.file = file; 196 acct->file = file;
192 acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); 197 acct->ns = ns;
193 acct_globals.needcheck = 0; 198 acct->needcheck = 0;
194 acct_globals.active = 1; 199 acct->active = 1;
200 list_add(&acct->list, &acct_list);
195 /* It's been deleted if it was used before so this is safe */ 201 /* It's been deleted if it was used before so this is safe */
196 init_timer(&acct_globals.timer); 202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
197 acct_globals.timer.function = acct_timeout; 203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
198 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 204 add_timer(&acct->timer);
199 add_timer(&acct_globals.timer);
200 } 205 }
201 if (old_acct) { 206 if (old_acct) {
202 mnt_unpin(old_acct->f_path.mnt); 207 mnt_unpin(old_acct->f_path.mnt);
203 spin_unlock(&acct_globals.lock); 208 spin_unlock(&acct_lock);
204 do_acct_process(old_ns, old_acct); 209 do_acct_process(acct, old_ns, old_acct);
205 filp_close(old_acct, NULL); 210 filp_close(old_acct, NULL);
206 put_pid_ns(old_ns); 211 spin_lock(&acct_lock);
207 spin_lock(&acct_globals.lock);
208 } 212 }
209} 213}
210 214
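The reopen path above also converts the open-coded init_timer() plus manual .function/.expires assignment into setup_timer(), passing the bsd_acct_struct as the timer's data so that acct_timeout() can recover the per-namespace state. A minimal sketch of that pattern, using an illustrative structure rather than anything from this patch:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct sample_state {				/* hypothetical, not from the patch */
	int needcheck;
	struct timer_list timer;
};

static void sample_timeout(unsigned long data)
{
	/* the timer's data argument carries the owning object */
	struct sample_state *st = (struct sample_state *)data;

	st->needcheck = 1;
}

static void sample_arm(struct sample_state *st)
{
	/* setup_timer() replaces init_timer() + explicit .function/.data */
	setup_timer(&st->timer, sample_timeout, (unsigned long)st);
	st->timer.expires = jiffies + 30 * HZ;
	add_timer(&st->timer);
}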
@@ -212,6 +216,8 @@ static int acct_on(char *name)
212{ 216{
213 struct file *file; 217 struct file *file;
214 int error; 218 int error;
219 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL;
215 221
216 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
217 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 223 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
228 return -EIO; 234 return -EIO;
229 } 235 }
230 236
237 ns = task_active_pid_ns(current);
238 if (ns->bacct == NULL) {
239 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
240 if (acct == NULL) {
241 filp_close(file, NULL);
242 return -ENOMEM;
243 }
244 }
245
231 error = security_acct(file); 246 error = security_acct(file);
232 if (error) { 247 if (error) {
248 kfree(acct);
233 filp_close(file, NULL); 249 filp_close(file, NULL);
234 return error; 250 return error;
235 } 251 }
236 252
237 spin_lock(&acct_globals.lock); 253 spin_lock(&acct_lock);
254 if (ns->bacct == NULL) {
255 ns->bacct = acct;
256 acct = NULL;
257 }
258
238 mnt_pin(file->f_path.mnt); 259 mnt_pin(file->f_path.mnt);
239 acct_file_reopen(file); 260 acct_file_reopen(ns->bacct, file, ns);
240 spin_unlock(&acct_globals.lock); 261 spin_unlock(&acct_lock);
241 262
242 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
264 kfree(acct);
243 265
244 return 0; 266 return 0;
245} 267}
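acct_on() above allocates the candidate bsd_acct_struct with GFP_KERNEL before taking acct_lock, installs it only if ns->bacct is still NULL once the lock is held, and frees the unused copy after unlocking; kfree(NULL) is a no-op, so no extra branch is needed. A sketch of that allocate-outside, publish-under-the-lock shape, with hypothetical names:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct pub_obj {				/* hypothetical object */
	int data;
};

static struct pub_obj *pub_slot;		/* shared slot, protected by pub_lock */
static DEFINE_SPINLOCK(pub_lock);

static int pub_install_once(void)
{
	struct pub_obj *obj = NULL;

	if (pub_slot == NULL) {			/* optimistic check, may race */
		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (obj == NULL)
			return -ENOMEM;
	}

	spin_lock(&pub_lock);
	if (pub_slot == NULL) {			/* re-check under the lock */
		pub_slot = obj;
		obj = NULL;			/* ownership handed over */
	}
	spin_unlock(&pub_lock);

	kfree(obj);				/* frees the losing copy, or NULL */
	return 0;
}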
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
269 error = acct_on(tmp); 291 error = acct_on(tmp);
270 putname(tmp); 292 putname(tmp);
271 } else { 293 } else {
294 struct bsd_acct_struct *acct;
295
296 acct = task_active_pid_ns(current)->bacct;
297 if (acct == NULL)
298 return 0;
299
272 error = security_acct(NULL); 300 error = security_acct(NULL);
273 if (!error) { 301 if (!error) {
274 spin_lock(&acct_globals.lock); 302 spin_lock(&acct_lock);
275 acct_file_reopen(NULL); 303 acct_file_reopen(acct, NULL, NULL);
276 spin_unlock(&acct_globals.lock); 304 spin_unlock(&acct_lock);
277 } 305 }
278 } 306 }
279 return error; 307 return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
288 */ 316 */
289void acct_auto_close_mnt(struct vfsmount *m) 317void acct_auto_close_mnt(struct vfsmount *m)
290{ 318{
291 spin_lock(&acct_globals.lock); 319 struct bsd_acct_struct *acct;
292 if (acct_globals.file && acct_globals.file->f_path.mnt == m) 320
293 acct_file_reopen(NULL); 321 spin_lock(&acct_lock);
294 spin_unlock(&acct_globals.lock); 322restart:
323 list_for_each_entry(acct, &acct_list, list)
324 if (acct->file && acct->file->f_path.mnt == m) {
325 acct_file_reopen(acct, NULL, NULL);
326 goto restart;
327 }
328 spin_unlock(&acct_lock);
295} 329}
296 330
297/** 331/**
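acct_auto_close_mnt() above (and acct_auto_close() just below) restarts the list walk after every hit because acct_file_reopen() drops acct_lock while it writes the final record, so the iterator cannot be trusted once the handler returns. The shape of that restart-after-unlock walk, with illustrative types:

#include <linux/list.h>
#include <linux/spinlock.h>

struct walk_entry {				/* hypothetical element */
	void *key;
	struct list_head list;
};

static LIST_HEAD(walk_list);
static DEFINE_SPINLOCK(walk_lock);

/* may drop and re-take walk_lock internally, e.g. to flush state */
static void walk_close(struct walk_entry *e);

static void walk_close_matching(void *key)
{
	struct walk_entry *e;

	spin_lock(&walk_lock);
restart:
	list_for_each_entry(e, &walk_list, list)
		if (e->key == key) {
			walk_close(e);		/* lock was dropped: start over */
			goto restart;
		}
	spin_unlock(&walk_lock);
}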
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
303 */ 337 */
304void acct_auto_close(struct super_block *sb) 338void acct_auto_close(struct super_block *sb)
305{ 339{
306 spin_lock(&acct_globals.lock); 340 struct bsd_acct_struct *acct;
307 if (acct_globals.file && 341
308 acct_globals.file->f_path.mnt->mnt_sb == sb) { 342 spin_lock(&acct_lock);
309 acct_file_reopen(NULL); 343restart:
344 list_for_each_entry(acct, &acct_list, list)
345 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
346 acct_file_reopen(acct, NULL, NULL);
347 goto restart;
348 }
349 spin_unlock(&acct_lock);
350}
351
352void acct_exit_ns(struct pid_namespace *ns)
353{
354 struct bsd_acct_struct *acct;
355
356 spin_lock(&acct_lock);
357 acct = ns->bacct;
358 if (acct != NULL) {
359 if (acct->file != NULL)
360 acct_file_reopen(acct, NULL, NULL);
361
362 kfree(acct);
310 } 363 }
311 spin_unlock(&acct_globals.lock); 364 spin_unlock(&acct_lock);
312} 365}
313 366
314/* 367/*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
425/* 478/*
426 * do_acct_process does all actual work. Caller holds the reference to file. 479 * do_acct_process does all actual work. Caller holds the reference to file.
427 */ 480 */
428static void do_acct_process(struct pid_namespace *ns, struct file *file) 481static void do_acct_process(struct bsd_acct_struct *acct,
482 struct pid_namespace *ns, struct file *file)
429{ 483{
430 struct pacct_struct *pacct = &current->signal->pacct; 484 struct pacct_struct *pacct = &current->signal->pacct;
431 acct_t ac; 485 acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
440 * First check to see if there is enough free_space to continue 494 * First check to see if there is enough free_space to continue
441 * the process accounting system. 495 * the process accounting system.
442 */ 496 */
443 if (!check_free_space(file)) 497 if (!check_free_space(acct, file))
444 return; 498 return;
445 499
446 /* 500 /*
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
577 spin_unlock_irq(&current->sighand->siglock); 631 spin_unlock_irq(&current->sighand->siglock);
578} 632}
579 633
580/** 634static void acct_process_in_ns(struct pid_namespace *ns)
581 * acct_process - now just a wrapper around do_acct_process
582 * @exitcode: task exit code
583 *
584 * handles process accounting for an exiting task
585 */
586void acct_process(void)
587{ 635{
588 struct file *file = NULL; 636 struct file *file = NULL;
589 struct pid_namespace *ns; 637 struct bsd_acct_struct *acct;
590 638
639 acct = ns->bacct;
591 /* 640 /*
592 * accelerate the common fastpath: 641 * accelerate the common fastpath:
593 */ 642 */
594 if (!acct_globals.file) 643 if (!acct || !acct->file)
595 return; 644 return;
596 645
597 spin_lock(&acct_globals.lock); 646 spin_lock(&acct_lock);
598 file = acct_globals.file; 647 file = acct->file;
599 if (unlikely(!file)) { 648 if (unlikely(!file)) {
600 spin_unlock(&acct_globals.lock); 649 spin_unlock(&acct_lock);
601 return; 650 return;
602 } 651 }
603 get_file(file); 652 get_file(file);
604 ns = get_pid_ns(acct_globals.ns); 653 spin_unlock(&acct_lock);
605 spin_unlock(&acct_globals.lock);
606 654
607 do_acct_process(ns, file); 655 do_acct_process(acct, ns, file);
608 fput(file); 656 fput(file);
609 put_pid_ns(ns); 657}
658
659/**
660 * acct_process - now just a wrapper around acct_process_in_ns,
661 * which in turn is a wrapper around do_acct_process.
662 *
663 * handles process accounting for an exiting task
664 */
665void acct_process(void)
666{
667 struct pid_namespace *ns;
668
669 /*
670 * This loop is safe lockless, since current is still
671 * alive and holds its namespace, which in turn holds
672 * its parent.
673 */
674 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
675 acct_process_in_ns(ns);
610} 676}
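acct_process_in_ns() above takes a reference on the file with get_file() while acct_lock is held, drops the lock before the potentially blocking write, and releases the reference with fput() afterwards, with a lockless fastpath in front for namespaces that have accounting off. The same grab-a-reference-under-the-lock snapshot, sketched with hypothetical names:

#include <linux/file.h>
#include <linux/fs.h>
#include <linux/spinlock.h>

static struct file *snap_file;			/* hypothetical shared file pointer */
static DEFINE_SPINLOCK(snap_lock);

static void snap_write_record(struct file *file);	/* may sleep */

static void snap_flush(void)
{
	struct file *file;

	if (!snap_file)				/* lockless fastpath */
		return;

	spin_lock(&snap_lock);
	file = snap_file;
	if (unlikely(!file)) {			/* re-check under the lock */
		spin_unlock(&snap_lock);
		return;
	}
	get_file(file);				/* pin it before dropping the lock */
	spin_unlock(&snap_lock);

	snap_write_record(file);		/* safe to block here */
	fput(file);
}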
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..66ec9fd21e0c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -89,11 +89,7 @@ struct cgroupfs_root {
89 /* Hierarchy-specific flags */ 89 /* Hierarchy-specific flags */
90 unsigned long flags; 90 unsigned long flags;
91 91
92 /* The path to use for release notifications. No locking 92 /* The path to use for release notifications. */
93 * between setting and use - so if userspace updates this
94 * while child cgroups exist, you could miss a
95 * notification. We ensure that it's always a valid
96 * NUL-terminated string */
97 char release_agent_path[PATH_MAX]; 93 char release_agent_path[PATH_MAX];
98}; 94};
99 95
@@ -118,7 +114,7 @@ static int root_count;
118 * extra work in the fork/exit path if none of the subsystems need to 114 * extra work in the fork/exit path if none of the subsystems need to
119 * be called. 115 * be called.
120 */ 116 */
121static int need_forkexit_callback; 117static int need_forkexit_callback __read_mostly;
122static int need_mm_owner_callback __read_mostly; 118static int need_mm_owner_callback __read_mostly;
123 119
124/* convenient tests for these bits */ 120/* convenient tests for these bits */
@@ -220,7 +216,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
220 * task until after the first call to cgroup_iter_start(). This 216 * task until after the first call to cgroup_iter_start(). This
221 * reduces the fork()/exit() overhead for people who have cgroups 217 * reduces the fork()/exit() overhead for people who have cgroups
222 * compiled into their kernel but not actually in use */ 218 * compiled into their kernel but not actually in use */
223static int use_task_css_set_links; 219static int use_task_css_set_links __read_mostly;
224 220
225/* When we create or destroy a css_set, the operation simply 221/* When we create or destroy a css_set, the operation simply
226 * takes/releases a reference count on all the cgroups referenced 222 * takes/releases a reference count on all the cgroups referenced
@@ -241,17 +237,20 @@ static int use_task_css_set_links;
241 */ 237 */
242static void unlink_css_set(struct css_set *cg) 238static void unlink_css_set(struct css_set *cg)
243{ 239{
240 struct cg_cgroup_link *link;
241 struct cg_cgroup_link *saved_link;
242
244 write_lock(&css_set_lock); 243 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 while (!list_empty(&cg->cg_links)) { 246
248 struct cg_cgroup_link *link; 247 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
249 link = list_entry(cg->cg_links.next, 248 cg_link_list) {
250 struct cg_cgroup_link, cg_link_list);
251 list_del(&link->cg_link_list); 249 list_del(&link->cg_link_list);
252 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
253 kfree(link); 251 kfree(link);
254 } 252 }
253
255 write_unlock(&css_set_lock); 254 write_unlock(&css_set_lock);
256} 255}
257 256
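The conversion above, like the ones in allocate_cg_links(), free_cg_links() and cgroup_kill_sb() below, replaces open-coded while (!list_empty()) / list_entry() loops with list_for_each_entry_safe(), which keeps a second cursor so the current entry can be unlinked and freed during the walk. A self-contained sketch of the idiom with an illustrative element type:

#include <linux/list.h>
#include <linux/slab.h>

struct demo_link {				/* hypothetical element */
	int payload;
	struct list_head node;
};

static void demo_free_all(struct list_head *head)
{
	struct demo_link *link;
	struct demo_link *saved;		/* next element, fetched up front */

	/* safe against removal of 'link' inside the loop body */
	list_for_each_entry_safe(link, saved, head, node) {
		list_del(&link->node);
		kfree(link);
	}
}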
@@ -363,15 +362,14 @@ static struct css_set *find_existing_css_set(
363static int allocate_cg_links(int count, struct list_head *tmp) 362static int allocate_cg_links(int count, struct list_head *tmp)
364{ 363{
365 struct cg_cgroup_link *link; 364 struct cg_cgroup_link *link;
365 struct cg_cgroup_link *saved_link;
366 int i; 366 int i;
367 INIT_LIST_HEAD(tmp); 367 INIT_LIST_HEAD(tmp);
368 for (i = 0; i < count; i++) { 368 for (i = 0; i < count; i++) {
369 link = kmalloc(sizeof(*link), GFP_KERNEL); 369 link = kmalloc(sizeof(*link), GFP_KERNEL);
370 if (!link) { 370 if (!link) {
371 while (!list_empty(tmp)) { 371 list_for_each_entry_safe(link, saved_link, tmp,
372 link = list_entry(tmp->next, 372 cgrp_link_list) {
373 struct cg_cgroup_link,
374 cgrp_link_list);
375 list_del(&link->cgrp_link_list); 373 list_del(&link->cgrp_link_list);
376 kfree(link); 374 kfree(link);
377 } 375 }
@@ -384,11 +382,10 @@ static int allocate_cg_links(int count, struct list_head *tmp)
384 382
385static void free_cg_links(struct list_head *tmp) 383static void free_cg_links(struct list_head *tmp)
386{ 384{
387 while (!list_empty(tmp)) { 385 struct cg_cgroup_link *link;
388 struct cg_cgroup_link *link; 386 struct cg_cgroup_link *saved_link;
389 link = list_entry(tmp->next, 387
390 struct cg_cgroup_link, 388 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
391 cgrp_link_list);
392 list_del(&link->cgrp_link_list); 389 list_del(&link->cgrp_link_list);
393 kfree(link); 390 kfree(link);
394 } 391 }
@@ -415,11 +412,11 @@ static struct css_set *find_css_set(
415 412
416 /* First see if we already have a cgroup group that matches 413 /* First see if we already have a cgroup group that matches
417 * the desired set */ 414 * the desired set */
418 write_lock(&css_set_lock); 415 read_lock(&css_set_lock);
419 res = find_existing_css_set(oldcg, cgrp, template); 416 res = find_existing_css_set(oldcg, cgrp, template);
420 if (res) 417 if (res)
421 get_css_set(res); 418 get_css_set(res);
422 write_unlock(&css_set_lock); 419 read_unlock(&css_set_lock);
423 420
424 if (res) 421 if (res)
425 return res; 422 return res;
@@ -507,10 +504,6 @@ static struct css_set *find_css_set(
507 * knows that the cgroup won't be removed, as cgroup_rmdir() 504 * knows that the cgroup won't be removed, as cgroup_rmdir()
508 * needs that mutex. 505 * needs that mutex.
509 * 506 *
510 * The cgroup_common_file_write handler for operations that modify
511 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
512 * single threading all such cgroup modifications across the system.
513 *
514 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 507 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
515 * (usually) take cgroup_mutex. These are the two most performance 508 * (usually) take cgroup_mutex. These are the two most performance
516 * critical pieces of code here. The exception occurs on cgroup_exit(), 509 * critical pieces of code here. The exception occurs on cgroup_exit(),
@@ -1093,6 +1086,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1093 struct cgroupfs_root *root = sb->s_fs_info; 1086 struct cgroupfs_root *root = sb->s_fs_info;
1094 struct cgroup *cgrp = &root->top_cgroup; 1087 struct cgroup *cgrp = &root->top_cgroup;
1095 int ret; 1088 int ret;
1089 struct cg_cgroup_link *link;
1090 struct cg_cgroup_link *saved_link;
1096 1091
1097 BUG_ON(!root); 1092 BUG_ON(!root);
1098 1093
@@ -1112,10 +1107,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1112 * root cgroup 1107 * root cgroup
1113 */ 1108 */
1114 write_lock(&css_set_lock); 1109 write_lock(&css_set_lock);
1115 while (!list_empty(&cgrp->css_sets)) { 1110
1116 struct cg_cgroup_link *link; 1111 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1117 link = list_entry(cgrp->css_sets.next, 1112 cgrp_link_list) {
1118 struct cg_cgroup_link, cgrp_link_list);
1119 list_del(&link->cg_link_list); 1113 list_del(&link->cg_link_list);
1120 list_del(&link->cgrp_link_list); 1114 list_del(&link->cgrp_link_list);
1121 kfree(link); 1115 kfree(link);
@@ -1281,18 +1275,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1281} 1275}
1282 1276
1283/* 1277/*
1284 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with 1278 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1285 * cgroup_mutex, may take task_lock of task 1279 * held. May take task_lock of task
1286 */ 1280 */
1287static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) 1281static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1288{ 1282{
1289 pid_t pid;
1290 struct task_struct *tsk; 1283 struct task_struct *tsk;
1291 int ret; 1284 int ret;
1292 1285
1293 if (sscanf(pidbuf, "%d", &pid) != 1)
1294 return -EIO;
1295
1296 if (pid) { 1286 if (pid) {
1297 rcu_read_lock(); 1287 rcu_read_lock();
1298 tsk = find_task_by_vpid(pid); 1288 tsk = find_task_by_vpid(pid);
@@ -1318,6 +1308,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1318 return ret; 1308 return ret;
1319} 1309}
1320 1310
1311static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1312{
1313 int ret;
1314 if (!cgroup_lock_live_group(cgrp))
1315 return -ENODEV;
1316 ret = attach_task_by_pid(cgrp, pid);
1317 cgroup_unlock();
1318 return ret;
1319}
1320
1321/* The various types of files and directories in a cgroup file system */ 1321/* The various types of files and directories in a cgroup file system */
1322enum cgroup_filetype { 1322enum cgroup_filetype {
1323 FILE_ROOT, 1323 FILE_ROOT,
@@ -1327,12 +1327,54 @@ enum cgroup_filetype {
1327 FILE_RELEASE_AGENT, 1327 FILE_RELEASE_AGENT,
1328}; 1328};
1329 1329
1330/**
1331 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1332 * @cgrp: the cgroup to be checked for liveness
1333 *
1334 * On success, returns true; the lock should be later released with
1335 * cgroup_unlock(). On failure returns false with no lock held.
1336 */
1337bool cgroup_lock_live_group(struct cgroup *cgrp)
1338{
1339 mutex_lock(&cgroup_mutex);
1340 if (cgroup_is_removed(cgrp)) {
1341 mutex_unlock(&cgroup_mutex);
1342 return false;
1343 }
1344 return true;
1345}
1346
1347static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1348 const char *buffer)
1349{
1350 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1351 if (!cgroup_lock_live_group(cgrp))
1352 return -ENODEV;
1353 strcpy(cgrp->root->release_agent_path, buffer);
1354 cgroup_unlock();
1355 return 0;
1356}
1357
1358static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
1359 struct seq_file *seq)
1360{
1361 if (!cgroup_lock_live_group(cgrp))
1362 return -ENODEV;
1363 seq_puts(seq, cgrp->root->release_agent_path);
1364 seq_putc(seq, '\n');
1365 cgroup_unlock();
1366 return 0;
1367}
1368
1369/* A buffer size big enough for numbers or short strings */
1370#define CGROUP_LOCAL_BUFFER_SIZE 64
1371
1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 1372static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1331 struct file *file, 1373 struct file *file,
1332 const char __user *userbuf, 1374 const char __user *userbuf,
1333 size_t nbytes, loff_t *unused_ppos) 1375 size_t nbytes, loff_t *unused_ppos)
1334{ 1376{
1335 char buffer[64]; 1377 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
1336 int retval = 0; 1378 int retval = 0;
1337 char *end; 1379 char *end;
1338 1380
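cgroup_lock_live_group() above packages the take-cgroup_mutex-then-check-removed sequence that each write handler used to open-code; on failure nothing is held and the handler simply returns -ENODEV. A sketch of a write handler built on it (the handler and its value are hypothetical; the helper and cgroup_unlock() are the ones used by this patch):

#include <linux/cgroup.h>
#include <linux/errno.h>

static u64 demo_value;				/* hypothetical attribute */

static int demo_attr_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	if (!cgroup_lock_live_group(cgrp))	/* cgroup already removed */
		return -ENODEV;

	demo_value = val;			/* cgroup_mutex is held here */

	cgroup_unlock();
	return 0;
}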
@@ -1361,68 +1403,36 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1361 return retval; 1403 return retval;
1362} 1404}
1363 1405
1364static ssize_t cgroup_common_file_write(struct cgroup *cgrp, 1406static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1365 struct cftype *cft, 1407 struct file *file,
1366 struct file *file, 1408 const char __user *userbuf,
1367 const char __user *userbuf, 1409 size_t nbytes, loff_t *unused_ppos)
1368 size_t nbytes, loff_t *unused_ppos)
1369{ 1410{
1370 enum cgroup_filetype type = cft->private; 1411 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
1371 char *buffer;
1372 int retval = 0; 1412 int retval = 0;
1413 size_t max_bytes = cft->max_write_len;
1414 char *buffer = local_buffer;
1373 1415
1374 if (nbytes >= PATH_MAX) 1416 if (!max_bytes)
1417 max_bytes = sizeof(local_buffer) - 1;
1418 if (nbytes >= max_bytes)
1375 return -E2BIG; 1419 return -E2BIG;
1376 1420 /* Allocate a dynamic buffer if we need one */
1377 /* +1 for nul-terminator */ 1421 if (nbytes >= sizeof(local_buffer)) {
1378 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 1422 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1379 if (buffer == NULL) 1423 if (buffer == NULL)
1380 return -ENOMEM; 1424 return -ENOMEM;
1381
1382 if (copy_from_user(buffer, userbuf, nbytes)) {
1383 retval = -EFAULT;
1384 goto out1;
1385 } 1425 }
1386 buffer[nbytes] = 0; /* nul-terminate */ 1426 if (nbytes && copy_from_user(buffer, userbuf, nbytes))
1387 strstrip(buffer); /* strip -just- trailing whitespace */ 1427 return -EFAULT;
1388
1389 mutex_lock(&cgroup_mutex);
1390 1428
1391 /* 1429 buffer[nbytes] = 0; /* nul-terminate */
1392 * This was already checked for in cgroup_file_write(), but 1430 strstrip(buffer);
1393 * check again now we're holding cgroup_mutex. 1431 retval = cft->write_string(cgrp, cft, buffer);
1394 */ 1432 if (!retval)
1395 if (cgroup_is_removed(cgrp)) {
1396 retval = -ENODEV;
1397 goto out2;
1398 }
1399
1400 switch (type) {
1401 case FILE_TASKLIST:
1402 retval = attach_task_by_pid(cgrp, buffer);
1403 break;
1404 case FILE_NOTIFY_ON_RELEASE:
1405 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1406 if (simple_strtoul(buffer, NULL, 10) != 0)
1407 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1408 else
1409 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1410 break;
1411 case FILE_RELEASE_AGENT:
1412 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1413 strcpy(cgrp->root->release_agent_path, buffer);
1414 break;
1415 default:
1416 retval = -EINVAL;
1417 goto out2;
1418 }
1419
1420 if (retval == 0)
1421 retval = nbytes; 1433 retval = nbytes;
1422out2: 1434 if (buffer != local_buffer)
1423 mutex_unlock(&cgroup_mutex); 1435 kfree(buffer);
1424out1:
1425 kfree(buffer);
1426 return retval; 1436 return retval;
1427} 1437}
1428 1438
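cgroup_write_string() above bounds the write by cft->max_write_len, copies small writes into an on-stack buffer and falls back to kmalloc() only when the value is larger than that buffer. The stack-or-heap copy-in idiom in isolation; the apply callback is an abstraction for this sketch, not part of the patch:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>

#define DEMO_LOCAL_BUFFER_SIZE 64

static ssize_t demo_copy_string(const char __user *userbuf, size_t nbytes,
				size_t max_bytes, int (*apply)(const char *))
{
	char local_buffer[DEMO_LOCAL_BUFFER_SIZE];
	char *buffer = local_buffer;
	ssize_t retval;

	if (nbytes >= max_bytes)
		return -E2BIG;
	/* only go to the heap when the write outgrows the stack buffer */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		if (buffer == NULL)
			return -ENOMEM;
	}
	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out;
	}
	buffer[nbytes] = 0;			/* nul-terminate */
	strstrip(buffer);			/* strip trailing whitespace */
	retval = apply(buffer);
	if (!retval)
		retval = nbytes;
out:
	if (buffer != local_buffer)
		kfree(buffer);
	return retval;
}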
@@ -1438,6 +1448,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1448 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1439 if (cft->write_u64 || cft->write_s64) 1449 if (cft->write_u64 || cft->write_s64)
1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 1450 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1451 if (cft->write_string)
1452 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) { 1453 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 1454 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes; 1455 return ret ? ret : nbytes;
@@ -1450,7 +1462,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1450 char __user *buf, size_t nbytes, 1462 char __user *buf, size_t nbytes,
1451 loff_t *ppos) 1463 loff_t *ppos)
1452{ 1464{
1453 char tmp[64]; 1465 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1454 u64 val = cft->read_u64(cgrp, cft); 1466 u64 val = cft->read_u64(cgrp, cft);
1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1467 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1456 1468
@@ -1462,56 +1474,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1462 char __user *buf, size_t nbytes, 1474 char __user *buf, size_t nbytes,
1463 loff_t *ppos) 1475 loff_t *ppos)
1464{ 1476{
1465 char tmp[64]; 1477 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1466 s64 val = cft->read_s64(cgrp, cft); 1478 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val); 1479 int len = sprintf(tmp, "%lld\n", (long long) val);
1468 1480
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1481 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470} 1482}
1471 1483
1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1473 struct cftype *cft,
1474 struct file *file,
1475 char __user *buf,
1476 size_t nbytes, loff_t *ppos)
1477{
1478 enum cgroup_filetype type = cft->private;
1479 char *page;
1480 ssize_t retval = 0;
1481 char *s;
1482
1483 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1484 return -ENOMEM;
1485
1486 s = page;
1487
1488 switch (type) {
1489 case FILE_RELEASE_AGENT:
1490 {
1491 struct cgroupfs_root *root;
1492 size_t n;
1493 mutex_lock(&cgroup_mutex);
1494 root = cgrp->root;
1495 n = strnlen(root->release_agent_path,
1496 sizeof(root->release_agent_path));
1497 n = min(n, (size_t) PAGE_SIZE);
1498 strncpy(s, root->release_agent_path, n);
1499 mutex_unlock(&cgroup_mutex);
1500 s += n;
1501 break;
1502 }
1503 default:
1504 retval = -EINVAL;
1505 goto out;
1506 }
1507 *s++ = '\n';
1508
1509 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1510out:
1511 free_page((unsigned long)page);
1512 return retval;
1513}
1514
1515static ssize_t cgroup_file_read(struct file *file, char __user *buf, 1484static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1516 size_t nbytes, loff_t *ppos) 1485 size_t nbytes, loff_t *ppos)
1517{ 1486{
@@ -1569,6 +1538,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
1569 1538
1570static struct file_operations cgroup_seqfile_operations = { 1539static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read, 1540 .read = seq_read,
1541 .write = cgroup_file_write,
1572 .llseek = seq_lseek, 1542 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release, 1543 .release = cgroup_seqfile_release,
1574}; 1544};
@@ -1756,15 +1726,11 @@ int cgroup_add_files(struct cgroup *cgrp,
1756int cgroup_task_count(const struct cgroup *cgrp) 1726int cgroup_task_count(const struct cgroup *cgrp)
1757{ 1727{
1758 int count = 0; 1728 int count = 0;
1759 struct list_head *l; 1729 struct cg_cgroup_link *link;
1760 1730
1761 read_lock(&css_set_lock); 1731 read_lock(&css_set_lock);
1762 l = cgrp->css_sets.next; 1732 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1763 while (l != &cgrp->css_sets) {
1764 struct cg_cgroup_link *link =
1765 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1766 count += atomic_read(&link->cg->ref.refcount); 1733 count += atomic_read(&link->cg->ref.refcount);
1767 l = l->next;
1768 } 1734 }
1769 read_unlock(&css_set_lock); 1735 read_unlock(&css_set_lock);
1770 return count; 1736 return count;
@@ -2227,6 +2193,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2227 return notify_on_release(cgrp); 2193 return notify_on_release(cgrp);
2228} 2194}
2229 2195
2196static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2197 struct cftype *cft,
2198 u64 val)
2199{
2200 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
2201 if (val)
2202 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2203 else
2204 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2205 return 0;
2206}
2207
2230/* 2208/*
2231 * for the common functions, 'private' gives the type of file 2209 * for the common functions, 'private' gives the type of file
2232 */ 2210 */
@@ -2235,7 +2213,7 @@ static struct cftype files[] = {
2235 .name = "tasks", 2213 .name = "tasks",
2236 .open = cgroup_tasks_open, 2214 .open = cgroup_tasks_open,
2237 .read = cgroup_tasks_read, 2215 .read = cgroup_tasks_read,
2238 .write = cgroup_common_file_write, 2216 .write_u64 = cgroup_tasks_write,
2239 .release = cgroup_tasks_release, 2217 .release = cgroup_tasks_release,
2240 .private = FILE_TASKLIST, 2218 .private = FILE_TASKLIST,
2241 }, 2219 },
@@ -2243,15 +2221,16 @@ static struct cftype files[] = {
2243 { 2221 {
2244 .name = "notify_on_release", 2222 .name = "notify_on_release",
2245 .read_u64 = cgroup_read_notify_on_release, 2223 .read_u64 = cgroup_read_notify_on_release,
2246 .write = cgroup_common_file_write, 2224 .write_u64 = cgroup_write_notify_on_release,
2247 .private = FILE_NOTIFY_ON_RELEASE, 2225 .private = FILE_NOTIFY_ON_RELEASE,
2248 }, 2226 },
2249}; 2227};
2250 2228
2251static struct cftype cft_release_agent = { 2229static struct cftype cft_release_agent = {
2252 .name = "release_agent", 2230 .name = "release_agent",
2253 .read = cgroup_common_file_read, 2231 .read_seq_string = cgroup_release_agent_show,
2254 .write = cgroup_common_file_write, 2232 .write_string = cgroup_release_agent_write,
2233 .max_write_len = PATH_MAX,
2255 .private = FILE_RELEASE_AGENT, 2234 .private = FILE_RELEASE_AGENT,
2256}; 2235};
2257 2236
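With the .write_string/.max_write_len and .read_seq_string hooks used above, a controller can declare a string-valued control file without writing its own copy-in code. A hypothetical cftype wired to such handlers (declarations only; the handler bodies are not part of this patch):

#include <linux/cgroup.h>
#include <linux/limits.h>
#include <linux/seq_file.h>

static int demo_path_write(struct cgroup *cgrp, struct cftype *cft,
			   const char *buffer);
static int demo_path_show(struct cgroup *cgrp, struct cftype *cft,
			  struct seq_file *seq);

/* hypothetical control file holding a path-sized string */
static struct cftype cft_demo_path = {
	.name = "demo_path",
	.read_seq_string = demo_path_show,
	.write_string = demo_path_write,
	.max_write_len = PATH_MAX,		/* copy-in buffer sized for a path */
};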
@@ -2869,16 +2848,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2869 * cgroup_clone - clone the cgroup the given subsystem is attached to 2848 * cgroup_clone - clone the cgroup the given subsystem is attached to
2870 * @tsk: the task to be moved 2849 * @tsk: the task to be moved
2871 * @subsys: the given subsystem 2850 * @subsys: the given subsystem
2851 * @nodename: the name for the new cgroup
2872 * 2852 *
2873 * Duplicate the current cgroup in the hierarchy that the given 2853 * Duplicate the current cgroup in the hierarchy that the given
2874 * subsystem is attached to, and move this task into the new 2854 * subsystem is attached to, and move this task into the new
2875 * child. 2855 * child.
2876 */ 2856 */
2877int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) 2857int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2858 char *nodename)
2878{ 2859{
2879 struct dentry *dentry; 2860 struct dentry *dentry;
2880 int ret = 0; 2861 int ret = 0;
2881 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2882 struct cgroup *parent, *child; 2862 struct cgroup *parent, *child;
2883 struct inode *inode; 2863 struct inode *inode;
2884 struct css_set *cg; 2864 struct css_set *cg;
@@ -2903,8 +2883,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2903 cg = tsk->cgroups; 2883 cg = tsk->cgroups;
2904 parent = task_cgroup(tsk, subsys->subsys_id); 2884 parent = task_cgroup(tsk, subsys->subsys_id);
2905 2885
2906 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
2907
2908 /* Pin the hierarchy */ 2886 /* Pin the hierarchy */
2909 atomic_inc(&parent->root->sb->s_active); 2887 atomic_inc(&parent->root->sb->s_active);
2910 2888
@@ -3078,27 +3056,24 @@ static void cgroup_release_agent(struct work_struct *work)
3078 while (!list_empty(&release_list)) { 3056 while (!list_empty(&release_list)) {
3079 char *argv[3], *envp[3]; 3057 char *argv[3], *envp[3];
3080 int i; 3058 int i;
3081 char *pathbuf; 3059 char *pathbuf = NULL, *agentbuf = NULL;
3082 struct cgroup *cgrp = list_entry(release_list.next, 3060 struct cgroup *cgrp = list_entry(release_list.next,
3083 struct cgroup, 3061 struct cgroup,
3084 release_list); 3062 release_list);
3085 list_del_init(&cgrp->release_list); 3063 list_del_init(&cgrp->release_list);
3086 spin_unlock(&release_list_lock); 3064 spin_unlock(&release_list_lock);
3087 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 3065 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3088 if (!pathbuf) { 3066 if (!pathbuf)
3089 spin_lock(&release_list_lock); 3067 goto continue_free;
3090 continue; 3068 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
3091 } 3069 goto continue_free;
3092 3070 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
3093 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { 3071 if (!agentbuf)
3094 kfree(pathbuf); 3072 goto continue_free;
3095 spin_lock(&release_list_lock);
3096 continue;
3097 }
3098 3073
3099 i = 0; 3074 i = 0;
3100 argv[i++] = cgrp->root->release_agent_path; 3075 argv[i++] = agentbuf;
3101 argv[i++] = (char *)pathbuf; 3076 argv[i++] = pathbuf;
3102 argv[i] = NULL; 3077 argv[i] = NULL;
3103 3078
3104 i = 0; 3079 i = 0;
@@ -3112,8 +3087,10 @@ static void cgroup_release_agent(struct work_struct *work)
3112 * be a slow process */ 3087 * be a slow process */
3113 mutex_unlock(&cgroup_mutex); 3088 mutex_unlock(&cgroup_mutex);
3114 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 3089 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
3115 kfree(pathbuf);
3116 mutex_lock(&cgroup_mutex); 3090 mutex_lock(&cgroup_mutex);
3091 continue_free:
3092 kfree(pathbuf);
3093 kfree(agentbuf);
3117 spin_lock(&release_list_lock); 3094 spin_lock(&release_list_lock);
3118 } 3095 }
3119 spin_unlock(&release_list_lock); 3096 spin_unlock(&release_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2cc409ce0a8f..10ba5f1004a5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -285,6 +285,11 @@ out_allowed:
 	set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
 	cpu_hotplug_done();
+	if (!err) {
+		if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
+					    hcpu) == NOTIFY_BAD)
+			BUG();
+	}
 	return err;
 }
 
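The hunk above fires a CPU_POST_DEAD notification only after cpu_hotplug_done(), i.e. outside the hotplug lock, so callbacks that need to block or flush pending work get a phase where that is safe. A sketch of a callback distinguishing the two phases; the callback itself is hypothetical:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int demo_cpu_callback(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/* still inside the hotplug sequence; avoid long blocking work */
		break;
	case CPU_POST_DEAD:
		/* cpu_hotplug_done() has already run; safe to flush or block */
		break;
	}
	return NOTIFY_OK;
}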
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5738910c34c..91cf85b36dd5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -227,10 +227,6 @@ static struct cpuset top_cpuset = {
227 * The task_struct fields mems_allowed and mems_generation may only 227 * The task_struct fields mems_allowed and mems_generation may only
228 * be accessed in the context of that task, so require no locks. 228 * be accessed in the context of that task, so require no locks.
229 * 229 *
230 * The cpuset_common_file_write handler for operations that modify
231 * the cpuset hierarchy holds cgroup_mutex across the entire operation,
232 * single threading all such cpuset modifications across the system.
233 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 230 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 231 * small pieces of code, such as when reading out possibly multi-word
236 * cpumasks and nodemasks. 232 * cpumasks and nodemasks.
@@ -369,7 +365,7 @@ void cpuset_update_task_memory_state(void)
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 365 my_cpusets_mem_gen = top_cpuset.mems_generation;
370 } else { 366 } else {
371 rcu_read_lock(); 367 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(current)->mems_generation; 368 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock(); 369 rcu_read_unlock();
374 } 370 }
375 371
@@ -500,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
500/* 496/*
501 * rebuild_sched_domains() 497 * rebuild_sched_domains()
502 * 498 *
503 * If the flag 'sched_load_balance' of any cpuset with non-empty 499 * This routine will be called to rebuild the scheduler's dynamic
504 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 500 * sched domains:
505 * which has that flag enabled, or if any cpuset with a non-empty 501 * - if the flag 'sched_load_balance' of any cpuset with non-empty
506 * 'cpus' is removed, then call this routine to rebuild the 502 * 'cpus' changes,
507 * scheduler's dynamic sched domains. 503 * - or if the 'cpus' allowed changes in any cpuset which has that
504 * flag enabled,
505 * - or if the 'sched_relax_domain_level' of any cpuset which has
506 * that flag enabled and with non-empty 'cpus' changes,
507 * - or if any cpuset with non-empty 'cpus' is removed,
508 * - or if a cpu gets offlined.
508 * 509 *
509 * This routine builds a partial partition of the systems CPUs 510 * This routine builds a partial partition of the systems CPUs
510 * (the set of non-overlappping cpumask_t's in the array 'part' 511 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -609,8 +610,13 @@ void rebuild_sched_domains(void)
609 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { 610 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
610 struct cgroup *cont; 611 struct cgroup *cont;
611 struct cpuset *child; /* scans child cpusets of cp */ 612 struct cpuset *child; /* scans child cpusets of cp */
613
614 if (cpus_empty(cp->cpus_allowed))
615 continue;
616
612 if (is_sched_load_balance(cp)) 617 if (is_sched_load_balance(cp))
613 csa[csn++] = cp; 618 csa[csn++] = cp;
619
614 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 620 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
615 child = cgroup_cs(cont); 621 child = cgroup_cs(cont);
616 __kfifo_put(q, (void *)&child, sizeof(cp)); 622 __kfifo_put(q, (void *)&child, sizeof(cp));
@@ -703,36 +709,6 @@ done:
703 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 709 /* Don't kfree(dattr) -- partition_sched_domains() does that. */
704} 710}
705 711
706static inline int started_after_time(struct task_struct *t1,
707 struct timespec *time,
708 struct task_struct *t2)
709{
710 int start_diff = timespec_compare(&t1->start_time, time);
711 if (start_diff > 0) {
712 return 1;
713 } else if (start_diff < 0) {
714 return 0;
715 } else {
716 /*
717 * Arbitrarily, if two processes started at the same
718 * time, we'll say that the lower pointer value
719 * started first. Note that t2 may have exited by now
720 * so this may not be a valid pointer any longer, but
721 * that's fine - it still serves to distinguish
722 * between two tasks started (effectively)
723 * simultaneously.
724 */
725 return t1 > t2;
726 }
727}
728
729static inline int started_after(void *p1, void *p2)
730{
731 struct task_struct *t1 = p1;
732 struct task_struct *t2 = p2;
733 return started_after_time(t1, &t2->start_time, t2);
734}
735
736/** 712/**
737 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 713 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
738 * @tsk: task to test 714 * @tsk: task to test
@@ -768,15 +744,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
768} 744}
769 745
770/** 746/**
747 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
748 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
749 *
750 * Called with cgroup_mutex held
751 *
752 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
753 * calling callback functions for each.
754 *
755 * Return 0 if successful, -errno if not.
756 */
757static int update_tasks_cpumask(struct cpuset *cs)
758{
759 struct cgroup_scanner scan;
760 struct ptr_heap heap;
761 int retval;
762
763 /*
764 * cgroup_scan_tasks() will initialize heap->gt for us.
765 * heap_init() is still needed here for we should not change
766 * cs->cpus_allowed when heap_init() fails.
767 */
768 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
769 if (retval)
770 return retval;
771
772 scan.cg = cs->css.cgroup;
773 scan.test_task = cpuset_test_cpumask;
774 scan.process_task = cpuset_change_cpumask;
775 scan.heap = &heap;
776 retval = cgroup_scan_tasks(&scan);
777
778 heap_free(&heap);
779 return retval;
780}
781
782/**
771 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 783 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
772 * @cs: the cpuset to consider 784 * @cs: the cpuset to consider
773 * @buf: buffer of cpu numbers written to this cpuset 785 * @buf: buffer of cpu numbers written to this cpuset
774 */ 786 */
775static int update_cpumask(struct cpuset *cs, char *buf) 787static int update_cpumask(struct cpuset *cs, const char *buf)
776{ 788{
777 struct cpuset trialcs; 789 struct cpuset trialcs;
778 struct cgroup_scanner scan;
779 struct ptr_heap heap;
780 int retval; 790 int retval;
781 int is_load_balanced; 791 int is_load_balanced;
782 792
@@ -792,7 +802,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
792 * that parsing. The validate_change() call ensures that cpusets 802 * that parsing. The validate_change() call ensures that cpusets
793 * with tasks have cpus. 803 * with tasks have cpus.
794 */ 804 */
795 buf = strstrip(buf);
796 if (!*buf) { 805 if (!*buf) {
797 cpus_clear(trialcs.cpus_allowed); 806 cpus_clear(trialcs.cpus_allowed);
798 } else { 807 } else {
@@ -811,10 +820,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 820 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
812 return 0; 821 return 0;
813 822
814 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
815 if (retval)
816 return retval;
817
818 is_load_balanced = is_sched_load_balance(&trialcs); 823 is_load_balanced = is_sched_load_balance(&trialcs);
819 824
820 mutex_lock(&callback_mutex); 825 mutex_lock(&callback_mutex);
@@ -825,12 +830,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
825 * Scan tasks in the cpuset, and update the cpumasks of any 830 * Scan tasks in the cpuset, and update the cpumasks of any
826 * that need an update. 831 * that need an update.
827 */ 832 */
828 scan.cg = cs->css.cgroup; 833 retval = update_tasks_cpumask(cs);
829 scan.test_task = cpuset_test_cpumask; 834 if (retval < 0)
830 scan.process_task = cpuset_change_cpumask; 835 return retval;
831 scan.heap = &heap;
832 cgroup_scan_tasks(&scan);
833 heap_free(&heap);
834 836
835 if (is_load_balanced) 837 if (is_load_balanced)
836 rebuild_sched_domains(); 838 rebuild_sched_domains();
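update_tasks_cpumask() above drives the per-task update through cgroup_scan_tasks(): the scanner carries a test callback that picks tasks whose mask is stale, a process callback that fixes them, and a pre-allocated heap so tasks forking during the scan are not missed. A sketch of a scanner user with hypothetical callbacks, mirroring the setup shown above:

#include <linux/cgroup.h>
#include <linux/prio_heap.h>
#include <linux/slab.h>

static int demo_test_task(struct task_struct *tsk, struct cgroup_scanner *scan);
static void demo_process_task(struct task_struct *tsk,
			      struct cgroup_scanner *scan);

/* apply demo_process_task() to every task in @cgrp selected by demo_test_task() */
static int demo_scan(struct cgroup *cgrp)
{
	struct cgroup_scanner scan;
	struct ptr_heap heap;
	int retval;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	scan.cg = cgrp;
	scan.test_task = demo_test_task;
	scan.process_task = demo_process_task;
	scan.heap = &heap;
	retval = cgroup_scan_tasks(&scan);

	heap_free(&heap);
	return retval;
}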
@@ -886,74 +888,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
886 mutex_unlock(&callback_mutex); 888 mutex_unlock(&callback_mutex);
887} 889}
888 890
889/*
890 * Handle user request to change the 'mems' memory placement
891 * of a cpuset. Needs to validate the request, update the
892 * cpusets mems_allowed and mems_generation, and for each
893 * task in the cpuset, rebind any vma mempolicies and if
894 * the cpuset is marked 'memory_migrate', migrate the tasks
895 * pages to the new memory.
896 *
897 * Call with cgroup_mutex held. May take callback_mutex during call.
898 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
899 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
900 * their mempolicies to the cpusets new mems_allowed.
901 */
902
903static void *cpuset_being_rebound; 891static void *cpuset_being_rebound;
904 892
905static int update_nodemask(struct cpuset *cs, char *buf) 893/**
894 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
895 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
896 * @oldmem: old mems_allowed of cpuset cs
897 *
898 * Called with cgroup_mutex held
899 * Return 0 if successful, -errno if not.
900 */
901static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
906{ 902{
907 struct cpuset trialcs;
908 nodemask_t oldmem;
909 struct task_struct *p; 903 struct task_struct *p;
910 struct mm_struct **mmarray; 904 struct mm_struct **mmarray;
911 int i, n, ntasks; 905 int i, n, ntasks;
912 int migrate; 906 int migrate;
913 int fudge; 907 int fudge;
914 int retval;
915 struct cgroup_iter it; 908 struct cgroup_iter it;
916 909 int retval;
917 /*
918 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
919 * it's read-only
920 */
921 if (cs == &top_cpuset)
922 return -EACCES;
923
924 trialcs = *cs;
925
926 /*
927 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
928 * Since nodelist_parse() fails on an empty mask, we special case
929 * that parsing. The validate_change() call ensures that cpusets
930 * with tasks have memory.
931 */
932 buf = strstrip(buf);
933 if (!*buf) {
934 nodes_clear(trialcs.mems_allowed);
935 } else {
936 retval = nodelist_parse(buf, trialcs.mems_allowed);
937 if (retval < 0)
938 goto done;
939
940 if (!nodes_subset(trialcs.mems_allowed,
941 node_states[N_HIGH_MEMORY]))
942 return -EINVAL;
943 }
944 oldmem = cs->mems_allowed;
945 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
946 retval = 0; /* Too easy - nothing to do */
947 goto done;
948 }
949 retval = validate_change(cs, &trialcs);
950 if (retval < 0)
951 goto done;
952
953 mutex_lock(&callback_mutex);
954 cs->mems_allowed = trialcs.mems_allowed;
955 cs->mems_generation = cpuset_mems_generation++;
956 mutex_unlock(&callback_mutex);
957 910
958 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 911 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
959 912
@@ -1020,7 +973,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1020 973
1021 mpol_rebind_mm(mm, &cs->mems_allowed); 974 mpol_rebind_mm(mm, &cs->mems_allowed);
1022 if (migrate) 975 if (migrate)
1023 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); 976 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1024 mmput(mm); 977 mmput(mm);
1025 } 978 }
1026 979
@@ -1032,6 +985,70 @@ done:
1032 return retval; 985 return retval;
1033} 986}
1034 987
988/*
989 * Handle user request to change the 'mems' memory placement
990 * of a cpuset. Needs to validate the request, update the
991 * cpusets mems_allowed and mems_generation, and for each
992 * task in the cpuset, rebind any vma mempolicies and if
993 * the cpuset is marked 'memory_migrate', migrate the tasks
994 * pages to the new memory.
995 *
996 * Call with cgroup_mutex held. May take callback_mutex during call.
997 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
998 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
999 * their mempolicies to the cpusets new mems_allowed.
1000 */
1001static int update_nodemask(struct cpuset *cs, const char *buf)
1002{
1003 struct cpuset trialcs;
1004 nodemask_t oldmem;
1005 int retval;
1006
1007 /*
1008 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1009 * it's read-only
1010 */
1011 if (cs == &top_cpuset)
1012 return -EACCES;
1013
1014 trialcs = *cs;
1015
1016 /*
1017 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1018 * Since nodelist_parse() fails on an empty mask, we special case
1019 * that parsing. The validate_change() call ensures that cpusets
1020 * with tasks have memory.
1021 */
1022 if (!*buf) {
1023 nodes_clear(trialcs.mems_allowed);
1024 } else {
1025 retval = nodelist_parse(buf, trialcs.mems_allowed);
1026 if (retval < 0)
1027 goto done;
1028
1029 if (!nodes_subset(trialcs.mems_allowed,
1030 node_states[N_HIGH_MEMORY]))
1031 return -EINVAL;
1032 }
1033 oldmem = cs->mems_allowed;
1034 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
1035 retval = 0; /* Too easy - nothing to do */
1036 goto done;
1037 }
1038 retval = validate_change(cs, &trialcs);
1039 if (retval < 0)
1040 goto done;
1041
1042 mutex_lock(&callback_mutex);
1043 cs->mems_allowed = trialcs.mems_allowed;
1044 cs->mems_generation = cpuset_mems_generation++;
1045 mutex_unlock(&callback_mutex);
1046
1047 retval = update_tasks_nodemask(cs, &oldmem);
1048done:
1049 return retval;
1050}
1051
1035int current_cpuset_is_being_rebound(void) 1052int current_cpuset_is_being_rebound(void)
1036{ 1053{
1037 return task_cs(current) == cpuset_being_rebound; 1054 return task_cs(current) == cpuset_being_rebound;
@@ -1044,7 +1061,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1044 1061
1045 if (val != cs->relax_domain_level) { 1062 if (val != cs->relax_domain_level) {
1046 cs->relax_domain_level = val; 1063 cs->relax_domain_level = val;
1047 rebuild_sched_domains(); 1064 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1065 rebuild_sched_domains();
1048 } 1066 }
1049 1067
1050 return 0; 1068 return 0;
@@ -1256,72 +1274,14 @@ typedef enum {
1256 FILE_SPREAD_SLAB, 1274 FILE_SPREAD_SLAB,
1257} cpuset_filetype_t; 1275} cpuset_filetype_t;
1258 1276
1259static ssize_t cpuset_common_file_write(struct cgroup *cont,
1260 struct cftype *cft,
1261 struct file *file,
1262 const char __user *userbuf,
1263 size_t nbytes, loff_t *unused_ppos)
1264{
1265 struct cpuset *cs = cgroup_cs(cont);
1266 cpuset_filetype_t type = cft->private;
1267 char *buffer;
1268 int retval = 0;
1269
1270 /* Crude upper limit on largest legitimate cpulist user might write. */
1271 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1272 return -E2BIG;
1273
1274 /* +1 for nul-terminator */
1275 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1276 if (!buffer)
1277 return -ENOMEM;
1278
1279 if (copy_from_user(buffer, userbuf, nbytes)) {
1280 retval = -EFAULT;
1281 goto out1;
1282 }
1283 buffer[nbytes] = 0; /* nul-terminate */
1284
1285 cgroup_lock();
1286
1287 if (cgroup_is_removed(cont)) {
1288 retval = -ENODEV;
1289 goto out2;
1290 }
1291
1292 switch (type) {
1293 case FILE_CPULIST:
1294 retval = update_cpumask(cs, buffer);
1295 break;
1296 case FILE_MEMLIST:
1297 retval = update_nodemask(cs, buffer);
1298 break;
1299 default:
1300 retval = -EINVAL;
1301 goto out2;
1302 }
1303
1304 if (retval == 0)
1305 retval = nbytes;
1306out2:
1307 cgroup_unlock();
1308out1:
1309 kfree(buffer);
1310 return retval;
1311}
1312
1313static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1277static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1314{ 1278{
1315 int retval = 0; 1279 int retval = 0;
1316 struct cpuset *cs = cgroup_cs(cgrp); 1280 struct cpuset *cs = cgroup_cs(cgrp);
1317 cpuset_filetype_t type = cft->private; 1281 cpuset_filetype_t type = cft->private;
1318 1282
1319 cgroup_lock(); 1283 if (!cgroup_lock_live_group(cgrp))
1320
1321 if (cgroup_is_removed(cgrp)) {
1322 cgroup_unlock();
1323 return -ENODEV; 1284 return -ENODEV;
1324 }
1325 1285
1326 switch (type) { 1286 switch (type) {
1327 case FILE_CPU_EXCLUSIVE: 1287 case FILE_CPU_EXCLUSIVE:
@@ -1367,12 +1327,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1367 struct cpuset *cs = cgroup_cs(cgrp); 1327 struct cpuset *cs = cgroup_cs(cgrp);
1368 cpuset_filetype_t type = cft->private; 1328 cpuset_filetype_t type = cft->private;
1369 1329
1370 cgroup_lock(); 1330 if (!cgroup_lock_live_group(cgrp))
1371
1372 if (cgroup_is_removed(cgrp)) {
1373 cgroup_unlock();
1374 return -ENODEV; 1331 return -ENODEV;
1375 } 1332
1376 switch (type) { 1333 switch (type) {
1377 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1334 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1378 retval = update_relax_domain_level(cs, val); 1335 retval = update_relax_domain_level(cs, val);
@@ -1386,6 +1343,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1386} 1343}
1387 1344
1388/* 1345/*
1346 * Common handling for a write to a "cpus" or "mems" file.
1347 */
1348static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1349 const char *buf)
1350{
1351 int retval = 0;
1352
1353 if (!cgroup_lock_live_group(cgrp))
1354 return -ENODEV;
1355
1356 switch (cft->private) {
1357 case FILE_CPULIST:
1358 retval = update_cpumask(cgroup_cs(cgrp), buf);
1359 break;
1360 case FILE_MEMLIST:
1361 retval = update_nodemask(cgroup_cs(cgrp), buf);
1362 break;
1363 default:
1364 retval = -EINVAL;
1365 break;
1366 }
1367 cgroup_unlock();
1368 return retval;
1369}
1370
1371/*
1389 * These ascii lists should be read in a single call, by using a user 1372 * These ascii lists should be read in a single call, by using a user
1390 * buffer large enough to hold the entire map. If read in smaller 1373 * buffer large enough to hold the entire map. If read in smaller
1391 * chunks, there is no guarantee of atomicity. Since the display format 1374 * chunks, there is no guarantee of atomicity. Since the display format
@@ -1504,14 +1487,16 @@ static struct cftype files[] = {
1504 { 1487 {
1505 .name = "cpus", 1488 .name = "cpus",
1506 .read = cpuset_common_file_read, 1489 .read = cpuset_common_file_read,
1507 .write = cpuset_common_file_write, 1490 .write_string = cpuset_write_resmask,
1491 .max_write_len = (100U + 6 * NR_CPUS),
1508 .private = FILE_CPULIST, 1492 .private = FILE_CPULIST,
1509 }, 1493 },
1510 1494
1511 { 1495 {
1512 .name = "mems", 1496 .name = "mems",
1513 .read = cpuset_common_file_read, 1497 .read = cpuset_common_file_read,
1514 .write = cpuset_common_file_write, 1498 .write_string = cpuset_write_resmask,
1499 .max_write_len = (100U + 6 * MAX_NUMNODES),
1515 .private = FILE_MEMLIST, 1500 .private = FILE_MEMLIST,
1516 }, 1501 },
1517 1502
@@ -1792,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1792 scan.scan.heap = NULL; 1777 scan.scan.heap = NULL;
1793 scan.to = to->css.cgroup; 1778 scan.to = to->css.cgroup;
1794 1779
1795 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) 1780 if (cgroup_scan_tasks(&scan.scan))
1796 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1781 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1797 "cgroup_scan_tasks failed\n"); 1782 "cgroup_scan_tasks failed\n");
1798} 1783}
@@ -1852,6 +1837,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1852 struct cpuset *child; /* scans child cpusets of cp */ 1837 struct cpuset *child; /* scans child cpusets of cp */
1853 struct list_head queue; 1838 struct list_head queue;
1854 struct cgroup *cont; 1839 struct cgroup *cont;
1840 nodemask_t oldmems;
1855 1841
1856 INIT_LIST_HEAD(&queue); 1842 INIT_LIST_HEAD(&queue);
1857 1843
@@ -1871,6 +1857,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1871 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 1857 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1872 continue; 1858 continue;
1873 1859
1860 oldmems = cp->mems_allowed;
1861
1874 /* Remove offline cpus and mems from this cpuset. */ 1862 /* Remove offline cpus and mems from this cpuset. */
1875 mutex_lock(&callback_mutex); 1863 mutex_lock(&callback_mutex);
1876 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 1864 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1882,6 +1870,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1882 if (cpus_empty(cp->cpus_allowed) || 1870 if (cpus_empty(cp->cpus_allowed) ||
1883 nodes_empty(cp->mems_allowed)) 1871 nodes_empty(cp->mems_allowed))
1884 remove_tasks_in_empty_cpuset(cp); 1872 remove_tasks_in_empty_cpuset(cp);
1873 else {
1874 update_tasks_cpumask(cp);
1875 update_tasks_nodemask(cp, &oldmems);
1876 }
1885 } 1877 }
1886} 1878}
1887 1879
@@ -1974,7 +1966,6 @@ void __init cpuset_init_smp(void)
1974} 1966}
1975 1967
1976/** 1968/**
1977
1978 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1969 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1979 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1970 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1980 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 1971 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
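
The cpuset hunks above move the "cpus"/"mems" writes onto the cgroup core's write_string handler and replace the open-coded cgroup_lock()/cgroup_is_removed() pair with cgroup_lock_live_group(). A minimal sketch of a handler built on the same pattern, using only the cftype fields and cgroup helpers visible in this diff; example_parse() and the "example_list" file are made up for illustration:

#include <linux/cgroup.h>
#include <linux/errno.h>

/* Hypothetical parser, stands in for update_cpumask()/update_nodemask(). */
static int example_parse(struct cgroup *cgrp, const char *buf)
{
        return 0;
}

/* Same shape as cpuset_write_resmask() above: take the cgroup lock only
 * if the group is still alive, do the update, drop the lock. */
static int example_write_list(struct cgroup *cgrp, struct cftype *cft,
                              const char *buf)
{
        int retval;

        if (!cgroup_lock_live_group(cgrp))
                return -ENODEV;         /* group was removed under us */
        retval = example_parse(cgrp, buf);
        cgroup_unlock();
        return retval;
}

static struct cftype example_files[] = {
        {
                .name           = "example_list",
                .write_string   = example_write_list,
                .max_write_len  = 4096, /* cap on what the cgroup core copies in */
        },
};
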
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
148 tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
149 d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
148 d->blkio_count += tsk->delays->blkio_count; 150 d->blkio_count += tsk->delays->blkio_count;
149 d->swapin_count += tsk->delays->swapin_count; 151 d->swapin_count += tsk->delays->swapin_count;
152 d->freepages_count += tsk->delays->freepages_count;
150 spin_unlock_irqrestore(&tsk->delays->lock, flags); 153 spin_unlock_irqrestore(&tsk->delays->lock, flags);
151 154
152done: 155done:
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
165 return ret; 168 return ret;
166} 169}
167 170
171void __delayacct_freepages_start(void)
172{
173 delayacct_start(&current->delays->freepages_start);
174}
175
176void __delayacct_freepages_end(void)
177{
178 delayacct_end(&current->delays->freepages_start,
179 &current->delays->freepages_end,
180 &current->delays->freepages_delay,
181 &current->delays->freepages_count);
182}
183
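
The delayacct hunk adds freepages (direct reclaim) delay accounting alongside the existing blkio and swapin counters. A sketch of how a reclaim path might bracket the wait; the delayacct_freepages_start()/delayacct_freepages_end() wrappers are assumed to exist in <linux/delayacct.h> as thin delayacct_on checks around the double-underscore functions added above:

#include <linux/delayacct.h>

/* Illustrative only: bracket a synchronous reclaim wait with the new hooks. */
static void example_direct_reclaim(void)
{
        delayacct_freepages_start();    /* assumed wrapper, see lead-in */
        /* ... synchronous page reclaim would happen here ... */
        delayacct_freepages_end();      /* assumed wrapper, see lead-in */
}
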
diff --git a/kernel/exit.c b/kernel/exit.c
index 93d2711b9381..ad933bb29ec7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 rcu_read_lock();
89 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference(tsk->sighand);
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
@@ -121,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk)
121 sig->nivcsw += tsk->nivcsw; 120 sig->nivcsw += tsk->nivcsw;
122 sig->inblock += task_io_get_inblock(tsk); 121 sig->inblock += task_io_get_inblock(tsk);
123 sig->oublock += task_io_get_oublock(tsk); 122 sig->oublock += task_io_get_oublock(tsk);
123#ifdef CONFIG_TASK_XACCT
124 sig->rchar += tsk->rchar;
125 sig->wchar += tsk->wchar;
126 sig->syscr += tsk->syscr;
127 sig->syscw += tsk->syscw;
128#endif /* CONFIG_TASK_XACCT */
129#ifdef CONFIG_TASK_IO_ACCOUNTING
130 sig->ioac.read_bytes += tsk->ioac.read_bytes;
131 sig->ioac.write_bytes += tsk->ioac.write_bytes;
132 sig->ioac.cancelled_write_bytes +=
133 tsk->ioac.cancelled_write_bytes;
134#endif /* CONFIG_TASK_IO_ACCOUNTING */
124 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 135 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
125 sig = NULL; /* Marker for below. */ 136 sig = NULL; /* Marker for below. */
126 } 137 }
@@ -136,7 +147,6 @@ static void __exit_signal(struct task_struct *tsk)
136 tsk->signal = NULL; 147 tsk->signal = NULL;
137 tsk->sighand = NULL; 148 tsk->sighand = NULL;
138 spin_unlock(&sighand->siglock); 149 spin_unlock(&sighand->siglock);
139 rcu_read_unlock();
140 150
141 __cleanup_sighand(sighand); 151 __cleanup_sighand(sighand);
142 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 152 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -432,7 +442,7 @@ void daemonize(const char *name, ...)
432 * We don't want to have TIF_FREEZE set if the system-wide hibernation 442 * We don't want to have TIF_FREEZE set if the system-wide hibernation
433 * or suspend transition begins right now. 443 * or suspend transition begins right now.
434 */ 444 */
435 current->flags |= PF_NOFREEZE; 445 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
436 446
437 if (current->nsproxy != &init_nsproxy) { 447 if (current->nsproxy != &init_nsproxy) {
438 get_nsproxy(&init_nsproxy); 448 get_nsproxy(&init_nsproxy);
@@ -666,26 +676,40 @@ assign_new_owner:
666static void exit_mm(struct task_struct * tsk) 676static void exit_mm(struct task_struct * tsk)
667{ 677{
668 struct mm_struct *mm = tsk->mm; 678 struct mm_struct *mm = tsk->mm;
679 struct core_state *core_state;
669 680
670 mm_release(tsk, mm); 681 mm_release(tsk, mm);
671 if (!mm) 682 if (!mm)
672 return; 683 return;
673 /* 684 /*
674 * Serialize with any possible pending coredump. 685 * Serialize with any possible pending coredump.
675 * We must hold mmap_sem around checking core_waiters 686 * We must hold mmap_sem around checking core_state
676 * and clearing tsk->mm. The core-inducing thread 687 * and clearing tsk->mm. The core-inducing thread
677 * will increment core_waiters for each thread in the 688 * will increment ->nr_threads for each thread in the
678 * group with ->mm != NULL. 689 * group with ->mm != NULL.
679 */ 690 */
680 down_read(&mm->mmap_sem); 691 down_read(&mm->mmap_sem);
681 if (mm->core_waiters) { 692 core_state = mm->core_state;
693 if (core_state) {
694 struct core_thread self;
682 up_read(&mm->mmap_sem); 695 up_read(&mm->mmap_sem);
683 down_write(&mm->mmap_sem);
684 if (!--mm->core_waiters)
685 complete(mm->core_startup_done);
686 up_write(&mm->mmap_sem);
687 696
688 wait_for_completion(&mm->core_done); 697 self.task = tsk;
698 self.next = xchg(&core_state->dumper.next, &self);
699 /*
700 * Implies mb(), the result of xchg() must be visible
701 * to core_state->dumper.
702 */
703 if (atomic_dec_and_test(&core_state->nr_threads))
704 complete(&core_state->startup);
705
706 for (;;) {
707 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
708 if (!self.task) /* see coredump_finish() */
709 break;
710 schedule();
711 }
712 __set_task_state(tsk, TASK_RUNNING);
689 down_read(&mm->mmap_sem); 713 down_read(&mm->mmap_sem);
690 } 714 }
691 atomic_inc(&mm->mm_count); 715 atomic_inc(&mm->mm_count);
@@ -1354,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options,
1354 psig->coublock += 1378 psig->coublock +=
1355 task_io_get_oublock(p) + 1379 task_io_get_oublock(p) +
1356 sig->oublock + sig->coublock; 1380 sig->oublock + sig->coublock;
1381#ifdef CONFIG_TASK_XACCT
1382 psig->rchar += p->rchar + sig->rchar;
1383 psig->wchar += p->wchar + sig->wchar;
1384 psig->syscr += p->syscr + sig->syscr;
1385 psig->syscw += p->syscw + sig->syscw;
1386#endif /* CONFIG_TASK_XACCT */
1387#ifdef CONFIG_TASK_IO_ACCOUNTING
1388 psig->ioac.read_bytes +=
1389 p->ioac.read_bytes + sig->ioac.read_bytes;
1390 psig->ioac.write_bytes +=
1391 p->ioac.write_bytes + sig->ioac.write_bytes;
1392 psig->ioac.cancelled_write_bytes +=
1393 p->ioac.cancelled_write_bytes +
1394 sig->ioac.cancelled_write_bytes;
1395#endif /* CONFIG_TASK_IO_ACCOUNTING */
1357 spin_unlock_irq(&p->parent->sighand->siglock); 1396 spin_unlock_irq(&p->parent->sighand->siglock);
1358 } 1397 }
1359 1398
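
In the new exit_mm(), a thread that finds mm->core_state queues a struct core_thread on core_state->dumper, drops core_state->nr_threads, and sleeps until its self.task pointer is cleared. A rough sketch of the wake-up side this protocol implies; the real coredump_wait()/coredump_finish() live in fs/exec.c, and locking and error handling are omitted here:

#include <linux/sched.h>
#include <linux/mm_types.h>

/* Illustrative counterpart to the waiting loop in exit_mm() above. */
static void example_coredump_finish(struct mm_struct *mm)
{
        struct core_thread *curr, *next;
        struct task_struct *task;

        next = mm->core_state->dumper.next;
        while ((curr = next) != NULL) {
                next = curr->next;
                task = curr->task;
                /*
                 * Clear ->task before waking: exit_mm() rechecks self.task
                 * after every wakeup and only then leaves its
                 * TASK_UNINTERRUPTIBLE loop.
                 */
                smp_mb();
                curr->task = NULL;
                wake_up_process(task);
        }
        mm->core_state = NULL;
}
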
diff --git a/kernel/fork.c b/kernel/fork.c
index 552c8d8e77ad..b99d73e971a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,6 +93,23 @@ int nr_processes(void)
93static struct kmem_cache *task_struct_cachep; 93static struct kmem_cache *task_struct_cachep;
94#endif 94#endif
95 95
96#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
97static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
98{
99#ifdef CONFIG_DEBUG_STACK_USAGE
100 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
101#else
102 gfp_t mask = GFP_KERNEL;
103#endif
104 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
105}
106
107static inline void free_thread_info(struct thread_info *ti)
108{
109 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
110}
111#endif
112
96/* SLAB cache for signal_struct structures (tsk->signal) */ 113/* SLAB cache for signal_struct structures (tsk->signal) */
97static struct kmem_cache *signal_cachep; 114static struct kmem_cache *signal_cachep;
98 115
@@ -383,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
383 INIT_LIST_HEAD(&mm->mmlist); 400 INIT_LIST_HEAD(&mm->mmlist);
384 mm->flags = (current->mm) ? current->mm->flags 401 mm->flags = (current->mm) ? current->mm->flags
385 : MMF_DUMP_FILTER_DEFAULT; 402 : MMF_DUMP_FILTER_DEFAULT;
386 mm->core_waiters = 0; 403 mm->core_state = NULL;
387 mm->nr_ptes = 0; 404 mm->nr_ptes = 0;
388 set_mm_counter(mm, file_rss, 0); 405 set_mm_counter(mm, file_rss, 0);
389 set_mm_counter(mm, anon_rss, 0); 406 set_mm_counter(mm, anon_rss, 0);
@@ -457,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput);
457/** 474/**
458 * get_task_mm - acquire a reference to the task's mm 475 * get_task_mm - acquire a reference to the task's mm
459 * 476 *
460 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning 477 * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
461 * this kernel workthread has transiently adopted a user mm with use_mm, 478 * this kernel workthread has transiently adopted a user mm with use_mm,
462 * to do its AIO) is not set and if so returns a reference to it, after 479 * to do its AIO) is not set and if so returns a reference to it, after
463 * bumping up the use count. User must release the mm via mmput() 480 * bumping up the use count. User must release the mm via mmput()
@@ -470,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
470 task_lock(task); 487 task_lock(task);
471 mm = task->mm; 488 mm = task->mm;
472 if (mm) { 489 if (mm) {
473 if (task->flags & PF_BORROWED_MM) 490 if (task->flags & PF_KTHREAD)
474 mm = NULL; 491 mm = NULL;
475 else 492 else
476 atomic_inc(&mm->mm_users); 493 atomic_inc(&mm->mm_users);
@@ -795,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
795 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 812 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
796 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 813 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
797 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 814 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
815#ifdef CONFIG_TASK_XACCT
816 sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
817#endif
818#ifdef CONFIG_TASK_IO_ACCOUNTING
819 memset(&sig->ioac, 0, sizeof(sig->ioac));
820#endif
798 sig->sum_sched_runtime = 0; 821 sig->sum_sched_runtime = 0;
799 INIT_LIST_HEAD(&sig->cpu_timers[0]); 822 INIT_LIST_HEAD(&sig->cpu_timers[0]);
800 INIT_LIST_HEAD(&sig->cpu_timers[1]); 823 INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -1090,6 +1113,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1090 if (clone_flags & CLONE_THREAD) 1113 if (clone_flags & CLONE_THREAD)
1091 p->tgid = current->tgid; 1114 p->tgid = current->tgid;
1092 1115
1116 if (current->nsproxy != p->nsproxy) {
1117 retval = ns_cgroup_clone(p, pid);
1118 if (retval)
1119 goto bad_fork_free_pid;
1120 }
1121
1093 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1122 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1094 /* 1123 /*
1095 * Clear TID on mm_release()? 1124 * Clear TID on mm_release()?
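
fork.c now carries a generic, page-based alloc_thread_info()/free_thread_info() fallback, guarded by __HAVE_ARCH_THREAD_INFO_ALLOCATOR so an architecture with special needs can keep its own allocator. One possible shape of such an override, assuming a dedicated slab cache set up at boot (example_ti_cache and the file placement are hypothetical):

/* In the architecture's asm/thread_info.h (illustrative): */
#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR

/* In arch code: */
#include <linux/slab.h>
#include <linux/sched.h>

static struct kmem_cache *example_ti_cache;     /* assumed: created during arch init */

struct thread_info *alloc_thread_info(struct task_struct *tsk)
{
        return kmem_cache_alloc(example_ti_cache, GFP_KERNEL);
}

void free_thread_info(struct thread_info *ti)
{
        kmem_cache_free(example_ti_cache, ti);
}
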
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5bc6e5ecc493..f8914b92b664 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -260,9 +260,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
260 } 260 }
261 } else { 261 } else {
262 if (desc->wake_depth == 0) { 262 if (desc->wake_depth == 0) {
263 printk(KERN_WARNING "Unbalanced IRQ %d " 263 WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
264 "wake disable\n", irq);
265 WARN_ON(1);
266 } else if (--desc->wake_depth == 0) { 264 } else if (--desc->wake_depth == 0) {
267 ret = set_irq_wake_real(irq, on); 265 ret = set_irq_wake_real(irq, on);
268 if (ret) 266 if (ret)
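
The manage.c hunk folds a printk(KERN_WARNING ...) plus WARN_ON(1) into a single WARN(1, ...). A small usage sketch; it also leans on the generic WARN() evaluating to its condition, which is how the macro is commonly defined (see the warn_slowpath() backend added in kernel/panic.c below):

#include <linux/kernel.h>
#include <linux/bug.h>

static void example_wake_disable(unsigned int irq, unsigned int depth)
{
        /* One call prints the message, file:line, the caller and a
         * backtrace; it evaluates to the condition, so it can guard
         * the early return directly. */
        if (WARN(depth == 0, "Unbalanced IRQ %d wake disable\n", irq))
                return;
        /* ... otherwise decrement the depth and disable wake ... */
}
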
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..38fc10ac7541 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
176 high = kallsyms_num_syms; 176 high = kallsyms_num_syms;
177 177
178 while (high - low > 1) { 178 while (high - low > 1) {
179 mid = (low + high) / 2; 179 mid = low + (high - low) / 2;
180 if (kallsyms_addresses[mid] <= addr) 180 if (kallsyms_addresses[mid] <= addr)
181 low = mid; 181 low = mid;
182 else 182 else
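
The binary-search midpoint becomes low + (high - low) / 2, the overflow-safe idiom: (low + high) can wrap when both bounds are large, while the difference form cannot. Symbol counts never approach that range, so this is mostly a robustness/idiom fix, but the difference is easy to demonstrate in plain C:

#include <stdio.h>

int main(void)
{
        unsigned int low = 3000000000u, high = 4000000000u;

        unsigned int bad  = (low + high) / 2;       /* sum wraps past UINT_MAX */
        unsigned int good = low + (high - low) / 2; /* stays in [low, high]    */

        printf("bad=%u good=%u\n", bad, good);      /* bad ends up below low */
        return 0;
}
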
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2989f67c4446..2456d1a0befb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
352 * @path: path to usermode executable 352 * @path: path to usermode executable
353 * @argv: arg vector for process 353 * @argv: arg vector for process
354 * @envp: environment for process 354 * @envp: environment for process
355 * @gfp_mask: gfp mask for memory allocation
355 * 356 *
356 * Returns either %NULL on allocation failure, or a subprocess_info 357 * Returns either %NULL on allocation failure, or a subprocess_info
357 * structure. This should be passed to call_usermodehelper_exec to 358 * structure. This should be passed to call_usermodehelper_exec to
358 * exec the process and free the structure. 359 * exec the process and free the structure.
359 */ 360 */
360struct subprocess_info *call_usermodehelper_setup(char *path, 361struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
361 char **argv, char **envp) 362 char **envp, gfp_t gfp_mask)
362{ 363{
363 struct subprocess_info *sub_info; 364 struct subprocess_info *sub_info;
364 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 365 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
365 if (!sub_info) 366 if (!sub_info)
366 goto out; 367 goto out;
367 368
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
494 struct subprocess_info *sub_info; 495 struct subprocess_info *sub_info;
495 int ret; 496 int ret;
496 497
497 sub_info = call_usermodehelper_setup(path, argv, envp); 498 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
498 if (sub_info == NULL) 499 if (sub_info == NULL)
499 return -ENOMEM; 500 return -ENOMEM;
500 501
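
call_usermodehelper_setup() now takes a gfp_t, so process-context callers can allocate with GFP_KERNEL instead of the old unconditional GFP_ATOMIC. A hedged caller sketch; the helper path and arguments are invented for illustration:

#include <linux/kmod.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int example_run_helper(void)
{
        char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };   /* illustrative */
        char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
        struct subprocess_info *info;

        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
        if (!info)
                return -ENOMEM;

        /* hands 'info' off (and frees it), waiting for the helper to exit */
        return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}
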
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e00..75bc2cd9ebc6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) 62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
63#endif 63#endif
64 64
65static int kprobes_initialized;
65static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 67static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
67 68
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69static bool kprobe_enabled; 70static bool kprobe_enabled;
70 71
71DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct {
75 spinlock_t lock ____cacheline_aligned;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
79{
80 return &(kretprobe_table_locks[hash].lock);
81}
74 82
75/* 83/*
76 * Normally, functions that we'd want to prohibit kprobes in, are marked 84 * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
368 return; 376 return;
369} 377}
370 378
371/* Called with kretprobe_lock held */
372void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 379void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
373 struct hlist_head *head) 380 struct hlist_head *head)
374{ 381{
382 struct kretprobe *rp = ri->rp;
383
375 /* remove rp inst off the rprobe_inst_table */ 384 /* remove rp inst off the rprobe_inst_table */
376 hlist_del(&ri->hlist); 385 hlist_del(&ri->hlist);
377 if (ri->rp) { 386 INIT_HLIST_NODE(&ri->hlist);
378 /* remove rp inst off the used list */ 387 if (likely(rp)) {
379 hlist_del(&ri->uflist); 388 spin_lock(&rp->lock);
380 /* put rp inst back onto the free list */ 389 hlist_add_head(&ri->hlist, &rp->free_instances);
381 INIT_HLIST_NODE(&ri->uflist); 390 spin_unlock(&rp->lock);
382 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
383 } else 391 } else
384 /* Unregistering */ 392 /* Unregistering */
385 hlist_add_head(&ri->hlist, head); 393 hlist_add_head(&ri->hlist, head);
386} 394}
387 395
388struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 396void kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags)
389{ 398{
390 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
400 spinlock_t *hlist_lock;
401
402 *head = &kretprobe_inst_table[hash];
403 hlist_lock = kretprobe_table_lock_ptr(hash);
404 spin_lock_irqsave(hlist_lock, *flags);
405}
406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags);
411}
412
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
414{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock;
417
418 hlist_lock = kretprobe_table_lock_ptr(hash);
419 spin_unlock_irqrestore(hlist_lock, *flags);
420}
421
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags);
391} 426}
392 427
393/* 428/*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
401 struct kretprobe_instance *ri; 436 struct kretprobe_instance *ri;
402 struct hlist_head *head, empty_rp; 437 struct hlist_head *head, empty_rp;
403 struct hlist_node *node, *tmp; 438 struct hlist_node *node, *tmp;
404 unsigned long flags = 0; 439 unsigned long hash, flags = 0;
405 440
406 INIT_HLIST_HEAD(&empty_rp); 441 if (unlikely(!kprobes_initialized))
407 spin_lock_irqsave(&kretprobe_lock, flags); 442 /* Early boot. kretprobe_table_locks not yet initialized. */
408 head = kretprobe_inst_table_head(tk); 443 return;
444
445 hash = hash_ptr(tk, KPROBE_HASH_BITS);
446 head = &kretprobe_inst_table[hash];
447 kretprobe_table_lock(hash, &flags);
409 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 448 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
410 if (ri->task == tk) 449 if (ri->task == tk)
411 recycle_rp_inst(ri, &empty_rp); 450 recycle_rp_inst(ri, &empty_rp);
412 } 451 }
413 spin_unlock_irqrestore(&kretprobe_lock, flags); 452 kretprobe_table_unlock(hash, &flags);
414 453 INIT_HLIST_HEAD(&empty_rp);
415 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 454 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
416 hlist_del(&ri->hlist); 455 hlist_del(&ri->hlist);
417 kfree(ri); 456 kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
423 struct kretprobe_instance *ri; 462 struct kretprobe_instance *ri;
424 struct hlist_node *pos, *next; 463 struct hlist_node *pos, *next;
425 464
426 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { 465 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
427 hlist_del(&ri->uflist); 466 hlist_del(&ri->hlist);
428 kfree(ri); 467 kfree(ri);
429 } 468 }
430} 469}
431 470
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 471static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{ 472{
434 unsigned long flags; 473 unsigned long flags, hash;
435 struct kretprobe_instance *ri; 474 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next; 475 struct hlist_node *pos, *next;
476 struct hlist_head *head;
477
437 /* No race here */ 478 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags); 479 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 480 kretprobe_table_lock(hash, &flags);
440 ri->rp = NULL; 481 head = &kretprobe_inst_table[hash];
441 hlist_del(&ri->uflist); 482 hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
483 if (ri->rp == rp)
484 ri->rp = NULL;
485 }
486 kretprobe_table_unlock(hash, &flags);
442 } 487 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp); 488 free_rp_inst(rp);
445} 489}
446 490
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
831 struct pt_regs *regs) 875 struct pt_regs *regs)
832{ 876{
833 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 877 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
834 unsigned long flags = 0; 878 unsigned long hash, flags = 0;
879 struct kretprobe_instance *ri;
835 880
836 /*TODO: consider to only swap the RA after the last pre_handler fired */ 881 /*TODO: consider to only swap the RA after the last pre_handler fired */
837 spin_lock_irqsave(&kretprobe_lock, flags); 882 hash = hash_ptr(current, KPROBE_HASH_BITS);
883 spin_lock_irqsave(&rp->lock, flags);
838 if (!hlist_empty(&rp->free_instances)) { 884 if (!hlist_empty(&rp->free_instances)) {
839 struct kretprobe_instance *ri;
840
841 ri = hlist_entry(rp->free_instances.first, 885 ri = hlist_entry(rp->free_instances.first,
842 struct kretprobe_instance, uflist); 886 struct kretprobe_instance, hlist);
887 hlist_del(&ri->hlist);
888 spin_unlock_irqrestore(&rp->lock, flags);
889
843 ri->rp = rp; 890 ri->rp = rp;
844 ri->task = current; 891 ri->task = current;
845 892
846 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 893 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
847 spin_unlock_irqrestore(&kretprobe_lock, flags); 894 spin_unlock_irqrestore(&rp->lock, flags);
848 return 0; 895 return 0;
849 } 896 }
850 897
851 arch_prepare_kretprobe(ri, regs); 898 arch_prepare_kretprobe(ri, regs);
852 899
853 /* XXX(hch): why is there no hlist_move_head? */ 900 /* XXX(hch): why is there no hlist_move_head? */
854 hlist_del(&ri->uflist); 901 INIT_HLIST_NODE(&ri->hlist);
855 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 902 kretprobe_table_lock(hash, &flags);
856 hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); 903 hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
857 } else 904 kretprobe_table_unlock(hash, &flags);
905 } else {
858 rp->nmissed++; 906 rp->nmissed++;
859 spin_unlock_irqrestore(&kretprobe_lock, flags); 907 spin_unlock_irqrestore(&rp->lock, flags);
908 }
860 return 0; 909 return 0;
861} 910}
862 911
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
892 rp->maxactive = NR_CPUS; 941 rp->maxactive = NR_CPUS;
893#endif 942#endif
894 } 943 }
895 INIT_HLIST_HEAD(&rp->used_instances); 944 spin_lock_init(&rp->lock);
896 INIT_HLIST_HEAD(&rp->free_instances); 945 INIT_HLIST_HEAD(&rp->free_instances);
897 for (i = 0; i < rp->maxactive; i++) { 946 for (i = 0; i < rp->maxactive; i++) {
898 inst = kmalloc(sizeof(struct kretprobe_instance) + 947 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
901 free_rp_inst(rp); 950 free_rp_inst(rp);
902 return -ENOMEM; 951 return -ENOMEM;
903 } 952 }
904 INIT_HLIST_NODE(&inst->uflist); 953 INIT_HLIST_NODE(&inst->hlist);
905 hlist_add_head(&inst->uflist, &rp->free_instances); 954 hlist_add_head(&inst->hlist, &rp->free_instances);
906 } 955 }
907 956
908 rp->nmissed = 0; 957 rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
1009 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1058 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1010 INIT_HLIST_HEAD(&kprobe_table[i]); 1059 INIT_HLIST_HEAD(&kprobe_table[i]);
1011 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1060 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1061 spin_lock_init(&(kretprobe_table_locks[i].lock));
1012 } 1062 }
1013 1063
1014 /* 1064 /*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
1050 err = arch_init_kprobes(); 1100 err = arch_init_kprobes();
1051 if (!err) 1101 if (!err)
1052 err = register_die_notifier(&kprobe_exceptions_nb); 1102 err = register_die_notifier(&kprobe_exceptions_nb);
1103 kprobes_initialized = (err == 0);
1053 1104
1054 if (!err) 1105 if (!err)
1055 init_test_probes(); 1106 init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
1286EXPORT_SYMBOL_GPL(unregister_jprobe); 1337EXPORT_SYMBOL_GPL(unregister_jprobe);
1287EXPORT_SYMBOL_GPL(register_jprobes); 1338EXPORT_SYMBOL_GPL(register_jprobes);
1288EXPORT_SYMBOL_GPL(unregister_jprobes); 1339EXPORT_SYMBOL_GPL(unregister_jprobes);
1289#ifdef CONFIG_KPROBES
1290EXPORT_SYMBOL_GPL(jprobe_return); 1340EXPORT_SYMBOL_GPL(jprobe_return);
1291#endif
1292
1293#ifdef CONFIG_KPROBES
1294EXPORT_SYMBOL_GPL(register_kretprobe); 1341EXPORT_SYMBOL_GPL(register_kretprobe);
1295EXPORT_SYMBOL_GPL(unregister_kretprobe); 1342EXPORT_SYMBOL_GPL(unregister_kretprobe);
1296EXPORT_SYMBOL_GPL(register_kretprobes); 1343EXPORT_SYMBOL_GPL(register_kretprobes);
1297EXPORT_SYMBOL_GPL(unregister_kretprobes); 1344EXPORT_SYMBOL_GPL(unregister_kretprobes);
1298#endif
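
The kretprobe rework replaces the single global kretprobe_lock with a per-kretprobe spinlock for the free list plus hashed kretprobe_table_locks for the per-task instance table, so return probes firing on different tasks no longer serialize on one lock. The registration interface is unchanged; a minimal module-style example, with the probed symbol chosen only for illustration and regs_return_value() assumed to be provided by the architecture:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_ret_handler(struct kretprobe_instance *ri,
                               struct pt_regs *regs)
{
        /* report the probed function's return value */
        printk(KERN_INFO "do_fork returned %ld\n",
               (long)regs_return_value(regs));
        return 0;
}

static struct kretprobe example_krp = {
        .handler        = example_ret_handler,
        .kp.symbol_name = "do_fork",    /* illustrative target */
        .maxactive      = 16,           /* instances pre-allocated on rp->free_instances */
};

static int __init example_init(void)
{
        return register_kretprobe(&example_krp);
}

static void __exit example_exit(void)
{
        unregister_kretprobe(&example_krp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
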
diff --git a/kernel/marker.c b/kernel/marker.c
index 1abfb923b761..971da5317903 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -441,7 +441,7 @@ static int remove_marker(const char *name)
441 hlist_del(&e->hlist); 441 hlist_del(&e->hlist);
442 /* Make sure the call_rcu has been executed */ 442 /* Make sure the call_rcu has been executed */
443 if (e->rcu_pending) 443 if (e->rcu_pending)
444 rcu_barrier(); 444 rcu_barrier_sched();
445 kfree(e); 445 kfree(e);
446 return 0; 446 return 0;
447} 447}
@@ -476,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
476 hlist_del(&(*entry)->hlist); 476 hlist_del(&(*entry)->hlist);
477 /* Make sure the call_rcu has been executed */ 477 /* Make sure the call_rcu has been executed */
478 if ((*entry)->rcu_pending) 478 if ((*entry)->rcu_pending)
479 rcu_barrier(); 479 rcu_barrier_sched();
480 kfree(*entry); 480 kfree(*entry);
481 *entry = e; 481 *entry = e;
482 trace_mark(core_marker_format, "name %s format %s", 482 trace_mark(core_marker_format, "name %s format %s",
@@ -655,7 +655,7 @@ int marker_probe_register(const char *name, const char *format,
655 * make sure it's executed now. 655 * make sure it's executed now.
656 */ 656 */
657 if (entry->rcu_pending) 657 if (entry->rcu_pending)
658 rcu_barrier(); 658 rcu_barrier_sched();
659 old = marker_entry_add_probe(entry, probe, probe_private); 659 old = marker_entry_add_probe(entry, probe, probe_private);
660 if (IS_ERR(old)) { 660 if (IS_ERR(old)) {
661 ret = PTR_ERR(old); 661 ret = PTR_ERR(old);
@@ -670,10 +670,7 @@ int marker_probe_register(const char *name, const char *format,
670 entry->rcu_pending = 1; 670 entry->rcu_pending = 1;
671 /* write rcu_pending before calling the RCU callback */ 671 /* write rcu_pending before calling the RCU callback */
672 smp_wmb(); 672 smp_wmb();
673#ifdef CONFIG_PREEMPT_RCU 673 call_rcu_sched(&entry->rcu, free_old_closure);
674 synchronize_sched(); /* Until we have the call_rcu_sched() */
675#endif
676 call_rcu(&entry->rcu, free_old_closure);
677end: 674end:
678 mutex_unlock(&markers_mutex); 675 mutex_unlock(&markers_mutex);
679 return ret; 676 return ret;
@@ -704,7 +701,7 @@ int marker_probe_unregister(const char *name,
704 if (!entry) 701 if (!entry)
705 goto end; 702 goto end;
706 if (entry->rcu_pending) 703 if (entry->rcu_pending)
707 rcu_barrier(); 704 rcu_barrier_sched();
708 old = marker_entry_remove_probe(entry, probe, probe_private); 705 old = marker_entry_remove_probe(entry, probe, probe_private);
709 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
710 marker_update_probes(); /* may update entry */ 707 marker_update_probes(); /* may update entry */
@@ -716,10 +713,7 @@ int marker_probe_unregister(const char *name,
716 entry->rcu_pending = 1; 713 entry->rcu_pending = 1;
717 /* write rcu_pending before calling the RCU callback */ 714 /* write rcu_pending before calling the RCU callback */
718 smp_wmb(); 715 smp_wmb();
719#ifdef CONFIG_PREEMPT_RCU 716 call_rcu_sched(&entry->rcu, free_old_closure);
720 synchronize_sched(); /* Until we have the call_rcu_sched() */
721#endif
722 call_rcu(&entry->rcu, free_old_closure);
723 remove_marker(name); /* Ignore busy error message */ 717 remove_marker(name); /* Ignore busy error message */
724 ret = 0; 718 ret = 0;
725end: 719end:
@@ -786,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
786 goto end; 780 goto end;
787 } 781 }
788 if (entry->rcu_pending) 782 if (entry->rcu_pending)
789 rcu_barrier(); 783 rcu_barrier_sched();
790 old = marker_entry_remove_probe(entry, NULL, probe_private); 784 old = marker_entry_remove_probe(entry, NULL, probe_private);
791 mutex_unlock(&markers_mutex); 785 mutex_unlock(&markers_mutex);
792 marker_update_probes(); /* may update entry */ 786 marker_update_probes(); /* may update entry */
@@ -797,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
797 entry->rcu_pending = 1; 791 entry->rcu_pending = 1;
798 /* write rcu_pending before calling the RCU callback */ 792 /* write rcu_pending before calling the RCU callback */
799 smp_wmb(); 793 smp_wmb();
800#ifdef CONFIG_PREEMPT_RCU 794 call_rcu_sched(&entry->rcu, free_old_closure);
801 synchronize_sched(); /* Until we have the call_rcu_sched() */
802#endif
803 call_rcu(&entry->rcu, free_old_closure);
804 remove_marker(entry->name); /* Ignore busy error message */ 795 remove_marker(entry->name); /* Ignore busy error message */
805end: 796end:
806 mutex_unlock(&markers_mutex); 797 mutex_unlock(&markers_mutex);
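
Marker probes run with preemption disabled, so the code now uses the sched flavour of RCU (call_rcu_sched()/rcu_barrier_sched()) and drops the CONFIG_PREEMPT_RCU workaround of calling synchronize_sched() before call_rcu(). A generic sketch of the pattern, not specific to markers:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_entry {
        struct rcu_head rcu;
        /* ... payload ... */
};

static void example_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct example_entry, rcu));
}

static void example_retire(struct example_entry *e)
{
        /*
         * Readers of 'e' run with preemption disabled, so a sched-RCU
         * grace period is what must elapse before freeing it.
         */
        call_rcu_sched(&e->rcu, example_free_rcu);
}
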
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/proc_fs.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/nsproxy.h> 12#include <linux/nsproxy.h>
12 13
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
24 struct ns_cgroup, css); 25 struct ns_cgroup, css);
25} 26}
26 27
27int ns_cgroup_clone(struct task_struct *task) 28int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{ 29{
29 return cgroup_clone(task, &ns_subsys); 30 char name[PROC_NUMBUF];
31
32 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
33 return cgroup_clone(task, &ns_subsys, name);
30} 34}
31 35
32/* 36/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..21575fc46d05 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
157 goto out; 157 goto out;
158 } 158 }
159 159
160 err = ns_cgroup_clone(tsk);
161 if (err) {
162 put_nsproxy(new_ns);
163 goto out;
164 }
165
166 tsk->nsproxy = new_ns; 160 tsk->nsproxy = new_ns;
167 161
168out: 162out:
@@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
209 goto out; 203 goto out;
210 } 204 }
211 205
212 err = ns_cgroup_clone(current); 206 err = ns_cgroup_clone(current, task_pid(current));
213 if (err) 207 if (err)
214 put_nsproxy(*new_nsp); 208 put_nsproxy(*new_nsp);
215 209
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..12c5a0a6c89b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line)
318 add_taint(TAINT_WARN); 318 add_taint(TAINT_WARN);
319} 319}
320EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
321
322
323void warn_slowpath(const char *file, int line, const char *fmt, ...)
324{
325 va_list args;
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long)__builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 va_start(args, fmt);
334 vprintk(fmt, args);
335 va_end(args);
336
337 print_modules();
338 dump_stack();
339 print_oops_end_marker();
340 add_taint(TAINT_WARN);
341}
342EXPORT_SYMBOL(warn_slowpath);
321#endif 343#endif
322 344
323#ifdef CONFIG_CC_STACKPROTECTOR 345#ifdef CONFIG_CC_STACKPROTECTOR
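
warn_slowpath() is the printf-capable backend behind WARN(condition, fmt, ...): it prints the cut-here banner, file/line and calling function, the formatted message, then modules, a stack trace and the oops end marker, and adds TAINT_WARN. An approximate sketch of how the generic macro plumbs into it; the real definition lives in include/asm-generic/bug.h and may differ in detail:

#include <linux/compiler.h>

/* Illustrative shape only -- not the authoritative definition. */
#define EXAMPLE_WARN(condition, format...) ({                   \
        int __ret_warn_on = !!(condition);                      \
        if (unlikely(__ret_warn_on))                            \
                warn_slowpath(__FILE__, __LINE__, format);      \
        unlikely(__ret_warn_on);                                \
})
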
diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4b2ac7..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
309} 309}
310EXPORT_SYMBOL_GPL(find_vpid); 310EXPORT_SYMBOL_GPL(find_vpid);
311 311
312struct pid *find_pid(int nr)
313{
314 return find_pid_ns(nr, &init_pid_ns);
315}
316EXPORT_SYMBOL_GPL(find_pid);
317
318/* 312/*
319 * attach_pid() must be called with the tasklist_lock write-held. 313 * attach_pid() must be called with the tasklist_lock write-held.
320 */ 314 */
@@ -435,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
435 429
436 return pid; 430 return pid;
437} 431}
432EXPORT_SYMBOL_GPL(find_get_pid);
438 433
439pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) 434pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
440{ 435{
@@ -482,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
482/* 477/*
483 * Used by proc to find the first pid that is greater then or equal to nr. 478 * Used by proc to find the first pid that is greater then or equal to nr.
484 * 479 *
485 * If there is a pid at nr this function is exactly the same as find_pid. 480 * If there is a pid at nr this function is exactly the same as find_pid_ns.
486 */ 481 */
487struct pid *find_ge_pid(int nr, struct pid_namespace *ns) 482struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
488{ 483{
@@ -497,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
497 492
498 return pid; 493 return pid;
499} 494}
500EXPORT_SYMBOL_GPL(find_get_pid);
501 495
502/* 496/*
503 * The pid hash table is scaled according to the amount of memory in the 497 * The pid hash table is scaled according to the amount of memory in the
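
find_pid() disappears in favour of the namespace-aware find_vpid()/find_pid_ns(), and the stray EXPORT_SYMBOL_GPL(find_get_pid) moves next to its function. A sketch of the replacement pattern for old find_pid() users that want a task reference:

#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>

static struct task_struct *example_find_task(pid_t nr)
{
        struct task_struct *task = NULL;
        struct pid *pid;

        rcu_read_lock();
        pid = find_vpid(nr);            /* nr is interpreted in current's pid namespace */
        if (pid)
                task = pid_task(pid, PIDTYPE_PID);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        return task;                    /* caller drops it with put_task_struct() */
}
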
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..ea567b78d1aa 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h>
15 16
16#define BITS_PER_PAGE (PAGE_SIZE*8) 17#define BITS_PER_PAGE (PAGE_SIZE*8)
17 18
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
71 struct pid_namespace *ns; 72 struct pid_namespace *ns;
72 int i; 73 int i;
73 74
74 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); 75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
75 if (ns == NULL) 76 if (ns == NULL)
76 goto out; 77 goto out;
77 78
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
84 goto out_free_map; 85 goto out_free_map;
85 86
86 kref_init(&ns->kref); 87 kref_init(&ns->kref);
87 ns->last_pid = 0;
88 ns->child_reaper = NULL;
89 ns->level = level; 88 ns->level = level;
90 89
91 set_bit(0, ns->pidmap[0].page); 90 set_bit(0, ns->pidmap[0].page);
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 92
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 93 for (i = 1; i < PIDMAP_ENTRIES; i++)
95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 94 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 }
98 95
99 return ns; 96 return ns;
100 97
@@ -185,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
185 182
186 /* Child reaper for the pid namespace is going away */ 183 /* Child reaper for the pid namespace is going away */
187 pid_ns->child_reaper = NULL; 184 pid_ns->child_reaper = NULL;
185 acct_exit_ns(pid_ns);
188 return; 186 return;
189} 187}
190 188
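
create_pid_namespace() switches to kmem_cache_zalloc(), so the fresh namespace is already zero-filled and the explicit last_pid, child_reaper and pidmap page resets can go. The same simplification in generic form (example_cache is assumed to be created elsewhere):

#include <linux/slab.h>

struct example_obj {
        int count;
        void *page;
};

static struct kmem_cache *example_cache;        /* assumed: created at init time */

static struct example_obj *example_alloc(void)
{
        /* zalloc hands back zeroed memory: no field-by-field clearing needed */
        return kmem_cache_zalloc(example_cache, GFP_KERNEL);
}
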
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..9a21681aa80f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
449 spin_unlock_irqrestore(&idr_lock, flags); 449 spin_unlock_irqrestore(&idr_lock, flags);
450 } 450 }
451 sigqueue_free(tmr->sigq); 451 sigqueue_free(tmr->sigq);
452 if (unlikely(tmr->it_process) &&
453 tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
454 put_task_struct(tmr->it_process);
455 kmem_cache_free(posix_timers_cache, tmr); 452 kmem_cache_free(posix_timers_cache, tmr);
456} 453}
457 454
@@ -856,11 +853,10 @@ retry_delete:
856 * This keeps any tasks waiting on the spin lock from thinking 853 * This keeps any tasks waiting on the spin lock from thinking
857 * they got something (see the lock code above). 854 * they got something (see the lock code above).
858 */ 855 */
859 if (timer->it_process) { 856 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
860 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 857 put_task_struct(timer->it_process);
861 put_task_struct(timer->it_process); 858 timer->it_process = NULL;
862 timer->it_process = NULL; 859
863 }
864 unlock_timer(timer, flags); 860 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 861 release_posix_timer(timer, IT_ID_SET);
866 return 0; 862 return 0;
@@ -885,11 +881,10 @@ retry_delete:
885 * This keeps any tasks waiting on the spin lock from thinking 881 * This keeps any tasks waiting on the spin lock from thinking
886 * they got something (see the lock code above). 882 * they got something (see the lock code above).
887 */ 883 */
888 if (timer->it_process) { 884 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
889 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 885 put_task_struct(timer->it_process);
890 put_task_struct(timer->it_process); 886 timer->it_process = NULL;
891 timer->it_process = NULL; 887
892 }
893 unlock_timer(timer, flags); 888 unlock_timer(timer, flags);
894 release_posix_timer(timer, IT_ID_SET); 889 release_posix_timer(timer, IT_ID_SET);
895} 890}
diff --git a/kernel/printk.c b/kernel/printk.c
index 3f7a2a94583b..a7f7559c5f6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1308,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1308} 1308}
1309 1309
1310#if defined CONFIG_PRINTK 1310#if defined CONFIG_PRINTK
1311
1312DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1311/* 1313/*
1312 * printk rate limiting, lifted from the networking subsystem. 1314 * printk rate limiting, lifted from the networking subsystem.
1313 * 1315 *
@@ -1315,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1315 * every printk_ratelimit_jiffies to make a denial-of-service 1317 * every printk_ratelimit_jiffies to make a denial-of-service
1316 * attack impossible. 1318 * attack impossible.
1317 */ 1319 */
1318int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1319{
1320 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1321}
1322EXPORT_SYMBOL(__printk_ratelimit);
1323
1324/* minimum time in jiffies between messages */
1325int printk_ratelimit_jiffies = 5 * HZ;
1326
1327/* number of messages we send before ratelimiting */
1328int printk_ratelimit_burst = 10;
1329
1330int printk_ratelimit(void) 1320int printk_ratelimit(void)
1331{ 1321{
1332 return __printk_ratelimit(printk_ratelimit_jiffies, 1322 return __ratelimit(&printk_ratelimit_state);
1333 printk_ratelimit_burst);
1334} 1323}
1335EXPORT_SYMBOL(printk_ratelimit); 1324EXPORT_SYMBOL(printk_ratelimit);
1336 1325
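
printk_ratelimit() is now a thin wrapper over the shared ratelimit code: the interval and burst live in a DEFINE_RATELIMIT_STATE and __ratelimit() does the bookkeeping. The same two calls give any subsystem its own limiter; a sketch with made-up numbers:

#include <linux/kernel.h>
#include <linux/ratelimit.h>

/* allow at most 5 messages per 10-second window */
static DEFINE_RATELIMIT_STATE(example_rs, 10 * HZ, 5);

static void example_report_error(int err)
{
        if (__ratelimit(&example_rs))
                printk(KERN_WARNING "example: I/O error %d\n", err);
}
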
diff --git a/kernel/profile.c b/kernel/profile.c
index 58926411eb2a..cd26bed4cc26 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
112 112
113/* Profile event notifications */ 113/* Profile event notifications */
114 114
115#ifdef CONFIG_PROFILING
116
117static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 115static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
118static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 116static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
119static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 117static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
203} 201}
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 202EXPORT_SYMBOL_GPL(unregister_timer_hook);
205 203
206#endif /* CONFIG_PROFILING */
207
208 204
209#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
210/* 206/*
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/res_counter.h> 14#include <linux/res_counter.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h>
16 17
17void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter)
18{ 19{
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
102 return *res_counter_member(counter, member); 103 return *res_counter_member(counter, member);
103} 104}
104 105
105ssize_t res_counter_write(struct res_counter *counter, int member, 106int res_counter_memparse_write_strategy(const char *buf,
106 const char __user *userbuf, size_t nbytes, loff_t *pos, 107 unsigned long long *res)
107 int (*write_strategy)(char *st_buf, unsigned long long *val))
108{ 108{
109 int ret; 109 char *end;
110 char *buf, *end; 110 /* FIXME - make memparse() take const char* args */
111 unsigned long flags; 111 *res = memparse((char *)buf, &end);
112 unsigned long long tmp, *val; 112 if (*end != '\0')
113 113 return -EINVAL;
114 buf = kmalloc(nbytes + 1, GFP_KERNEL);
115 ret = -ENOMEM;
116 if (buf == NULL)
117 goto out;
118 114
119 buf[nbytes] = '\0'; 115 *res = PAGE_ALIGN(*res);
120 ret = -EFAULT; 116 return 0;
121 if (copy_from_user(buf, userbuf, nbytes)) 117}
122 goto out_free;
123 118
124 ret = -EINVAL; 119int res_counter_write(struct res_counter *counter, int member,
120 const char *buf, write_strategy_fn write_strategy)
121{
122 char *end;
123 unsigned long flags;
124 unsigned long long tmp, *val;
125 125
126 strstrip(buf);
127 if (write_strategy) { 126 if (write_strategy) {
128 if (write_strategy(buf, &tmp)) { 127 if (write_strategy(buf, &tmp))
129 goto out_free; 128 return -EINVAL;
130 }
131 } else { 129 } else {
132 tmp = simple_strtoull(buf, &end, 10); 130 tmp = simple_strtoull(buf, &end, 10);
133 if (*end != '\0') 131 if (*end != '\0')
134 goto out_free; 132 return -EINVAL;
135 } 133 }
136 spin_lock_irqsave(&counter->lock, flags); 134 spin_lock_irqsave(&counter->lock, flags);
137 val = res_counter_member(counter, member); 135 val = res_counter_member(counter, member);
138 *val = tmp; 136 *val = tmp;
139 spin_unlock_irqrestore(&counter->lock, flags); 137 spin_unlock_irqrestore(&counter->lock, flags);
140 ret = nbytes; 138 return 0;
141out_free:
142 kfree(buf);
143out:
144 return ret;
145} 139}
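
res_counter_write() now takes an already-copied kernel string (the cgroup core handles the user copy and the length cap), and res_counter_memparse_write_strategy() becomes the shared parser that accepts K/M/G suffixes and page-aligns the result. A hedged sketch of a controller write handler using them; the example_* names stand in for real controller state:

#include <linux/cgroup.h>
#include <linux/res_counter.h>

static struct res_counter example_res;          /* stand-in for per-cgroup state */

static struct res_counter *example_counter(struct cgroup *cgrp)
{
        /* a real controller would container_of() its subsystem state here */
        return &example_res;
}

static int example_write_limit(struct cgroup *cgrp, struct cftype *cft,
                               const char *buf)
{
        /* accepts "4096", "16M", "1G", ...; the strategy page-aligns the value */
        return res_counter_write(example_counter(cgrp), RES_LIMIT, buf,
                                 res_counter_memparse_write_strategy);
}
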
diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d3336..0047bd9b96aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4046 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4046 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4047 else 4047 else
4048 cpustat->user = cputime64_add(cpustat->user, tmp); 4048 cpustat->user = cputime64_add(cpustat->user, tmp);
4049 /* Account for user time used */
4050 acct_update_integrals(p);
4049} 4051}
4050 4052
4051/* 4053/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..82c3545596c5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -338,13 +338,9 @@ unblock_all_signals(void)
338 spin_unlock_irqrestore(&current->sighand->siglock, flags); 338 spin_unlock_irqrestore(&current->sighand->siglock, flags);
339} 339}
340 340
341static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) 341static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
342{ 342{
343 struct sigqueue *q, *first = NULL; 343 struct sigqueue *q, *first = NULL;
344 int still_pending = 0;
345
346 if (unlikely(!sigismember(&list->signal, sig)))
347 return 0;
348 344
349 /* 345 /*
350 * Collect the siginfo appropriate to this signal. Check if 346 * Collect the siginfo appropriate to this signal. Check if
@@ -352,33 +348,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
352 */ 348 */
353 list_for_each_entry(q, &list->list, list) { 349 list_for_each_entry(q, &list->list, list) {
354 if (q->info.si_signo == sig) { 350 if (q->info.si_signo == sig) {
355 if (first) { 351 if (first)
356 still_pending = 1; 352 goto still_pending;
357 break;
358 }
359 first = q; 353 first = q;
360 } 354 }
361 } 355 }
356
357 sigdelset(&list->signal, sig);
358
362 if (first) { 359 if (first) {
360still_pending:
363 list_del_init(&first->list); 361 list_del_init(&first->list);
364 copy_siginfo(info, &first->info); 362 copy_siginfo(info, &first->info);
365 __sigqueue_free(first); 363 __sigqueue_free(first);
366 if (!still_pending)
367 sigdelset(&list->signal, sig);
368 } else { 364 } else {
369
370 /* Ok, it wasn't in the queue. This must be 365 /* Ok, it wasn't in the queue. This must be
371 a fast-pathed signal or we must have been 366 a fast-pathed signal or we must have been
372 out of queue space. So zero out the info. 367 out of queue space. So zero out the info.
373 */ 368 */
374 sigdelset(&list->signal, sig);
375 info->si_signo = sig; 369 info->si_signo = sig;
376 info->si_errno = 0; 370 info->si_errno = 0;
377 info->si_code = 0; 371 info->si_code = 0;
378 info->si_pid = 0; 372 info->si_pid = 0;
379 info->si_uid = 0; 373 info->si_uid = 0;
380 } 374 }
381 return 1;
382} 375}
383 376
384static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 377static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +389,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
396 } 389 }
397 } 390 }
398 391
399 if (!collect_signal(sig, pending, info)) 392 collect_signal(sig, pending, info);
400 sig = 0;
401 } 393 }
402 394
403 return sig; 395 return sig;
@@ -462,8 +454,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
462 * is to alert stop-signal processing code when another 454 * is to alert stop-signal processing code when another
463 * processor has come along and cleared the flag. 455 * processor has come along and cleared the flag.
464 */ 456 */
465 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 457 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
466 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
467 } 458 }
468 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 459 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
469 /* 460 /*
@@ -1125,7 +1116,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1125 * is probably wrong. Should make it like BSD or SYSV. 1116 * is probably wrong. Should make it like BSD or SYSV.
1126 */ 1117 */
1127 1118
1128static int kill_something_info(int sig, struct siginfo *info, int pid) 1119static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1129{ 1120{
1130 int ret; 1121 int ret;
1131 1122
@@ -1237,17 +1228,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
1237} 1228}
1238EXPORT_SYMBOL(kill_pid); 1229EXPORT_SYMBOL(kill_pid);
1239 1230
1240int
1241kill_proc(pid_t pid, int sig, int priv)
1242{
1243 int ret;
1244
1245 rcu_read_lock();
1246 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1247 rcu_read_unlock();
1248 return ret;
1249}
1250
1251/* 1231/*
1252 * These functions support sending signals using preallocated sigqueue 1232 * These functions support sending signals using preallocated sigqueue
1253 * structures. This is needed "because realtime applications cannot 1233 * structures. This is needed "because realtime applications cannot
@@ -1379,10 +1359,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1379 1359
1380 info.si_uid = tsk->uid; 1360 info.si_uid = tsk->uid;
1381 1361
1382 /* FIXME: find out whether or not this is supposed to be c*time. */ 1362 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1383 info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
1384 tsk->signal->utime)); 1363 tsk->signal->utime));
1385 info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, 1364 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1386 tsk->signal->stime)); 1365 tsk->signal->stime));
1387 1366
1388 info.si_status = tsk->exit_code & 0x7f; 1367 info.si_status = tsk->exit_code & 0x7f;
@@ -1450,9 +1429,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1450 1429
1451 info.si_uid = tsk->uid; 1430 info.si_uid = tsk->uid;
1452 1431
1453 /* FIXME: find out whether or not this is supposed to be c*time. */ 1432 info.si_utime = cputime_to_clock_t(tsk->utime);
1454 info.si_utime = cputime_to_jiffies(tsk->utime); 1433 info.si_stime = cputime_to_clock_t(tsk->stime);
1455 info.si_stime = cputime_to_jiffies(tsk->stime);
1456 1434
1457 info.si_code = why; 1435 info.si_code = why;
1458 switch (why) { 1436 switch (why) {
@@ -1491,10 +1469,10 @@ static inline int may_ptrace_stop(void)
1491 * is a deadlock situation, and pointless because our tracer 1469 * is a deadlock situation, and pointless because our tracer
1492 * is dead so don't allow us to stop. 1470 * is dead so don't allow us to stop.
1493 * If SIGKILL was already sent before the caller unlocked 1471 * If SIGKILL was already sent before the caller unlocked
1494 * ->siglock we must see ->core_waiters != 0. Otherwise it 1472 * ->siglock we must see ->core_state != NULL. Otherwise it
1495 * is safe to enter schedule(). 1473 * is safe to enter schedule().
1496 */ 1474 */
1497 if (unlikely(current->mm->core_waiters) && 1475 if (unlikely(current->mm->core_state) &&
1498 unlikely(current->mm == current->parent->mm)) 1476 unlikely(current->mm == current->parent->mm))
1499 return 0; 1477 return 0;
1500 1478
@@ -1507,9 +1485,8 @@ static inline int may_ptrace_stop(void)
1507 */ 1485 */
1508static int sigkill_pending(struct task_struct *tsk) 1486static int sigkill_pending(struct task_struct *tsk)
1509{ 1487{
1510 return ((sigismember(&tsk->pending.signal, SIGKILL) || 1488 return sigismember(&tsk->pending.signal, SIGKILL) ||
1511 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && 1489 sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
1512 !unlikely(sigismember(&tsk->blocked, SIGKILL)));
1513} 1490}
1514 1491
1515/* 1492/*
@@ -1525,8 +1502,6 @@ static int sigkill_pending(struct task_struct *tsk)
1525 */ 1502 */
1526static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1503static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1527{ 1504{
1528 int killed = 0;
1529
1530 if (arch_ptrace_stop_needed(exit_code, info)) { 1505 if (arch_ptrace_stop_needed(exit_code, info)) {
1531 /* 1506 /*
1532 * The arch code has something special to do before a 1507 * The arch code has something special to do before a
@@ -1542,7 +1517,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1542 spin_unlock_irq(&current->sighand->siglock); 1517 spin_unlock_irq(&current->sighand->siglock);
1543 arch_ptrace_stop(exit_code, info); 1518 arch_ptrace_stop(exit_code, info);
1544 spin_lock_irq(&current->sighand->siglock); 1519 spin_lock_irq(&current->sighand->siglock);
1545 killed = sigkill_pending(current); 1520 if (sigkill_pending(current))
1521 return;
1546 } 1522 }
1547 1523
1548 /* 1524 /*
@@ -1559,7 +1535,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1559 __set_current_state(TASK_TRACED); 1535 __set_current_state(TASK_TRACED);
1560 spin_unlock_irq(&current->sighand->siglock); 1536 spin_unlock_irq(&current->sighand->siglock);
1561 read_lock(&tasklist_lock); 1537 read_lock(&tasklist_lock);
1562 if (!unlikely(killed) && may_ptrace_stop()) { 1538 if (may_ptrace_stop()) {
1563 do_notify_parent_cldstop(current, CLD_TRAPPED); 1539 do_notify_parent_cldstop(current, CLD_TRAPPED);
1564 read_unlock(&tasklist_lock); 1540 read_unlock(&tasklist_lock);
1565 schedule(); 1541 schedule();
@@ -1658,8 +1634,7 @@ static int do_signal_stop(int signr)
1658 } else { 1634 } else {
1659 struct task_struct *t; 1635 struct task_struct *t;
1660 1636
1661 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) 1637 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
1662 != SIGNAL_STOP_DEQUEUED) ||
1663 unlikely(signal_group_exit(sig))) 1638 unlikely(signal_group_exit(sig)))
1664 return 0; 1639 return 0;
1665 /* 1640 /*
@@ -1920,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending);
1920EXPORT_SYMBOL_GPL(dequeue_signal); 1895EXPORT_SYMBOL_GPL(dequeue_signal);
1921EXPORT_SYMBOL(flush_signals); 1896EXPORT_SYMBOL(flush_signals);
1922EXPORT_SYMBOL(force_sig); 1897EXPORT_SYMBOL(force_sig);
1923EXPORT_SYMBOL(kill_proc);
1924EXPORT_SYMBOL(ptrace_notify); 1898EXPORT_SYMBOL(ptrace_notify);
1925EXPORT_SYMBOL(send_sig); 1899EXPORT_SYMBOL(send_sig);
1926EXPORT_SYMBOL(send_sig_info); 1900EXPORT_SYMBOL(send_sig_info);
@@ -2196,7 +2170,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2196} 2170}
2197 2171
2198asmlinkage long 2172asmlinkage long
2199sys_kill(int pid, int sig) 2173sys_kill(pid_t pid, int sig)
2200{ 2174{
2201 struct siginfo info; 2175 struct siginfo info;
2202 2176
@@ -2209,7 +2183,7 @@ sys_kill(int pid, int sig)
2209 return kill_something_info(sig, &info, pid); 2183 return kill_something_info(sig, &info, pid);
2210} 2184}
2211 2185
2212static int do_tkill(int tgid, int pid, int sig) 2186static int do_tkill(pid_t tgid, pid_t pid, int sig)
2213{ 2187{
2214 int error; 2188 int error;
2215 struct siginfo info; 2189 struct siginfo info;
@@ -2255,7 +2229,7 @@ static int do_tkill(int tgid, int pid, int sig)
2255 * exists but it's not belonging to the target process anymore. This 2229 * exists but it's not belonging to the target process anymore. This
2256 * method solves the problem of threads exiting and PIDs getting reused. 2230 * method solves the problem of threads exiting and PIDs getting reused.
2257 */ 2231 */
2258asmlinkage long sys_tgkill(int tgid, int pid, int sig) 2232asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
2259{ 2233{
2260 /* This is only valid for single tasks */ 2234 /* This is only valid for single tasks */
2261 if (pid <= 0 || tgid <= 0) 2235 if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2242,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2268 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2242 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2269 */ 2243 */
2270asmlinkage long 2244asmlinkage long
2271sys_tkill(int pid, int sig) 2245sys_tkill(pid_t pid, int sig)
2272{ 2246{
2273 /* This is only valid for single tasks */ 2247 /* This is only valid for single tasks */
2274 if (pid <= 0) 2248 if (pid <= 0)
@@ -2278,7 +2252,7 @@ sys_tkill(int pid, int sig)
2278} 2252}
2279 2253
2280asmlinkage long 2254asmlinkage long
2281sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) 2255sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
2282{ 2256{
2283 siginfo_t info; 2257 siginfo_t info;
2284 2258
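
The kill/tkill/tgkill/rt_sigqueueinfo entry points above now take pid_t rather than plain int. A minimal userspace sketch of reaching one of them with pid_t arguments, assuming only the standard <sys/syscall.h> constants; the helper name is made up for illustration:

#include <signal.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* sys_tgkill(pid_t tgid, pid_t pid, int sig) after this change */
static int send_to_thread(pid_t tgid, pid_t tid, int sig)
{
	return syscall(SYS_tgkill, tgid, tid, sig);
}

int main(void)
{
	/* signal 0: existence check against our own thread-group leader */
	return send_to_thread(getpid(), getpid(), 0) ? 1 : 0;
}
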
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..0c9d3fa1f5ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1343,8 +1343,6 @@ EXPORT_SYMBOL(in_egroup_p);
1343 1343
1344DECLARE_RWSEM(uts_sem); 1344DECLARE_RWSEM(uts_sem);
1345 1345
1346EXPORT_SYMBOL(uts_sem);
1347
1348asmlinkage long sys_newuname(struct new_utsname __user * name) 1346asmlinkage long sys_newuname(struct new_utsname __user * name)
1349{ 1347{
1350 int errno = 0; 1348 int errno = 0;
@@ -1795,7 +1793,7 @@ int orderly_poweroff(bool force)
1795 goto out; 1793 goto out;
1796 } 1794 }
1797 1795
1798 info = call_usermodehelper_setup(argv[0], argv, envp); 1796 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
1799 if (info == NULL) { 1797 if (info == NULL) {
1800 argv_free(argv); 1798 argv_free(argv);
1801 goto out; 1799 goto out;
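
call_usermodehelper_setup() now takes a gfp_t, so an atomic-context caller like orderly_poweroff() can pass GFP_ATOMIC. A minimal sketch of the setup/exec pairing from ordinary process context, assuming the kmod API of this kernel series (call_usermodehelper_exec() and UMH_WAIT_EXEC); the wrapper is illustrative:

#include <linux/kmod.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int run_helper(char *path)
{
	struct subprocess_info *info;
	char *argv[] = { path, NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	/* process context may sleep, so GFP_KERNEL is fine here */
	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (info == NULL)
		return -ENOMEM;

	/* UMH_WAIT_EXEC: wait only until the helper has been exec'd */
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}
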
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bd66ac5406f3..08d6e1bb99ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -57,6 +57,7 @@ cond_syscall(compat_sys_set_robust_list);
57cond_syscall(sys_get_robust_list); 57cond_syscall(sys_get_robust_list);
58cond_syscall(compat_sys_get_robust_list); 58cond_syscall(compat_sys_get_robust_list);
59cond_syscall(sys_epoll_create); 59cond_syscall(sys_epoll_create);
60cond_syscall(sys_epoll_create1);
60cond_syscall(sys_epoll_ctl); 61cond_syscall(sys_epoll_ctl);
61cond_syscall(sys_epoll_wait); 62cond_syscall(sys_epoll_wait);
62cond_syscall(sys_epoll_pwait); 63cond_syscall(sys_epoll_pwait);
@@ -159,6 +160,7 @@ cond_syscall(sys_ioprio_get);
159cond_syscall(sys_signalfd); 160cond_syscall(sys_signalfd);
160cond_syscall(sys_signalfd4); 161cond_syscall(sys_signalfd4);
161cond_syscall(compat_sys_signalfd); 162cond_syscall(compat_sys_signalfd);
163cond_syscall(compat_sys_signalfd4);
162cond_syscall(sys_timerfd_create); 164cond_syscall(sys_timerfd_create);
163cond_syscall(sys_timerfd_settime); 165cond_syscall(sys_timerfd_settime);
164cond_syscall(sys_timerfd_gettime); 166cond_syscall(sys_timerfd_gettime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a8299d1fe59..35a50db9b6ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -624,7 +624,7 @@ static struct ctl_table kern_table[] = {
624 { 624 {
625 .ctl_name = KERN_PRINTK_RATELIMIT, 625 .ctl_name = KERN_PRINTK_RATELIMIT,
626 .procname = "printk_ratelimit", 626 .procname = "printk_ratelimit",
627 .data = &printk_ratelimit_jiffies, 627 .data = &printk_ratelimit_state.interval,
628 .maxlen = sizeof(int), 628 .maxlen = sizeof(int),
629 .mode = 0644, 629 .mode = 0644,
630 .proc_handler = &proc_dointvec_jiffies, 630 .proc_handler = &proc_dointvec_jiffies,
@@ -633,7 +633,7 @@ static struct ctl_table kern_table[] = {
633 { 633 {
634 .ctl_name = KERN_PRINTK_RATELIMIT_BURST, 634 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
635 .procname = "printk_ratelimit_burst", 635 .procname = "printk_ratelimit_burst",
636 .data = &printk_ratelimit_burst, 636 .data = &printk_ratelimit_state.burst,
637 .maxlen = sizeof(int), 637 .maxlen = sizeof(int),
638 .mode = 0644, 638 .mode = 0644,
639 .proc_handler = &proc_dointvec, 639 .proc_handler = &proc_dointvec,
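
The printk_ratelimit and printk_ratelimit_burst knobs are now the .interval and .burst fields of a shared struct ratelimit_state, which is what the two sysctl entries point into. A minimal sketch of the same structure reused by another message source, assuming the <linux/ratelimit.h> helpers from this series; the state and function names are made up:

#include <linux/ratelimit.h>
#include <linux/kernel.h>

/* one state per message source: at most 10 messages per 5-second window */
static DEFINE_RATELIMIT_STATE(my_rs, 5 * HZ, 10);

static void report_event(void)
{
	if (__ratelimit(&my_rs))
		printk(KERN_INFO "event happened\n");
}
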
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1532 sysctl_check_leaf(namespaces, table, &fail); 1532 sysctl_check_leaf(namespaces, table, &fail);
1533 } 1533 }
1534 sysctl_check_bin_path(table, &fail); 1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode");
1535 if (fail) { 1537 if (fail) {
1536 set_fail(&fail, table, NULL); 1538 set_fail(&fail, table, NULL);
1537 error = -EINVAL; 1539 error = -EINVAL;
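
sysctl_check_table() now flags any entry whose .mode carries bits outside the 0777 permission range. For reference, a minimal sketch of a leaf entry that passes the check; the table, variable, and procname are illustrative:

#include <linux/sysctl.h>

static int my_tunable;

static struct ctl_table my_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,		/* permission bits only, <= 0777 */
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};
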
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 06b17547f4e7..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
35 */ 35 */
36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
37 37
38static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 38static DEFINE_PER_CPU(__u32, taskstats_seqnum);
39static int family_registered; 39static int family_registered;
40struct kmem_cache *taskstats_cache; 40struct kmem_cache *taskstats_cache;
41 41
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 63528086337c..ce2d723c10e1 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
161 __trace_special(tr, data, 2, regs->ip, 0); 161 __trace_special(tr, data, 2, regs->ip, 0);
162 162
163 while (i < sample_max_depth) { 163 while (i < sample_max_depth) {
164 frame.next_fp = 0; 164 frame.next_fp = NULL;
165 frame.return_address = 0; 165 frame.return_address = 0;
166 if (!copy_stack_frame(fp, &frame)) 166 if (!copy_stack_frame(fp, &frame))
167 break; 167 break;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..3da47ccdc5e5 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 struct timespec uptime, ts; 30 struct timespec uptime, ts;
31 s64 ac_etime; 31 u64 ac_etime;
32 32
33 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 33 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
34 34
35 /* calculate task elapsed time in timespec */ 35 /* calculate task elapsed time in timespec */
36 do_posix_clock_monotonic_gettime(&uptime); 36 do_posix_clock_monotonic_gettime(&uptime);
37 ts = timespec_sub(uptime, tsk->start_time); 37 ts = timespec_sub(uptime, tsk->start_time);
38 /* rebase elapsed time to usec */ 38 /* rebase elapsed time to usec (should never be negative) */
39 ac_etime = timespec_to_ns(&ts); 39 ac_etime = timespec_to_ns(&ts);
40 do_div(ac_etime, NSEC_PER_USEC); 40 do_div(ac_etime, NSEC_PER_USEC);
41 stats->ac_etime = ac_etime; 41 stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
84{ 84{
85 struct mm_struct *mm; 85 struct mm_struct *mm;
86 86
87 /* convert pages-jiffies to Mbyte-usec */ 87 /* convert pages-usec to Mbyte-usec */
88 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; 88 stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
89 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; 89 stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
90 mm = get_task_mm(p); 90 mm = get_task_mm(p);
91 if (mm) { 91 if (mm) {
92 /* adjust to KB unit */ 92 /* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
118void acct_update_integrals(struct task_struct *tsk) 118void acct_update_integrals(struct task_struct *tsk)
119{ 119{
120 if (likely(tsk->mm)) { 120 if (likely(tsk->mm)) {
121 long delta = cputime_to_jiffies( 121 cputime_t time, dtime;
122 cputime_sub(tsk->stime, tsk->acct_stimexpd)); 122 struct timeval value;
123 u64 delta;
124
125 time = tsk->stime + tsk->utime;
126 dtime = cputime_sub(time, tsk->acct_timexpd);
127 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
128 delta = value.tv_sec;
129 delta = delta * USEC_PER_SEC + value.tv_usec;
123 130
124 if (delta == 0) 131 if (delta == 0)
125 return; 132 return;
126 tsk->acct_stimexpd = tsk->stime; 133 tsk->acct_timexpd = time;
127 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 134 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
128 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 135 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
129 } 136 }
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
135 */ 142 */
136void acct_clear_integrals(struct task_struct *tsk) 143void acct_clear_integrals(struct task_struct *tsk)
137{ 144{
138 tsk->acct_stimexpd = 0; 145 tsk->acct_timexpd = 0;
139 tsk->acct_rss_mem1 = 0; 146 tsk->acct_rss_mem1 = 0;
140 tsk->acct_vm_mem1 = 0; 147 tsk->acct_vm_mem1 = 0;
141} 148}
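
acct_update_integrals() now accumulates user+system cputime rebased to microseconds (instead of jiffies of system time only), so acct_rss_mem1/acct_vm_mem1 hold page-microseconds and xacct_add_tsk() no longer applies a jiffies conversion. The conversion step, restated as a stand-alone helper for clarity (the helper name is made up):

#include <linux/sched.h>
#include <linux/time.h>

static u64 cputime_delta_usecs(cputime_t now, cputime_t last)
{
	struct timeval tv;

	/* same arithmetic as the patched acct_update_integrals() */
	jiffies_to_timeval(cputime_to_jiffies(cputime_sub(now, last)), &tv);
	return (u64)tv.tv_sec * USEC_PER_SEC + tv.tv_usec;
}
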
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6fd158b21026..ec7e4f62aaff 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
125} 125}
126 126
127static void insert_work(struct cpu_workqueue_struct *cwq, 127static void insert_work(struct cpu_workqueue_struct *cwq,
128 struct work_struct *work, int tail) 128 struct work_struct *work, struct list_head *head)
129{ 129{
130 set_wq_data(work, cwq); 130 set_wq_data(work, cwq);
131 /* 131 /*
@@ -133,10 +133,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
133 * result of list_add() below, see try_to_grab_pending(). 133 * result of list_add() below, see try_to_grab_pending().
134 */ 134 */
135 smp_wmb(); 135 smp_wmb();
136 if (tail) 136 list_add_tail(&work->entry, head);
137 list_add_tail(&work->entry, &cwq->worklist);
138 else
139 list_add(&work->entry, &cwq->worklist);
140 wake_up(&cwq->more_work); 137 wake_up(&cwq->more_work);
141} 138}
142 139
@@ -146,7 +143,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
146 unsigned long flags; 143 unsigned long flags;
147 144
148 spin_lock_irqsave(&cwq->lock, flags); 145 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, 1); 146 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 147 spin_unlock_irqrestore(&cwq->lock, flags);
151} 148}
152 149
@@ -162,14 +159,11 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
162 */ 159 */
163int queue_work(struct workqueue_struct *wq, struct work_struct *work) 160int queue_work(struct workqueue_struct *wq, struct work_struct *work)
164{ 161{
165 int ret = 0; 162 int ret;
163
164 ret = queue_work_on(get_cpu(), wq, work);
165 put_cpu();
166 166
167 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
168 BUG_ON(!list_empty(&work->entry));
169 __queue_work(wq_per_cpu(wq, get_cpu()), work);
170 put_cpu();
171 ret = 1;
172 }
173 return ret; 167 return ret;
174} 168}
175EXPORT_SYMBOL_GPL(queue_work); 169EXPORT_SYMBOL_GPL(queue_work);
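
queue_work() is now a thin wrapper around queue_work_on() for the local CPU. A minimal caller-side sketch of both entry points; the work item and handler names are made up:

#include <linux/workqueue.h>

static void my_handler(struct work_struct *work)
{
	/* runs later in a workqueue thread's context */
}

static DECLARE_WORK(my_work, my_handler);

static void kick_local(struct workqueue_struct *wq)
{
	/* returns 0 if my_work was already pending */
	queue_work(wq, &my_work);
}

static void kick_on(struct workqueue_struct *wq, int cpu)
{
	/* pin execution to a specific CPU's worker */
	queue_work_on(cpu, wq, &my_work);
}
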
@@ -361,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work)
361} 355}
362 356
363static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 357static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
364 struct wq_barrier *barr, int tail) 358 struct wq_barrier *barr, struct list_head *head)
365{ 359{
366 INIT_WORK(&barr->work, wq_barrier_func); 360 INIT_WORK(&barr->work, wq_barrier_func);
367 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 361 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
368 362
369 init_completion(&barr->done); 363 init_completion(&barr->done);
370 364
371 insert_work(cwq, &barr->work, tail); 365 insert_work(cwq, &barr->work, head);
372} 366}
373 367
374static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 368static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -388,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
388 active = 0; 382 active = 0;
389 spin_lock_irq(&cwq->lock); 383 spin_lock_irq(&cwq->lock);
390 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 384 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
391 insert_wq_barrier(cwq, &barr, 1); 385 insert_wq_barrier(cwq, &barr, &cwq->worklist);
392 active = 1; 386 active = 1;
393 } 387 }
394 spin_unlock_irq(&cwq->lock); 388 spin_unlock_irq(&cwq->lock);
@@ -426,6 +420,57 @@ void flush_workqueue(struct workqueue_struct *wq)
426} 420}
427EXPORT_SYMBOL_GPL(flush_workqueue); 421EXPORT_SYMBOL_GPL(flush_workqueue);
428 422
423/**
424 * flush_work - block until a work_struct's callback has terminated
425 * @work: the work which is to be flushed
426 *
427 * Returns false if @work has already terminated.
428 *
429 * It is expected that, prior to calling flush_work(), the caller has
430 * arranged for the work to not be requeued, otherwise it doesn't make
431 * sense to use this function.
432 */
433int flush_work(struct work_struct *work)
434{
435 struct cpu_workqueue_struct *cwq;
436 struct list_head *prev;
437 struct wq_barrier barr;
438
439 might_sleep();
440 cwq = get_wq_data(work);
441 if (!cwq)
442 return 0;
443
444 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
445 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
446
447 prev = NULL;
448 spin_lock_irq(&cwq->lock);
449 if (!list_empty(&work->entry)) {
450 /*
451 * See the comment near try_to_grab_pending()->smp_rmb().
452 * If it was re-queued under us we are not going to wait.
453 */
454 smp_rmb();
455 if (unlikely(cwq != get_wq_data(work)))
456 goto out;
457 prev = &work->entry;
458 } else {
459 if (cwq->current_work != work)
460 goto out;
461 prev = &cwq->worklist;
462 }
463 insert_wq_barrier(cwq, &barr, prev->next);
464out:
465 spin_unlock_irq(&cwq->lock);
466 if (!prev)
467 return 0;
468
469 wait_for_completion(&barr.done);
470 return 1;
471}
472EXPORT_SYMBOL_GPL(flush_work);
473
429/* 474/*
430 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 475 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
431 * so this work can't be re-armed in any way. 476 * so this work can't be re-armed in any way.
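
The new flush_work() waits for one specific work item rather than the whole queue, and expects the caller to have already stopped the item from being requeued. A minimal usage sketch; the device structure and flag are illustrative:

#include <linux/workqueue.h>

struct my_dev {
	int stopping;
	struct work_struct work;
};

static void my_dev_stop(struct my_dev *dev)
{
	dev->stopping = 1;	/* the handler checks this and must not requeue */
	if (!flush_work(&dev->work))
		return;		/* it had already finished (or never ran) */
}
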
@@ -473,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
473 518
474 spin_lock_irq(&cwq->lock); 519 spin_lock_irq(&cwq->lock);
475 if (unlikely(cwq->current_work == work)) { 520 if (unlikely(cwq->current_work == work)) {
476 insert_wq_barrier(cwq, &barr, 0); 521 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
477 running = 1; 522 running = 1;
478 } 523 }
479 spin_unlock_irq(&cwq->lock); 524 spin_unlock_irq(&cwq->lock);
@@ -644,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func)
644 struct work_struct *work = per_cpu_ptr(works, cpu); 689 struct work_struct *work = per_cpu_ptr(works, cpu);
645 690
646 INIT_WORK(work, func); 691 INIT_WORK(work, func);
647 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 692 schedule_work_on(cpu, work);
648 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
649 } 693 }
650 flush_workqueue(keventd_wq); 694 for_each_online_cpu(cpu)
695 flush_work(per_cpu_ptr(works, cpu));
651 put_online_cpus(); 696 put_online_cpus();
652 free_percpu(works); 697 free_percpu(works);
653 return 0; 698 return 0;
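
schedule_on_each_cpu() now queues with schedule_work_on() and waits with a per-item flush_work() instead of flushing all of keventd; from a caller's point of view nothing changes. A minimal sketch, with an illustrative handler name:

#include <linux/workqueue.h>

static void poke_cpu(struct work_struct *unused)
{
	/* runs once on every online CPU, in keventd context */
}

static int poke_all_cpus(void)
{
	return schedule_on_each_cpu(poke_cpu);	/* 0 on success */
}
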
@@ -784,7 +829,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
784 err = create_workqueue_thread(cwq, singlethread_cpu); 829 err = create_workqueue_thread(cwq, singlethread_cpu);
785 start_workqueue_thread(cwq, -1); 830 start_workqueue_thread(cwq, -1);
786 } else { 831 } else {
787 get_online_cpus(); 832 cpu_maps_update_begin();
788 spin_lock(&workqueue_lock); 833 spin_lock(&workqueue_lock);
789 list_add(&wq->list, &workqueues); 834 list_add(&wq->list, &workqueues);
790 spin_unlock(&workqueue_lock); 835 spin_unlock(&workqueue_lock);
@@ -796,7 +841,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
796 err = create_workqueue_thread(cwq, cpu); 841 err = create_workqueue_thread(cwq, cpu);
797 start_workqueue_thread(cwq, cpu); 842 start_workqueue_thread(cwq, cpu);
798 } 843 }
799 put_online_cpus(); 844 cpu_maps_update_done();
800 } 845 }
801 846
802 if (err) { 847 if (err) {
@@ -810,8 +855,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
810static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 855static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
811{ 856{
812 /* 857 /*
813 * Our caller is either destroy_workqueue() or CPU_DEAD, 858 * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
814 * get_online_cpus() protects cwq->thread. 859 * cpu_add_remove_lock protects cwq->thread.
815 */ 860 */
816 if (cwq->thread == NULL) 861 if (cwq->thread == NULL)
817 return; 862 return;
@@ -821,7 +866,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
821 866
822 flush_cpu_workqueue(cwq); 867 flush_cpu_workqueue(cwq);
823 /* 868 /*
824 * If the caller is CPU_DEAD and cwq->worklist was not empty, 869 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
825 * a concurrent flush_workqueue() can insert a barrier after us. 870 * a concurrent flush_workqueue() can insert a barrier after us.
826 * However, in that case run_workqueue() won't return and check 871 * However, in that case run_workqueue() won't return and check
827 * kthread_should_stop() until it flushes all work_struct's. 872 * kthread_should_stop() until it flushes all work_struct's.
@@ -845,14 +890,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
845 const cpumask_t *cpu_map = wq_cpu_map(wq); 890 const cpumask_t *cpu_map = wq_cpu_map(wq);
846 int cpu; 891 int cpu;
847 892
848 get_online_cpus(); 893 cpu_maps_update_begin();
849 spin_lock(&workqueue_lock); 894 spin_lock(&workqueue_lock);
850 list_del(&wq->list); 895 list_del(&wq->list);
851 spin_unlock(&workqueue_lock); 896 spin_unlock(&workqueue_lock);
852 897
853 for_each_cpu_mask_nr(cpu, *cpu_map) 898 for_each_cpu_mask_nr(cpu, *cpu_map)
854 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 899 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
855 put_online_cpus(); 900 cpu_maps_update_done();
856 901
857 free_percpu(wq->cpu_wq); 902 free_percpu(wq->cpu_wq);
858 kfree(wq); 903 kfree(wq);
@@ -866,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
866 unsigned int cpu = (unsigned long)hcpu; 911 unsigned int cpu = (unsigned long)hcpu;
867 struct cpu_workqueue_struct *cwq; 912 struct cpu_workqueue_struct *cwq;
868 struct workqueue_struct *wq; 913 struct workqueue_struct *wq;
914 int ret = NOTIFY_OK;
869 915
870 action &= ~CPU_TASKS_FROZEN; 916 action &= ~CPU_TASKS_FROZEN;
871 917
@@ -873,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
873 case CPU_UP_PREPARE: 919 case CPU_UP_PREPARE:
874 cpu_set(cpu, cpu_populated_map); 920 cpu_set(cpu, cpu_populated_map);
875 } 921 }
876 922undo:
877 list_for_each_entry(wq, &workqueues, list) { 923 list_for_each_entry(wq, &workqueues, list) {
878 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 924 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
879 925
@@ -883,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
883 break; 929 break;
884 printk(KERN_ERR "workqueue [%s] for %i failed\n", 930 printk(KERN_ERR "workqueue [%s] for %i failed\n",
885 wq->name, cpu); 931 wq->name, cpu);
886 return NOTIFY_BAD; 932 action = CPU_UP_CANCELED;
933 ret = NOTIFY_BAD;
934 goto undo;
887 935
888 case CPU_ONLINE: 936 case CPU_ONLINE:
889 start_workqueue_thread(cwq, cpu); 937 start_workqueue_thread(cwq, cpu);
@@ -891,7 +939,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
891 939
892 case CPU_UP_CANCELED: 940 case CPU_UP_CANCELED:
893 start_workqueue_thread(cwq, -1); 941 start_workqueue_thread(cwq, -1);
894 case CPU_DEAD: 942 case CPU_POST_DEAD:
895 cleanup_workqueue_thread(cwq); 943 cleanup_workqueue_thread(cwq);
896 break; 944 break;
897 } 945 }
@@ -899,11 +947,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
899 947
900 switch (action) { 948 switch (action) {
901 case CPU_UP_CANCELED: 949 case CPU_UP_CANCELED:
902 case CPU_DEAD: 950 case CPU_POST_DEAD:
903 cpu_clear(cpu, cpu_populated_map); 951 cpu_clear(cpu, cpu_populated_map);
904 } 952 }
905 953
906 return NOTIFY_OK; 954 return ret;
907} 955}
908 956
909void __init init_workqueues(void) 957void __init init_workqueues(void)