73 files changed, 3112 insertions, 1897 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e622..382dd5a8b2d7 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
        default 1000 if HZ_1000
 config SCHED_HRTICK
-        def_bool HIGH_RES_TIMERS && X86
+        def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
diff --git a/kernel/Makefile b/kernel/Makefile
index 985ddb7da4d0..4e1d7df7c3e2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
-obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
            cpu.o exit.o itimer.o time.o softirq.o resource.o \
            sysctl.o capability.o ptrace.o timer.o user.o \
            signal.o sys.o kmod.o workqueue.o pid.o \
@@ -11,6 +11,8 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+CFLAGS_REMOVE_sched.o = -mno-spe
 ifdef CONFIG_FTRACE
 # Do not trace debug files and internal ftrace files
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -22,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
+obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -81,6 +84,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FTRACE) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..dd68b9059418 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
 /*
 * External references and all of the globals.
 */
-static void do_acct_process(struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct,
+                struct pid_namespace *ns, struct file *);
 /*
 * This structure is used so that all the data protected by lock
 * can be placed in the same cache line as the lock.  This primes
 * the cache line to have the data after getting the lock.
 */
-struct acct_glbs {
+struct bsd_acct_struct {
-        spinlock_t              lock;
        volatile int            active;
        volatile int            needcheck;
        struct file             *file;
        struct pid_namespace    *ns;
        struct timer_list       timer;
+        struct list_head        list;
 };
-static struct acct_glbs acct_globals __cacheline_aligned =
+static DEFINE_SPINLOCK(acct_lock);
-        {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
+static LIST_HEAD(acct_list);
 /*
 * Called whenever the timer says to check the free space.
 */
-static void acct_timeout(unsigned long unused)
+static void acct_timeout(unsigned long x)
 {
-        acct_globals.needcheck = 1;
+        struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
+        acct->needcheck = 1;
 }
 /*
 * Check the amount of free space and suspend/resume accordingly.
 */
-static int check_free_space(struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 {
        struct kstatfs sbuf;
        int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
        sector_t resume;
        sector_t suspend;
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
-        res = acct_globals.active;
+        res = acct->active;
-        if (!file || !acct_globals.needcheck)
+        if (!file || !acct->needcheck)
                goto out;
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
        /* May block */
        if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
                act = 0;
        /*
-         * If some joker switched acct_globals.file under us we'ld better be
+         * If some joker switched acct->file under us we'd better be
         * silent and _not_ touch anything.
         */
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
-        if (file != acct_globals.file) {
+        if (file != acct->file) {
                if (act)
                        res = act>0;
                goto out;
        }
-        if (acct_globals.active) {
+        if (acct->active) {
                if (act < 0) {
-                        acct_globals.active = 0;
+                        acct->active = 0;
                        printk(KERN_INFO "Process accounting paused\n");
                }
        } else {
                if (act > 0) {
-                        acct_globals.active = 1;
+                        acct->active = 1;
                        printk(KERN_INFO "Process accounting resumed\n");
                }
        }
-        del_timer(&acct_globals.timer);
+        del_timer(&acct->timer);
-        acct_globals.needcheck = 0;
+        acct->needcheck = 0;
-        acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+        acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-        add_timer(&acct_globals.timer);
+        add_timer(&acct->timer);
-        res = acct_globals.active;
+        res = acct->active;
 out:
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
        return res;
 }
@@ -172,39 +174,41 @@ out:
 * Close the old accounting file (if currently open) and then replace
 * it with file (if non-NULL).
 *
- * NOTE: acct_globals.lock MUST be held on entry and exit.
+ * NOTE: acct_lock MUST be held on entry and exit.
 */
-static void acct_file_reopen(struct file *file)
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+                struct pid_namespace *ns)
 {
        struct file *old_acct = NULL;
        struct pid_namespace *old_ns = NULL;
-        if (acct_globals.file) {
+        if (acct->file) {
-                old_acct = acct_globals.file;
+                old_acct = acct->file;
-                old_ns = acct_globals.ns;
+                old_ns = acct->ns;
-                del_timer(&acct_globals.timer);
+                del_timer(&acct->timer);
-                acct_globals.active = 0;
+                acct->active = 0;
-                acct_globals.needcheck = 0;
+                acct->needcheck = 0;
-                acct_globals.file = NULL;
+                acct->file = NULL;
+                acct->ns = NULL;
+                list_del(&acct->list);
        }
        if (file) {
-                acct_globals.file = file;
+                acct->file = file;
-                acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
+                acct->ns = ns;
-                acct_globals.needcheck = 0;
+                acct->needcheck = 0;
-                acct_globals.active = 1;
+                acct->active = 1;
+                list_add(&acct->list, &acct_list);
                /* It's been deleted if it was used before so this is safe */
-                init_timer(&acct_globals.timer);
+                setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
-                acct_globals.timer.function = acct_timeout;
+                acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-                acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+                add_timer(&acct->timer);
-                add_timer(&acct_globals.timer);
        }
        if (old_acct) {
                mnt_unpin(old_acct->f_path.mnt);
-                spin_unlock(&acct_globals.lock);
+                spin_unlock(&acct_lock);
-                do_acct_process(old_ns, old_acct);
+                do_acct_process(acct, old_ns, old_acct);
                filp_close(old_acct, NULL);
-                put_pid_ns(old_ns);
+                spin_lock(&acct_lock);
-                spin_lock(&acct_globals.lock);
        }
 }
@@ -212,6 +216,8 @@ static int acct_on(char *name)
 {
        struct file *file;
        int error;
+        struct pid_namespace *ns;
+        struct bsd_acct_struct *acct = NULL;
        /* Difference from BSD - they don't do O_APPEND */
        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
                return -EIO;
        }
+        ns = task_active_pid_ns(current);
+        if (ns->bacct == NULL) {
+                acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+                if (acct == NULL) {
+                        filp_close(file, NULL);
+                        return -ENOMEM;
+                }
+        }
        error = security_acct(file);
        if (error) {
+                kfree(acct);
                filp_close(file, NULL);
                return error;
        }
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
+        if (ns->bacct == NULL) {
+                ns->bacct = acct;
+                acct = NULL;
+        }
        mnt_pin(file->f_path.mnt);
-        acct_file_reopen(file);
+        acct_file_reopen(ns->bacct, file, ns);
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
        mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+        kfree(acct);
        return 0;
 }
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
                error = acct_on(tmp);
                putname(tmp);
        } else {
+                struct bsd_acct_struct *acct;
+                acct = task_active_pid_ns(current)->bacct;
+                if (acct == NULL)
+                        return 0;
                error = security_acct(NULL);
                if (!error) {
-                        spin_lock(&acct_globals.lock);
+                        spin_lock(&acct_lock);
-                        acct_file_reopen(NULL);
+                        acct_file_reopen(acct, NULL, NULL);
-                        spin_unlock(&acct_globals.lock);
+                        spin_unlock(&acct_lock);
                }
        }
        return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
 */
 void acct_auto_close_mnt(struct vfsmount *m)
 {
-        spin_lock(&acct_globals.lock);
+        struct bsd_acct_struct *acct;
-        if (acct_globals.file && acct_globals.file->f_path.mnt == m)
-                acct_file_reopen(NULL);
+        spin_lock(&acct_lock);
-        spin_unlock(&acct_globals.lock);
+restart:
+        list_for_each_entry(acct, &acct_list, list)
+                if (acct->file && acct->file->f_path.mnt == m) {
+                        acct_file_reopen(acct, NULL, NULL);
+                        goto restart;
+                }
+        spin_unlock(&acct_lock);
 }
 /**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
 */
 void acct_auto_close(struct super_block *sb)
 {
-        spin_lock(&acct_globals.lock);
+        struct bsd_acct_struct *acct;
-        if (acct_globals.file &&
-            acct_globals.file->f_path.mnt->mnt_sb == sb) {
+        spin_lock(&acct_lock);
-                acct_file_reopen(NULL);
+restart:
+        list_for_each_entry(acct, &acct_list, list)
+                if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+                        acct_file_reopen(acct, NULL, NULL);
+                        goto restart;
+                }
+        spin_unlock(&acct_lock);
+}
+void acct_exit_ns(struct pid_namespace *ns)
+{
+        struct bsd_acct_struct *acct;
+        spin_lock(&acct_lock);
+        acct = ns->bacct;
+        if (acct != NULL) {
+                if (acct->file != NULL)
+                        acct_file_reopen(acct, NULL, NULL);
+                kfree(acct);
        }
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
 }
 /*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
 /*
 *  do_acct_process does all actual work. Caller holds the reference to file.
 */
-static void do_acct_process(struct pid_namespace *ns, struct file *file)
+static void do_acct_process(struct bsd_acct_struct *acct,
+                struct pid_namespace *ns, struct file *file)
 {
        struct pacct_struct *pacct = &current->signal->pacct;
        acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
         * First check to see if there is enough free_space to continue
         * the process accounting system.
         */
-        if (!check_free_space(file))
+        if (!check_free_space(acct, file))
                return;
        /*
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
        spin_unlock_irq(&current->sighand->siglock);
 }
-/**
+static void acct_process_in_ns(struct pid_namespace *ns)
- * acct_process - now just a wrapper around do_acct_process
- * @exitcode: task exit code
- *
- * handles process accounting for an exiting task
- */
-void acct_process(void)
 {
        struct file *file = NULL;
-        struct pid_namespace *ns;
+        struct bsd_acct_struct *acct;
+        acct = ns->bacct;
        /*
         * accelerate the common fastpath:
         */
-        if (!acct_globals.file)
+        if (!acct || !acct->file)
                return;
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
-        file = acct_globals.file;
+        file = acct->file;
        if (unlikely(!file)) {
-                spin_unlock(&acct_globals.lock);
+                spin_unlock(&acct_lock);
                return;
        }
        get_file(file);
-        ns = get_pid_ns(acct_globals.ns);
+        spin_unlock(&acct_lock);
-        spin_unlock(&acct_globals.lock);
-        do_acct_process(ns, file);
+        do_acct_process(acct, ns, file);
        fput(file);
-        put_pid_ns(ns);
+}
+/**
+ * acct_process - now just a wrapper around acct_process_in_ns,
+ * which in turn is a wrapper around do_acct_process.
+ *
+ * handles process accounting for an exiting task
+ */
+void acct_process(void)
+{
+        struct pid_namespace *ns;
+        /*
+         * This loop is safe without locking, since current is still
+         * alive and holds its namespace, which in turn holds
+         * its parent.
+         */
+        for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
+                acct_process_in_ns(ns);
 }
diff --git a/kernel/audit.c b/kernel/audit.c
index e092f1c0ce30..4414e93d8750 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                if (status_get->mask & AUDIT_STATUS_ENABLED) {
                        err = audit_set_enabled(status_get->enabled,
                                                loginuid, sessionid, sid);
-                        if (err < 0) return err;
+                        if (err < 0)
+                                return err;
                }
                if (status_get->mask & AUDIT_STATUS_FAILURE) {
                        err = audit_set_failure(status_get->failure,
                                                loginuid, sessionid, sid);
-                        if (err < 0) return err;
+                        if (err < 0)
+                                return err;
                }
                if (status_get->mask & AUDIT_STATUS_PID) {
                        int new_pid = status_get->pid;
@@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                        audit_pid = new_pid;
                        audit_nlk_pid = NETLINK_CB(skb).pid;
                }
-                if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+                if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
                        err = audit_set_rate_limit(status_get->rate_limit,
                                                   loginuid, sessionid, sid);
+                        if (err < 0)
+                                return err;
+                }
                if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
                        err = audit_set_backlog_limit(status_get->backlog_limit,
                                                      loginuid, sessionid, sid);
@@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len)
 {
        const unsigned char *p;
        for (p = string; p < (const unsigned char *)string + len && *p; p++) {
-                if (*p == '"' || *p < 0x21 || *p > 0x7f)
+                if (*p == '"' || *p < 0x21 || *p > 0x7e)
                        return 1;
        }
        return 0;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 98c50cc671bb..b7d354e2b0ef 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent,
                        struct audit_buffer *ab;
                        ab = audit_log_start(NULL, GFP_KERNEL,
                                AUDIT_CONFIG_CHANGE);
+                        audit_log_format(ab, "auid=%u ses=%u",
+                                audit_get_loginuid(current),
+                                audit_get_sessionid(current));
                        audit_log_format(ab,
-                                "op=updated rules specifying path=");
+                                " op=updated rules specifying path=");
                        audit_log_untrustedstring(ab, owatch->path);
                        audit_log_format(ab, " with dev=%u ino=%lu\n",
                                 dev, ino);
@@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
                                struct audit_buffer *ab;
                                ab = audit_log_start(NULL, GFP_KERNEL,
                                        AUDIT_CONFIG_CHANGE);
-                                audit_log_format(ab, "op=remove rule path=");
+                                audit_log_format(ab, "auid=%u ses=%u",
+                                        audit_get_loginuid(current),
+                                        audit_get_sessionid(current));
+                                audit_log_format(ab, " op=remove rule path=");
                                audit_log_untrustedstring(ab, w->path);
                                if (r->filterkey) {
                                        audit_log_format(ab, " key=");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c10e7aae04d7..972f8e61d36a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,6 +243,9 @@ static inline int open_arg(int flags, int mask)
 static int audit_match_perm(struct audit_context *ctx, int mask)
 {
+        if (unlikely(!ctx))
+                return 0;
        unsigned n = ctx->major;
        switch (audit_classify_syscall(ctx->arch, n)) {
        case 0: /* native */
@@ -284,6 +287,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
 {
        unsigned index = which & ~S_IFMT;
        mode_t mode = which & S_IFMT;
+        if (unlikely(!ctx))
+                return 0;
        if (index >= ctx->name_count)
                return 0;
        if (ctx->names[index].ino == -1)
@@ -610,7 +617,7 @@ static int audit_filter_rules(struct task_struct *tsk,
                if (!result)
                        return 0;
        }
-        if (rule->filterkey)
+        if (rule->filterkey && ctx)
                ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
        switch (rule->action) {
        case AUDIT_NEVER:    *state = AUDIT_DISABLED;       break;
@@ -1476,7 +1483,8 @@ void audit_syscall_entry(int arch, int major,
        struct audit_context *context = tsk->audit_context;
        enum audit_state     state;
-        BUG_ON(!context);
+        if (unlikely(!context))
+                return;
        /*
         * This happens only on certain architectures that make system
@@ -2374,7 +2382,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
        struct audit_context *ctx = tsk->audit_context;
        if (audit_pid && t->tgid == audit_pid) {
-                if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+                if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
                        audit_sig_pid = tsk->pid;
                        if (tsk->loginuid != -1)
                                audit_sig_uid = tsk->loginuid;
diff --git a/kernel/capability.c b/kernel/capability.c
index 901e0fdc3fff..0101e847603e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
        return 0;
 }
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+/*
+ * Without filesystem capability support, we nominally support one process
+ * setting the capabilities of another
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+        struct task_struct *target;
+        int ret;
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        if (pid && pid != task_pid_vnr(current)) {
+                target = find_task_by_vpid(pid);
+                if (!target) {
+                        ret = -ESRCH;
+                        goto out;
+                }
+        } else
+                target = current;
+        ret = security_capget(target, pEp, pIp, pPp);
+out:
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        return ret;
+}
+/*
+ * cap_set_pg - set capabilities for all processes in a given process
+ * group.  Takes task_capability_lock and tasklist_lock itself.
+ */
+static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
+                             kernel_cap_t *inheritable,
+                             kernel_cap_t *permitted)
+{
+        struct task_struct *g, *target;
+        int ret = -EPERM;
+        int found = 0;
+        struct pid *pgrp;
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        pgrp = find_vpid(pgrp_nr);
+        do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
+                target = g;
+                while_each_thread(g, target) {
+                        if (!security_capset_check(target, effective,
+                                                   inheritable, permitted)) {
+                                security_capset_set(target, effective,
+                                                    inheritable, permitted);
+                                ret = 0;
+                        }
+                        found = 1;
+                }
+        } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        if (!found)
+                ret = 0;
+        return ret;
+}
 /*
- * For sys_getproccap() and sys_setproccap(), any of the three
+ * cap_set_all - set capabilities for all processes other than init
- * capability set pointers may be NULL -- indicating that that set is
+ * and self.  Takes task_capability_lock and tasklist_lock itself.
- * uninteresting and/or not to be changed.
 */
+static inline int cap_set_all(kernel_cap_t *effective,
+                              kernel_cap_t *inheritable,
+                              kernel_cap_t *permitted)
+{
+        struct task_struct *g, *target;
+        int ret = -EPERM;
+        int found = 0;
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        do_each_thread(g, target) {
+                if (target == current
+                    || is_container_init(target->group_leader))
+                        continue;
+                found = 1;
+                if (security_capset_check(target, effective, inheritable,
+                                          permitted))
+                        continue;
+                ret = 0;
+                security_capset_set(target, effective, inheritable, permitted);
+        } while_each_thread(g, target);
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        if (!found)
+                ret = 0;
+        return ret;
+}
+/*
+ * Given the target pid does not refer to the current process we
+ * need more elaborate support... (This support is not present when
+ * filesystem capabilities are configured.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
+                                            kernel_cap_t *inheritable,
+                                            kernel_cap_t *permitted)
+{
+        struct task_struct *target;
+        int ret;
+        if (!capable(CAP_SETPCAP))
+                return -EPERM;
+        if (pid == -1)            /* all procs other than current and init */
+                return cap_set_all(effective, inheritable, permitted);
+        else if (pid < 0)                    /* all procs in process group */
+                return cap_set_pg(-pid, effective, inheritable, permitted);
+        /* target != current */
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        target = find_task_by_vpid(pid);
+        if (!target)
+                ret = -ESRCH;
+        else {
+                ret = security_capset_check(target, effective, inheritable,
+                                            permitted);
+                /* having verified that the proposed changes are legal,
+                   we now put them into effect. */
+                if (!ret)
+                        security_capset_set(target, effective, inheritable,
+                                            permitted);
+        }
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        return ret;
+}
+#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
+/*
+ * If we have configured with filesystem capability support, then the
+ * only thing that can change the capabilities of the current process
+ * is the current process. As such, we can't be in this code at the
+ * same time as we are in the process of setting capabilities in this
+ * process. The net result is that we can limit our use of locks to
+ * when we are reading the caps of another process.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+        int ret;
+        if (pid && (pid != task_pid_vnr(current))) {
+                struct task_struct *target;
+                spin_lock(&task_capability_lock);
+                read_lock(&tasklist_lock);
+                target = find_task_by_vpid(pid);
+                if (!target)
+                        ret = -ESRCH;
+                else
+                        ret = security_capget(target, pEp, pIp, pPp);
+                read_unlock(&tasklist_lock);
+                spin_unlock(&task_capability_lock);
+        } else
+                ret = security_capget(current, pEp, pIp, pPp);
+        return ret;
+}
+/*
+ * With filesystem capability support configured, the kernel does not
+ * permit the changing of capabilities in one process by another
+ * process. (CAP_SETPCAP has much less broad semantics when configured
+ * this way.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid,
+                                            kernel_cap_t *effective,
+                                            kernel_cap_t *inheritable,
+                                            kernel_cap_t *permitted)
+{
+        return -EPERM;
+}
+#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
 /*
 * Atomically modify the effective capabilities returning the original
@@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 {
        int ret = 0;
        pid_t pid;
-        struct task_struct *target;
        unsigned tocopy;
        kernel_cap_t pE, pI, pP;
@@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
        if (pid < 0)
                return -EINVAL;
-        spin_lock(&task_capability_lock);
+        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
-        read_lock(&tasklist_lock);
-        if (pid && pid != task_pid_vnr(current)) {
-                target = find_task_by_vpid(pid);
-                if (!target) {
-                        ret = -ESRCH;
-                        goto out;
-                }
-        } else
-                target = current;
-        ret = security_capget(target, &pE, &pI, &pP);
-out:
-        read_unlock(&tasklist_lock);
-        spin_unlock(&task_capability_lock);
        if (!ret) {
                struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
@@ -216,7 +396,6 @@ out:
                 * before modification is attempted and the application
                 * fails.
                 */
                if (copy_to_user(dataptr, kdata, tocopy
                                 * sizeof(struct __user_cap_data_struct))) {
                        return -EFAULT;
@@ -226,70 +405,8 @@ out:
        return ret;
 }
-/*
- * cap_set_pg - set capabilities for all processes in a given process
- * group.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
-                              kernel_cap_t *inheritable,
-                              kernel_cap_t *permitted)
-{
-        struct task_struct *g, *target;
-        int ret = -EPERM;
-        int found = 0;
-        struct pid *pgrp;
-        pgrp = find_vpid(pgrp_nr);
-        do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
-                target = g;
-                while_each_thread(g, target) {
-                        if (!security_capset_check(target, effective,
-                                                        inheritable,
-                                                        permitted)) {
-                                security_capset_set(target, effective,
-                                                        inheritable,
-                                                        permitted);
-                                ret = 0;
-                        }
-                        found = 1;
-                }
-        } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
-        if (!found)
-                ret = 0;
-        return ret;
-}
-/*
- * cap_set_all - set capabilities for all processes other than init
- * and self.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_all(kernel_cap_t *effective,
-                               kernel_cap_t *inheritable,
-                               kernel_cap_t *permitted)
-{
-     struct task_struct *g, *target;
-     int ret = -EPERM;
-     int found = 0;
-     do_each_thread(g, target) {
-             if (target == current || is_container_init(target->group_leader))
-                     continue;
-             found = 1;
-             if (security_capset_check(target, effective, inheritable,
-                                                permitted))
-                     continue;
-             ret = 0;
-             security_capset_set(target, effective, inheritable, permitted);
-     } while_each_thread(g, target);
-     if (!found)
-             ret = 0;
-     return ret;
-}
 /**
- * sys_capset - set capabilities for a process or a group of processes
+ * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *      target pid data
 * @data: pointer to struct that contains the effective, permitted,
@@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
        unsigned i, tocopy;
        kernel_cap_t inheritable, permitted, effective;
-        struct task_struct *target;
        int ret;
        pid_t pid;
@@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
        if (get_user(pid, &header->pid))
                return -EFAULT;
-        if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
-                return -EPERM;
        if (copy_from_user(&kdata, data, tocopy
                           * sizeof(struct __user_cap_data_struct))) {
                return -EFAULT;
@@ -344,40 +457,31 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
                i++;
        }
-        spin_lock(&task_capability_lock);
+        if (pid && (pid != task_pid_vnr(current)))
-        read_lock(&tasklist_lock);
+                ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
+                                                &permitted);
-        if (pid > 0 && pid != task_pid_vnr(current)) {
+        else {
-                target = find_task_by_vpid(pid);
+                /*
-                if (!target) {
+                 * This lock is required even when filesystem
-                        ret = -ESRCH;
+                 * capability support is configured - it protects the
-                        goto out;
+                 * sys_capget() call from returning incorrect data in
-                }
+                 * the case that the targeted process is not the
-        } else
+                 * current one.
-                target = current;
+                 */
+                spin_lock(&task_capability_lock);
-        ret = 0;
-        /* having verified that the proposed changes are legal,
-           we now put them into effect. */
-        if (pid < 0) {
-                if (pid == -1)  /* all procs other than current and init */
-                        ret = cap_set_all(&effective, &inheritable, &permitted);
-                else            /* all procs in process group */
+                ret = security_capset_check(current, &effective, &inheritable,
-                        ret = cap_set_pg(-pid, &effective, &inheritable,
-                                         &permitted);
-        } else {
-                ret = security_capset_check(target, &effective, &inheritable,
                                            &permitted);
+                /*
+                 * Having verified that the proposed changes are
+                 * legal, we now put them into effect.
+                 */
                if (!ret)
-                        security_capset_set(target, &effective, &inheritable,
+                        security_capset_set(current, &effective, &inheritable,
                                            &permitted);
+                spin_unlock(&task_capability_lock);
        }
-out:
-        read_unlock(&tasklist_lock);
-        spin_unlock(&task_capability_lock);
        return ret;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..13932abde159 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
+#include <linux/namei.h>
 #include <asm/atomic.h>
@@ -89,11 +90,7 @@ struct cgroupfs_root {
        /* Hierarchy-specific flags */
        unsigned long flags;
-        /* The path to use for release notifications. No locking
+        /* The path to use for release notifications. */
-         * between setting and use - so if userspace updates this
-         * while child cgroups exist, you could miss a
-         * notification. We ensure that it's always a valid
-         * NUL-terminated string */
        char release_agent_path[PATH_MAX];
 };
@@ -118,7 +115,7 @@ static int root_count;
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
-static int need_forkexit_callback;
+static int need_forkexit_callback __read_mostly;
 static int need_mm_owner_callback __read_mostly;
 /* convenient tests for these bits */
@@ -220,7 +217,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
-static int use_task_css_set_links;
+static int use_task_css_set_links __read_mostly;
 /* When we create or destroy a css_set, the operation simply
 * takes/releases a reference count on all the cgroups referenced
@@ -241,17 +238,20 @@ static int use_task_css_set_links;
 */
 static void unlink_css_set(struct css_set *cg)
 {
+        struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
        write_lock(&css_set_lock);
        hlist_del(&cg->hlist);
        css_set_count--;
-        while (!list_empty(&cg->cg_links)) {
-                struct cg_cgroup_link *link;
+        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-                link = list_entry(cg->cg_links.next,
+                                 cg_link_list) {
-                                  struct cg_cgroup_link, cg_link_list);
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
        write_unlock(&css_set_lock);
 }
@@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set(
        return NULL;
 }
+static void free_cg_links(struct list_head *tmp)
+{
+        struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
+        list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
+                list_del(&link->cgrp_link_list);
+                kfree(link);
+        }
+}
 /*
 * allocate_cg_links() allocates "count" cg_cgroup_link structures
 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
@@ -368,13 +379,7 @@ static int allocate_cg_links(int count, struct list_head *tmp)
        for (i = 0; i < count; i++) {
                link = kmalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
-                        while (!list_empty(tmp)) {
+                        free_cg_links(tmp);
-                                link = list_entry(tmp->next,
-                                                  struct cg_cgroup_link,
-                                                  cgrp_link_list);
-                                list_del(&link->cgrp_link_list);
-                                kfree(link);
-                        }
                        return -ENOMEM;
                }
                list_add(&link->cgrp_link_list, tmp);
@@ -382,18 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp)
        return 0;
 }
-static void free_cg_links(struct list_head *tmp)
-{
-        while (!list_empty(tmp)) {
-                struct cg_cgroup_link *link;
-                link = list_entry(tmp->next,
-                                  struct cg_cgroup_link,
-                                  cgrp_link_list);
-                list_del(&link->cgrp_link_list);
-                kfree(link);
-        }
-}
 /*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
@@ -415,11 +408,11 @@ static struct css_set *find_css_set(
        /* First see if we already have a cgroup group that matches
         * the desired set */
-        write_lock(&css_set_lock);
+        read_lock(&css_set_lock);
        res = find_existing_css_set(oldcg, cgrp, template);
        if (res)
                get_css_set(res);
-        write_unlock(&css_set_lock);
+        read_unlock(&css_set_lock);
        if (res)
                return res;
@@ -507,10 +500,6 @@ static struct css_set *find_css_set(
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
- * The cgroup_common_file_write handler for operations that modify
- * the cgroup hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cgroup modifications across the system.
- *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cgroup_exit(),
@@ -962,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        struct super_block *sb;
        struct cgroupfs_root *root;
        struct list_head tmp_cg_links;
-        INIT_LIST_HEAD(&tmp_cg_links);
        /* First find the desired set of subsystems */
        ret = parse_cgroupfs_options(data, &opts);
@@ -1093,6 +1081,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
        int ret;
+        struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
        BUG_ON(!root);
@@ -1112,10 +1102,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
         * root cgroup
         */
        write_lock(&css_set_lock);
-        while (!list_empty(&cgrp->css_sets)) {
-                struct cg_cgroup_link *link;
+        list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
-                link = list_entry(cgrp->css_sets.next,
+                                 cgrp_link_list) {
-                                  struct cg_cgroup_link, cgrp_link_list);
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
@@ -1281,18 +1270,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 }
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
+ * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * cgroup_mutex, may take task_lock of task
+ * held. May take task_lock of task
 */
-static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
-        pid_t pid;
        struct task_struct *tsk;
        int ret;
-        if (sscanf(pidbuf, "%d", &pid) != 1)
-                return -EIO;
        if (pid) {
                rcu_read_lock();
                tsk = find_task_by_vpid(pid);
@@ -1318,6 +1303,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
        return ret;
 }
+static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+        int ret;
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        ret = attach_task_by_pid(cgrp, pid);
+        cgroup_unlock();
+        return ret;
+}
 /* The various types of files and directories in a cgroup file system */
 enum cgroup_filetype {
        FILE_ROOT,
@@ -1327,12 +1322,54 @@ enum cgroup_filetype {
        FILE_RELEASE_AGENT,
 };
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
+ */
+bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+        mutex_lock(&cgroup_mutex);
+        if (cgroup_is_removed(cgrp)) {
+                mutex_unlock(&cgroup_mutex);
+                return false;
+        }
+        return true;
+}
+static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
+{
+        BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        strcpy(cgrp->root->release_agent_path, buffer);
+        cgroup_unlock();
+        return 0;
+}
+static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
+                                     struct seq_file *seq)
+{
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        seq_puts(seq, cgrp->root->release_agent_path);
+        seq_putc(seq, '\n');
+        cgroup_unlock();
+        return 0;
+}
+/* A buffer size big enough for numbers or short strings */
+#define CGROUP_LOCAL_BUFFER_SIZE 64
 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
                                struct file *file,
                                const char __user *userbuf,
                                size_t nbytes, loff_t *unused_ppos)
 {
-        char buffer[64];
+        char buffer[CGROUP_LOCAL_BUFFER_SIZE];
        int retval = 0;
        char *end;
@@ -1361,68 +1398,39 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
        return retval;
 }
-static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
+static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
-                                           struct cftype *cft,
+                                   struct file *file,
-                                           struct file *file,
+                                   const char __user *userbuf,
-                                           const char __user *userbuf,
+                                   size_t nbytes, loff_t *unused_ppos)
-                                           size_t nbytes, loff_t *unused_ppos)
 {
-        enum cgroup_filetype type = cft->private;
+        char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
-        char *buffer;
        int retval = 0;
+        size_t max_bytes = cft->max_write_len;
+        char *buffer = local_buffer;
-        if (nbytes >= PATH_MAX)
+        if (!max_bytes)
+                max_bytes = sizeof(local_buffer) - 1;
+        if (nbytes >= max_bytes)
                return -E2BIG;
+        /* Allocate a dynamic buffer if we need one */
-        /* +1 for nul-terminator */
+        if (nbytes >= sizeof(local_buffer)) {
-        buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+                buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-        if (buffer == NULL)
+                if (buffer == NULL)
-                return -ENOMEM;
+                        return -ENOMEM;
+        }
-        if (copy_from_user(buffer, userbuf, nbytes)) {
+        if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
                retval = -EFAULT;
-                goto out1;
+                goto out;
        }
-        buffer[nbytes] = 0;     /* nul-terminate */
-        strstrip(buffer);       /* strip -just- trailing whitespace */
-        mutex_lock(&cgroup_mutex);
+        buffer[nbytes] = 0;     /* nul-terminate */
+        strstrip(buffer);
-        /*
+        retval = cft->write_string(cgrp, cft, buffer);
-         * This was already checked for in cgroup_file_write(), but
+        if (!retval)
-         * check again now we're holding cgroup_mutex.
-         */
-        if (cgroup_is_removed(cgrp)) {
-                retval = -ENODEV;
-                goto out2;
-        }
-        switch (type) {
-        case FILE_TASKLIST:
-                retval = attach_task_by_pid(cgrp, buffer);
-                break;
-        case FILE_NOTIFY_ON_RELEASE:
-                clear_bit(CGRP_RELEASABLE, &cgrp->flags);
-                if (simple_strtoul(buffer, NULL, 10) != 0)
-                        set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-                else
-                        clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-                break;
-        case FILE_RELEASE_AGENT:
-                BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-                strcpy(cgrp->root->release_agent_path, buffer);
-                break;
-        default:
-                retval = -EINVAL;
-                goto out2;
-        }
-        if (retval == 0)
                retval = nbytes;
-out2:
+out:
-        mutex_unlock(&cgroup_mutex);
+        if (buffer != local_buffer)
-out1:
+                kfree(buffer);
-        kfree(buffer);
        return retval;
 }
@@ -1438,6 +1446,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
                return cft->write(cgrp, cft, file, buf, nbytes, ppos);
        if (cft->write_u64 || cft->write_s64)
                return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+        if (cft->write_string)
+                return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
        if (cft->trigger) {
                int ret = cft->trigger(cgrp, (unsigned int)cft->private);
                return ret ? ret : nbytes;
@@ -1450,7 +1460,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
                               char __user *buf, size_t nbytes,
                               loff_t *ppos)
 {
-        char tmp[64];
+        char tmp[CGROUP_LOCAL_BUFFER_SIZE];
        u64 val = cft->read_u64(cgrp, cft);
        int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
@@ -1462,56 +1472,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
                               char __user *buf, size_t nbytes,
                               loff_t *ppos)
 {
-        char tmp[64];
+        char tmp[CGROUP_LOCAL_BUFFER_SIZE];
        s64 val = cft->read_s64(cgrp, cft);
        int len = sprintf(tmp, "%lld\n", (long long) val);
        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
-static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
-                                          struct cftype *cft,
-                                          struct file *file,
-                                          char __user *buf,
-                                          size_t nbytes, loff_t *ppos)
-{
-        enum cgroup_filetype type = cft->private;
-        char *page;
-        ssize_t retval = 0;
-        char *s;
-        if (!(page = (char *)__get_free_page(GFP_KERNEL)))
-                return -ENOMEM;
-        s = page;
-        switch (type) {
-        case FILE_RELEASE_AGENT:
-        {
-                struct cgroupfs_root *root;
-                size_t n;
-                mutex_lock(&cgroup_mutex);
-                root = cgrp->root;
-                n = strnlen(root->release_agent_path,
-                            sizeof(root->release_agent_path));
-                n = min(n, (size_t) PAGE_SIZE);
-                strncpy(s, root->release_agent_path, n);
-                mutex_unlock(&cgroup_mutex);
-                s += n;
-                break;
-        }
-        default:
-                retval = -EINVAL;
-                goto out;
-        }
-        *s++ = '\n';
-        retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
-        free_page((unsigned long)page);
-        return retval;
-}
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
                                   size_t nbytes, loff_t *ppos)
 {
@@ -1560,7 +1527,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
        return cft->read_seq_string(state->cgroup, cft, m);
 }
-int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_release(struct inode *inode, struct file *file)
 {
        struct seq_file *seq = file->private_data;
        kfree(seq->private);
@@ -1569,6 +1536,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
 static struct file_operations cgroup_seqfile_operations = {
        .read = seq_read,
+        .write = cgroup_file_write,
        .llseek = seq_lseek,
        .release = cgroup_seqfile_release,
 };
@@ -1756,15 +1724,11 @@ int cgroup_add_files(struct cgroup *cgrp,
 int cgroup_task_count(const struct cgroup *cgrp)
 {
        int count = 0;
-        struct list_head *l;
+        struct cg_cgroup_link *link;
        read_lock(&css_set_lock);
-        l = cgrp->css_sets.next;
+        list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-        while (l != &cgrp->css_sets) {
-                struct cg_cgroup_link *link =
-                        list_entry(l, struct cg_cgroup_link, cgrp_link_list);
                count += atomic_read(&link->cg->ref.refcount);
-                l = l->next;
        }
        read_unlock(&css_set_lock);
        return count;
@@ -2227,6 +2191,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
        return notify_on_release(cgrp);
 }
+static int cgroup_write_notify_on_release(struct cgroup *cgrp,
+                                          struct cftype *cft,
+                                          u64 val)
+{
+        clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+        if (val)
+                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+        else
+                clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+        return 0;
+}
 /*
 * for the common functions, 'private' gives the type of file
 */
@@ -2235,7 +2211,7 @@ static struct cftype files[] = {
                .name = "tasks",
                .open = cgroup_tasks_open,
                .read = cgroup_tasks_read,
-                .write = cgroup_common_file_write,
+                .write_u64 = cgroup_tasks_write,
                .release = cgroup_tasks_release,
                .private = FILE_TASKLIST,
        },
@@ -2243,15 +2219,16 @@ static struct cftype files[] = {
        {
                .name = "notify_on_release",
                .read_u64 = cgroup_read_notify_on_release,
-                .write = cgroup_common_file_write,
+                .write_u64 = cgroup_write_notify_on_release,
                .private = FILE_NOTIFY_ON_RELEASE,
        },
 };
 static struct cftype cft_release_agent = {
        .name = "release_agent",
-        .read = cgroup_common_file_read,
+        .read_seq_string = cgroup_release_agent_show,
-        .write = cgroup_common_file_write,
+        .write_string = cgroup_release_agent_write,
+        .max_write_len = PATH_MAX,
        .private = FILE_RELEASE_AGENT,
 };
@@ -2391,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
-static inline int cgroup_has_css_refs(struct cgroup *cgrp)
+static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
        /* Check the reference count on each subsystem. Since we
         * already established that there are no tasks in the
@@ -2869,16 +2846,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 * cgroup_clone - clone the cgroup the given subsystem is attached to
 * @tsk: the task to be moved
 * @subsys: the given subsystem
+ * @nodename: the name for the new cgroup
 *
 * Duplicate the current cgroup in the hierarchy that the given
 * subsystem is attached to, and move this task into the new
 * child.
 */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
+                                                        char *nodename)
 {
        struct dentry *dentry;
        int ret = 0;
-        char nodename[MAX_CGROUP_TYPE_NAMELEN];
        struct cgroup *parent, *child;
        struct inode *inode;
        struct css_set *cg;
@@ -2903,8 +2881,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
        cg = tsk->cgroups;
        parent = task_cgroup(tsk, subsys->subsys_id);
-        snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
        /* Pin the hierarchy */
        atomic_inc(&parent->root->sb->s_active);
@@ -3078,27 +3054,24 @@ static void cgroup_release_agent(struct work_struct *work)
        while (!list_empty(&release_list)) {
                char *argv[3], *envp[3];
                int i;
-                char *pathbuf;
+                char *pathbuf = NULL, *agentbuf = NULL;
                struct cgroup *cgrp = list_entry(release_list.next,
                                                    struct cgroup,
                                                    release_list);
                list_del_init(&cgrp->release_list);
                spin_unlock(&release_list_lock);
                pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-                if (!pathbuf) {
+                if (!pathbuf)
-                        spin_lock(&release_list_lock);
+                        goto continue_free;
-                        continue;
+                if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
-                }
+                        goto continue_free;
+                agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-                if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
+                if (!agentbuf)
-                        kfree(pathbuf);
+                        goto continue_free;
-                        spin_lock(&release_list_lock);
-                        continue;
-                }
                i = 0;
-                argv[i++] = cgrp->root->release_agent_path;
+                argv[i++] = agentbuf;
-                argv[i++] = (char *)pathbuf;
+                argv[i++] = pathbuf;
                argv[i] = NULL;
                i = 0;
@@ -3112,8 +3085,10 @@ static void cgroup_release_agent(struct work_struct *work)
                 * be a slow process */
                mutex_unlock(&cgroup_mutex);
                call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-                kfree(pathbuf);
                mutex_lock(&cgroup_mutex);
+ continue_free:
+                kfree(pathbuf);
+                kfree(agentbuf);
                spin_lock(&release_list_lock);
        }
        spin_unlock(&release_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab801..e202a68d1cc1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
        cpu_hotplug.refcount = 0;
 }
+cpumask_t cpu_active_map;
 #ifdef CONFIG_HOTPLUG_CPU
 void get_online_cpus(void)
@@ -214,7 +216,6 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
        int err, nr_calls = 0;
-        struct task_struct *p;
        cpumask_t old_allowed, tmp;
        void *hcpu = (void *)(long)cpu;
        unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -247,21 +248,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        cpus_setall(tmp);
        cpu_clear(cpu, tmp);
        set_cpus_allowed_ptr(current, &tmp);
+        tmp = cpumask_of_cpu(cpu);
-        p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
+        err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
+        if (err) {
-        if (IS_ERR(p) || cpu_online(cpu)) {
                /* CPU didn't die: tell everyone.  Can't complain. */
                if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                            hcpu) == NOTIFY_BAD)
                        BUG();
-                if (IS_ERR(p)) {
+                goto out_allowed;
-                        err = PTR_ERR(p);
-                        goto out_allowed;
-                }
-                goto out_thread;
        }
+        BUG_ON(cpu_online(cpu));
        /* Wait for it to sleep (leaving idle task). */
        while (!idle_cpu(cpu))
@@ -277,12 +275,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        check_for_tasks(cpu);
-out_thread:
-        err = kthread_stop(p);
 out_allowed:
        set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
        cpu_hotplug_done();
+        if (!err) {
+                if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
+                                            hcpu) == NOTIFY_BAD)
+                        BUG();
+        }
        return err;
 }
@@ -291,11 +292,30 @@ int __ref cpu_down(unsigned int cpu)
        int err = 0;
        cpu_maps_update_begin();
-        if (cpu_hotplug_disabled)
+        if (cpu_hotplug_disabled) {
                err = -EBUSY;
-        else
+                goto out;
-                err = _cpu_down(cpu, 0);
+        }
+        cpu_clear(cpu, cpu_active_map);
+        /*
+         * Make sure that all cpus did the reschedule and are not
+         * using a stale version of the cpu_active_map.
+         * This is not strictly necessary because stop_machine()
+         * that we run down the line already provides the required
+         * synchronization. But it's really a side effect and we do not
+         * want to depend on the innards of the stop_machine here.
+         */
+        synchronize_sched();
+        err = _cpu_down(cpu, 0);
+        if (cpu_online(cpu))
+                cpu_set(cpu, cpu_active_map);
+out:
        cpu_maps_update_done();
        return err;
 }
@@ -355,11 +375,18 @@ int __cpuinit cpu_up(unsigned int cpu)
        }
        cpu_maps_update_begin();
-        if (cpu_hotplug_disabled)
+        if (cpu_hotplug_disabled) {
                err = -EBUSY;
-        else
+                goto out;
-                err = _cpu_up(cpu, 0);
+        }
+        err = _cpu_up(cpu, 0);
+        if (cpu_online(cpu))
+                cpu_set(cpu, cpu_active_map);
+out:
        cpu_maps_update_done();
        return err;
 }
@@ -413,7 +440,7 @@ void __ref enable_nonboot_cpus(void)
                goto out;
        printk("Enabling non-boot CPUs ...\n");
-        for_each_cpu_mask(cpu, frozen_cpus) {
+        for_each_cpu_mask_nr(cpu, frozen_cpus) {
                error = _cpu_up(cpu, 1);
                if (!error) {
                        printk("CPU%d is up\n", cpu);
@@ -428,3 +455,28 @@ out:
 #endif /* CONFIG_PM_SLEEP_SMP */
 #endif /* CONFIG_SMP */
+/*
+ * cpu_bit_bitmap[] is a special, "compressed" data structure that
+ * represents every NR_CPUS-bit wide binary value of the form 1<<nr.
+ *
+ * It is used by cpumask_of_cpu() to get a constant address to a CPU
+ * mask value that has a single bit set only.
+ */
+/* cpu_bit_bitmap[0] is empty - so we can back into it */
+#define MASK_DECLARE_1(x)       [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_2(x)       MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
+#define MASK_DECLARE_4(x)       MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
+#define MASK_DECLARE_8(x)       MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
+const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
+        MASK_DECLARE_8(0),      MASK_DECLARE_8(8),
+        MASK_DECLARE_8(16),     MASK_DECLARE_8(24),
+#if BITS_PER_LONG > 32
+        MASK_DECLARE_8(32),     MASK_DECLARE_8(40),
+        MASK_DECLARE_8(48),     MASK_DECLARE_8(56),
+#endif
+};
+EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..d5ab79cf516d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -54,7 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
-#include <linux/kfifo.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
@@ -227,10 +226,6 @@ static struct cpuset top_cpuset = {
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so require no locks.
 *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
@@ -369,7 +364,7 @@ void cpuset_update_task_memory_state(void)
                my_cpusets_mem_gen = top_cpuset.mems_generation;
        } else {
                rcu_read_lock();
-                my_cpusets_mem_gen = task_cs(current)->mems_generation;
+                my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
                rcu_read_unlock();
        }
@@ -490,21 +485,51 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 static void
 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 {
-        if (!dattr)
-                return;
        if (dattr->relax_domain_level < c->relax_domain_level)
                dattr->relax_domain_level = c->relax_domain_level;
        return;
 }
+/*
+ * update_domain_attr_tree - fold a whole cpuset subtree into @dattr
+ * @dattr: sched domain attributes to update; must not be NULL because
+ *         update_domain_attr() dereferences it unconditionally
+ * @c: root of the cpuset subtree to scan
+ *
+ * Walks the subtree rooted at @c breadth-first, linking cpusets onto a
+ * local queue through their stack_list member.  A cpuset whose
+ * cpus_allowed is empty is dropped without queueing its children.  Every
+ * remaining cpuset that has sched_load_balance set is handed to
+ * update_domain_attr(), which raises dattr->relax_domain_level to that
+ * cpuset's level when it is larger.
+ */
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+        LIST_HEAD(q);
+        list_add(&c->stack_list, &q);
+        while (!list_empty(&q)) {
+                struct cpuset *cp;
+                struct cgroup *cont;
+                struct cpuset *child;
+                /* Pop the head of the queue. */
+                cp = list_first_entry(&q, struct cpuset, stack_list);
+                list_del(q.next);
+                /* Skip this cpuset and, implicitly, its whole subtree. */
+                if (cpus_empty(cp->cpus_allowed))
+                        continue;
+                if (is_sched_load_balance(cp))
+                        update_domain_attr(dattr, cp);
+                /* Queue the children at the tail to keep the scan top-down. */
+                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                        child = cgroup_cs(cont);
+                        list_add_tail(&child->stack_list, &q);
+                }
+        }
+}
 /*
 * rebuild_sched_domains()
 *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * This routine will be called to rebuild the scheduler's dynamic
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * sched domains:
- * which has that flag enabled, or if any cpuset with a non-empty
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' is removed, then call this routine to rebuild the
+ *   'cpus' changes,
- * scheduler's dynamic sched domains.
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
 *
 * This routine builds a partial partition of the systems CPUs
 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -531,7 +556,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 * So the reverse nesting would risk an ABBA deadlock.
 *
 * The three key local variables below are:
- *    q  - a kfifo queue of cpuset pointers, used to implement a
+ *    q  - a linked-list queue of cpuset pointers, used to implement a
 *         top-down scan of all cpusets.  This scan loads a pointer
 *         to each cpuset marked is_sched_load_balance into the
 *         array 'csa'.  For our purposes, rebuilding the schedulers
@@ -564,9 +589,9 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 *      partition_sched_domains().
 */
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
-        struct kfifo *q;        /* queue of cpusets to be scanned */
+        LIST_HEAD(q);           /* queue of cpusets to be scanned*/
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
@@ -576,7 +601,6 @@ static void rebuild_sched_domains(void)
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */
-        q = NULL;
        csa = NULL;
        doms = NULL;
        dattr = NULL;
@@ -590,30 +614,42 @@ static void rebuild_sched_domains(void)
                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
-                        update_domain_attr(dattr, &top_cpuset);
+                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                *doms = top_cpuset.cpus_allowed;
                goto rebuild;
        }
-        q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
-        if (IS_ERR(q))
-                goto done;
        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;
-        cp = &top_cpuset;
+        list_add(&top_cpuset.stack_list, &q);
-        __kfifo_put(q, (void *)&cp, sizeof(cp));
+        while (!list_empty(&q)) {
-        while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
                struct cgroup *cont;
                struct cpuset *child;   /* scans child cpusets of cp */
-                if (is_sched_load_balance(cp))
+                cp = list_first_entry(&q, struct cpuset, stack_list);
+                list_del(q.next);
+                if (cpus_empty(cp->cpus_allowed))
+                        continue;
+                /*
+                 * All child cpusets contain a subset of the parent's cpus, so
+                 * just skip them, and then we call update_domain_attr_tree()
+                 * to calc relax_domain_level of the corresponding sched
+                 * domain.
+                 */
+                if (is_sched_load_balance(cp)) {
                        csa[csn++] = cp;
+                        continue;
+                }
                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
                        child = cgroup_cs(cont);
-                        __kfifo_put(q, (void *)&child, sizeof(cp));
+                        list_add_tail(&child->stack_list, &q);
                }
        }
@@ -679,7 +715,9 @@ restart:
                                if (apn == b->pn) {
                                        cpus_or(*dp, *dp, b->cpus_allowed);
                                        b->pn = -1;
-                                        update_domain_attr(dattr, b);
+                                        if (dattr)
+                                                update_domain_attr_tree(dattr
+                                                                   + nslot, b);
                                }
                        }
                        nslot++;
@@ -694,43 +732,11 @@ rebuild:
        put_online_cpus();
 done:
-        if (q && !IS_ERR(q))
-                kfifo_free(q);
        kfree(csa);
        /* Don't kfree(doms) -- partition_sched_domains() does that. */
        /* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
-static inline int started_after_time(struct task_struct *t1,
-                                     struct timespec *time,
-                                     struct task_struct *t2)
-{
-        int start_diff = timespec_compare(&t1->start_time, time);
-        if (start_diff > 0) {
-                return 1;
-        } else if (start_diff < 0) {
-                return 0;
-        } else {
-                /*
-                 * Arbitrarily, if two processes started at the same
-                 * time, we'll say that the lower pointer value
-                 * started first. Note that t2 may have exited by now
-                 * so this may not be a valid pointer any longer, but
-                 * that's fine - it still serves to distinguish
-                 * between two tasks started (effectively)
-                 * simultaneously.
-                 */
-                return t1 > t2;
-        }
-}
-static inline int started_after(void *p1, void *p2)
-{
-        struct task_struct *t1 = p1;
-        struct task_struct *t2 = p2;
-        return started_after_time(t1, &t2->start_time, t2);
-}
 /**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
@@ -766,15 +772,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 }
 /**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+        struct ptr_heap heap;
+        struct cgroup_scanner scan;
+        int err;
+        /*
+         * The comparison callback (heap->gt) is filled in later by
+         * cgroup_scan_tasks(), hence the NULL here.  The heap is still
+         * set up by us so that an allocation failure is reported before
+         * the scan starts, letting callers avoid changing
+         * cs->cpus_allowed when heap_init() fails.
+         */
+        err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (err)
+                return err;
+        /* Visit each task in the cgroup whose cpumask needs updating. */
+        scan.heap = &heap;
+        scan.process_task = cpuset_change_cpumask;
+        scan.test_task = cpuset_test_cpumask;
+        scan.cg = cs->css.cgroup;
+        err = cgroup_scan_tasks(&scan);
+        heap_free(&heap);
+        return err;
+}
+/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @buf: buffer of cpu numbers written to this cpuset
 */
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
 {
        struct cpuset trialcs;
-        struct cgroup_scanner scan;
-        struct ptr_heap heap;
        int retval;
        int is_load_balanced;
@@ -790,7 +830,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
-        buf = strstrip(buf);
        if (!*buf) {
                cpus_clear(trialcs.cpus_allowed);
        } else {
@@ -809,10 +848,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
                return 0;
-        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-        if (retval)
-                return retval;
        is_load_balanced = is_sched_load_balance(&trialcs);
        mutex_lock(&callback_mutex);
@@ -823,12 +858,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         * Scan tasks in the cpuset, and update the cpumasks of any
         * that need an update.
         */
-        scan.cg = cs->css.cgroup;
+        retval = update_tasks_cpumask(cs);
-        scan.test_task = cpuset_test_cpumask;
+        if (retval < 0)
-        scan.process_task = cpuset_change_cpumask;
+                return retval;
-        scan.heap = &heap;
-        cgroup_scan_tasks(&scan);
-        heap_free(&heap);
        if (is_load_balanced)
                rebuild_sched_domains();
@@ -884,74 +916,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
        mutex_unlock(&callback_mutex);
 }
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held.  May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
 static void *cpuset_being_rebound;
-static int update_nodemask(struct cpuset *cs, char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-        struct cpuset trialcs;
-        nodemask_t oldmem;
        struct task_struct *p;
        struct mm_struct **mmarray;
        int i, n, ntasks;
        int migrate;
        int fudge;
-        int retval;
        struct cgroup_iter it;
+        int retval;
-        /*
-         * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
-         * it's read-only
-         */
-        if (cs == &top_cpuset)
-                return -EACCES;
-        trialcs = *cs;
-        /*
-         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-         * Since nodelist_parse() fails on an empty mask, we special case
-         * that parsing.  The validate_change() call ensures that cpusets
-         * with tasks have memory.
-         */
-        buf = strstrip(buf);
-        if (!*buf) {
-                nodes_clear(trialcs.mems_allowed);
-        } else {
-                retval = nodelist_parse(buf, trialcs.mems_allowed);
-                if (retval < 0)
-                        goto done;
-                if (!nodes_subset(trialcs.mems_allowed,
-                                node_states[N_HIGH_MEMORY]))
-                        return -EINVAL;
-        }
-        oldmem = cs->mems_allowed;
-        if (nodes_equal(oldmem, trialcs.mems_allowed)) {
-                retval = 0;             /* Too easy - nothing to do */
-                goto done;
-        }
-        retval = validate_change(cs, &trialcs);
-        if (retval < 0)
-                goto done;
-        mutex_lock(&callback_mutex);
-        cs->mems_allowed = trialcs.mems_allowed;
-        cs->mems_generation = cpuset_mems_generation++;
-        mutex_unlock(&callback_mutex);
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
@@ -1018,7 +1001,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
-                        cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+                        cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
                mmput(mm);
        }
@@ -1030,6 +1013,70 @@ done:
        return retval;
 }
+/*
+ * update_nodemask - handle a user request to change the 'mems' memory
+ * placement of a cpuset.
+ *
+ * Validates the request, updates the cpuset's mems_allowed and
+ * mems_generation, and then, for each task in the cpuset, rebinds any
+ * vma mempolicies and, if the cpuset is marked 'memory_migrate',
+ * migrates the task's pages to the new memory.
+ *
+ * Call with cgroup_mutex held.  May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such task's mm->mmap_sem, scan its vmas and rebind
+ * their mempolicies to the cpuset's new mems_allowed.
+ *
+ * Returns 0 on success (including "nothing changed"), -errno otherwise.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+        struct cpuset trial;
+        nodemask_t prev_mems;
+        int err;
+        /*
+         * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
+         * it's read-only
+         */
+        if (cs == &top_cpuset)
+                return -EACCES;
+        trial = *cs;
+        /*
+         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+         * nodelist_parse() fails on an empty mask, so an empty string is
+         * handled separately.  The validate_change() call below ensures
+         * that cpusets with tasks have memory.
+         */
+        if (*buf) {
+                err = nodelist_parse(buf, trial.mems_allowed);
+                if (err < 0)
+                        return err;
+                if (!nodes_subset(trial.mems_allowed,
+                                node_states[N_HIGH_MEMORY]))
+                        return -EINVAL;
+        } else {
+                nodes_clear(trial.mems_allowed);
+        }
+        prev_mems = cs->mems_allowed;
+        /* Too easy - nothing to do */
+        if (nodes_equal(prev_mems, trial.mems_allowed))
+                return 0;
+        err = validate_change(cs, &trial);
+        if (err < 0)
+                return err;
+        /* Publish the new mask and bump the generation under the mutex. */
+        mutex_lock(&callback_mutex);
+        cs->mems_allowed = trial.mems_allowed;
+        cs->mems_generation = cpuset_mems_generation++;
+        mutex_unlock(&callback_mutex);
+        return update_tasks_nodemask(cs, &prev_mems);
+}
 int current_cpuset_is_being_rebound(void)
 {
        return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1089,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
-                rebuild_sched_domains();
+                if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+                        rebuild_sched_domains();
        }
        return 0;
@@ -1254,72 +1302,14 @@ typedef enum {
        FILE_SPREAD_SLAB,
 } cpuset_filetype_t;
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
-                                        struct cftype *cft,
-                                        struct file *file,
-                                        const char __user *userbuf,
-                                        size_t nbytes, loff_t *unused_ppos)
-{
-        struct cpuset *cs = cgroup_cs(cont);
-        cpuset_filetype_t type = cft->private;
-        char *buffer;
-        int retval = 0;
-        /* Crude upper limit on largest legitimate cpulist user might write. */
-        if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
-                return -E2BIG;
-        /* +1 for nul-terminator */
-        buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-        if (!buffer)
-                return -ENOMEM;
-        if (copy_from_user(buffer, userbuf, nbytes)) {
-                retval = -EFAULT;
-                goto out1;
-        }
-        buffer[nbytes] = 0;     /* nul-terminate */
-        cgroup_lock();
-        if (cgroup_is_removed(cont)) {
-                retval = -ENODEV;
-                goto out2;
-        }
-        switch (type) {
-        case FILE_CPULIST:
-                retval = update_cpumask(cs, buffer);
-                break;
-        case FILE_MEMLIST:
-                retval = update_nodemask(cs, buffer);
-                break;
-        default:
-                retval = -EINVAL;
-                goto out2;
-        }
-        if (retval == 0)
-                retval = nbytes;
-out2:
-        cgroup_unlock();
-out1:
-        kfree(buffer);
-        return retval;
-}
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
        int retval = 0;
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
-        cgroup_lock();
+        if (!cgroup_lock_live_group(cgrp))
-        if (cgroup_is_removed(cgrp)) {
-                cgroup_unlock();
                return -ENODEV;
-        }
        switch (type) {
        case FILE_CPU_EXCLUSIVE:
@@ -1365,12 +1355,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
-        cgroup_lock();
+        if (!cgroup_lock_live_group(cgrp))
-        if (cgroup_is_removed(cgrp)) {
-                cgroup_unlock();
                return -ENODEV;
-        }
        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                retval = update_relax_domain_level(cs, val);
@@ -1384,6 +1371,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 }
 /*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+                                const char *buf)
+{
+        int err;
+        /* A zero return means the lock was not taken; report -ENODEV. */
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        /* cft->private selects which mask the written string updates. */
+        if (cft->private == FILE_CPULIST)
+                err = update_cpumask(cgroup_cs(cgrp), buf);
+        else if (cft->private == FILE_MEMLIST)
+                err = update_nodemask(cgroup_cs(cgrp), buf);
+        else
+                err = -EINVAL;
+        cgroup_unlock();
+        return err;
+}
+/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
@@ -1502,14 +1515,16 @@ static struct cftype files[] = {
        {
                .name = "cpus",
                .read = cpuset_common_file_read,
-                .write = cpuset_common_file_write,
+                .write_string = cpuset_write_resmask,
+                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },
        {
                .name = "mems",
                .read = cpuset_common_file_read,
-                .write = cpuset_common_file_write,
+                .write_string = cpuset_write_resmask,
+                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },
@@ -1790,7 +1805,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
        scan.scan.heap = NULL;
        scan.to = to->css.cgroup;
-        if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+        if (cgroup_scan_tasks(&scan.scan))
                printk(KERN_ERR "move_member_tasks_to_cpuset: "
                                "cgroup_scan_tasks failed\n");
 }
@@ -1846,29 +1861,29 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 */
 static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+        LIST_HEAD(queue);
        struct cpuset *cp;      /* scans cpusets being updated */
        struct cpuset *child;   /* scans child cpusets of cp */
-        struct list_head queue;
        struct cgroup *cont;
+        nodemask_t oldmems;
-        INIT_LIST_HEAD(&queue);
        list_add_tail((struct list_head *)&root->stack_list, &queue);
        while (!list_empty(&queue)) {
-                cp = container_of(queue.next, struct cpuset, stack_list);
+                cp = list_first_entry(&queue, struct cpuset, stack_list);
                list_del(queue.next);
                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
                        child = cgroup_cs(cont);
                        list_add_tail(&child->stack_list, &queue);
                }
-                cont = cp->css.cgroup;
                /* Continue past cpusets with all cpus, mems online */
                if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
+                oldmems = cp->mems_allowed;
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
                cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1880,6 +1895,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
                if (cpus_empty(cp->cpus_allowed) ||
                     nodes_empty(cp->mems_allowed))
                        remove_tasks_in_empty_cpuset(cp);
+                else {
+                        update_tasks_cpumask(cp);
+                        update_tasks_nodemask(cp, &oldmems);
+                }
        }
 }
@@ -1972,7 +1991,6 @@ void __init cpuset_init_smp(void)
 }
 /**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
        d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
        tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
        d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+        tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
+        d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
        d->blkio_count += tsk->delays->blkio_count;
        d->swapin_count += tsk->delays->swapin_count;
+        d->freepages_count += tsk->delays->freepages_count;
        spin_unlock_irqrestore(&tsk->delays->lock, flags);
 done:
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
        return ret;
 }
+/*
+ * Open a free-pages delay interval for the current task by passing its
+ * delays->freepages_start slot to delayacct_start().
+ * NOTE(review): presumably this records a start timestamp - confirm
+ * against delayacct_start(), which is not visible here.
+ */
+void __delayacct_freepages_start(void)
+{
+        delayacct_start(&current->delays->freepages_start);
+}
+/*
+ * Close the free-pages delay interval for the current task.  Hands the
+ * start/end slots plus the freepages_delay and freepages_count
+ * accumulators to delayacct_end(); those two accumulators are what
+ * __delayacct_add_tsk() later adds into the taskstats totals.
+ */
+void __delayacct_freepages_end(void)
+{
+        delayacct_end(&current->delays->freepages_start,
+                        &current->delays->freepages_end,
+                        &current->delays->freepages_delay,
+                        &current->delays->freepages_count);
+}
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 000000000000..91e96950cd52
--- /dev/null
+++ b/kernel/dma-coherent.c
@@ -0,0 +1,153 @@
+/*
+ * Coherent per-device memory handling.
+ * Borrowed from i386
+ */
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+/* Bookkeeping for one device's declared coherent memory pool. */
+struct dma_coherent_mem {
+        void            *virt_base;     /* ioremap()ed CPU address of the pool */
+        u32             device_base;    /* pool start address as seen by the device */
+        int             size;           /* pool size in pages, not bytes */
+        int             flags;          /* DMA_MEMORY_* flags given at declaration */
+        unsigned long   *bitmap;        /* allocation bitmap, one bit per page */
+};
+/**
+ * dma_declare_coherent_memory() - declare a per-device coherent memory pool
+ * @dev:         device that will own the pool
+ * @bus_addr:    address of the region that is passed to ioremap()
+ * @device_addr: address of the same region as seen by the device
+ * @size:        size of the region in bytes
+ * @flags:       DMA_MEMORY_* flags; at least one of DMA_MEMORY_MAP or
+ *               DMA_MEMORY_IO must be set
+ *
+ * Returns DMA_MEMORY_MAP or DMA_MEMORY_IO (MAP wins if both were asked
+ * for) on success, or 0 if the flags are unusable, @size is zero, the
+ * device already has a pool, or mapping/allocation failed.
+ *
+ * The pool descriptor is built in a local and only published through
+ * dev->dma_mem once it is fully initialized, so a failed declaration
+ * never leaves dev->dma_mem pointing at freed memory.
+ */
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+                                dma_addr_t device_addr, size_t size, int flags)
+{
+        struct dma_coherent_mem *mem;
+        void __iomem *mem_base = NULL;
+        int pages = size >> PAGE_SHIFT;
+        int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+        if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+                goto out;
+        if (!size)
+                goto out;
+        if (dev->dma_mem)
+                goto out;
+        /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+        mem_base = ioremap(bus_addr, size);
+        if (!mem_base)
+                goto out;
+        mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+        if (!mem)
+                goto out;
+        mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+        if (!mem->bitmap)
+                goto free1_out;
+        mem->virt_base = mem_base;
+        mem->device_base = device_addr;
+        mem->size = pages;
+        mem->flags = flags;
+        /* Publish only a completely set up pool. */
+        dev->dma_mem = mem;
+        if (flags & DMA_MEMORY_MAP)
+                return DMA_MEMORY_MAP;
+        return DMA_MEMORY_IO;
+ free1_out:
+        kfree(mem);
+ out:
+        if (mem_base)
+                iounmap(mem_base);
+        return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+/*
+ * Tear down the coherent pool declared for @dev, if there is one: detach
+ * it from the device first, then unmap the region and free the bitmap and
+ * the descriptor.  Does nothing when no pool is attached.
+ */
+void dma_release_declared_memory(struct device *dev)
+{
+        struct dma_coherent_mem *pool = dev->dma_mem;
+        if (pool) {
+                dev->dma_mem = NULL;
+                iounmap(pool->virt_base);
+                kfree(pool->bitmap);
+                kfree(pool);
+        }
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+/*
+ * Reserve the part of @dev's coherent pool that covers @size bytes
+ * starting at @device_addr.
+ *
+ * Returns the CPU address of the first reserved page, ERR_PTR(-EINVAL)
+ * if the device has no pool, or ERR_PTR() of the error reported by
+ * bitmap_allocate_region().
+ */
+void *dma_mark_declared_memory_occupied(struct device *dev,
+                                        dma_addr_t device_addr, size_t size)
+{
+        struct dma_coherent_mem *pool = dev->dma_mem;
+        int first_page;
+        int rc;
+        if (!pool)
+                return ERR_PTR(-EINVAL);
+        /* Account for the offset of device_addr within its first page. */
+        size += device_addr & ~PAGE_MASK;
+        first_page = (device_addr - pool->device_base) >> PAGE_SHIFT;
+        rc = bitmap_allocate_region(pool->bitmap, first_page, get_order(size));
+        if (rc)
+                return ERR_PTR(rc);
+        return pool->virt_base + (first_page << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+/**
+ * dma_alloc_from_coherent() - try to allocate memory from the per-device
+ *                             coherent area
+ *
+ * @dev:        device from which we allocate memory
+ * @size:       size of requested memory area
+ * @dma_handle: This will be filled with the correct dma handle
+ * @ret:        This pointer will be filled with the virtual address
+ *              to allocated area.
+ *
+ * This function should be only called from per-arch %dma_alloc_coherent()
+ * to support allocation from per-device coherent memory pools.
+ *
+ * Returns 0 if dma_alloc_coherent should continue with allocating from
+ * generic memory areas, or !0 if dma_alloc_coherent should return %ret.
+ *
+ * When the device has a pool but the request cannot be satisfied from it,
+ * the result depends on DMA_MEMORY_EXCLUSIVE: with the flag set, 1 is
+ * returned and %ret is NULL (the allocation fails); without it, 0 is
+ * returned so the caller falls back to generic memory.  %ret is never
+ * left indeterminate once a pool exists.
+ */
+int dma_alloc_from_coherent(struct device *dev, ssize_t size,
+                                       dma_addr_t *dma_handle, void **ret)
+{
+        struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+        int order = get_order(size);
+        int page;
+        /* No per-device pool: the generic allocator handles the request. */
+        if (!mem)
+                return 0;
+        *ret = NULL;
+        /* A request larger than the whole pool can never fit. */
+        if (size > ((ssize_t)mem->size << PAGE_SHIFT))
+                goto fail;
+        page = bitmap_find_free_region(mem->bitmap, mem->size, order);
+        if (page < 0)
+                goto fail;
+        /* Memory was found in the per-device area; hand it out zeroed. */
+        *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+        *ret = mem->virt_base + (page << PAGE_SHIFT);
+        memset(*ret, 0, size);
+        return 1;
+ fail:
+        /*
+         * Only an exclusive pool forbids falling back to generic memory;
+         * in that case the caller must return the NULL stored in *ret.
+         */
+        return (mem->flags & DMA_MEMORY_EXCLUSIVE) != 0;
+}
+/**
+ * dma_release_from_coherent() - give memory back to the per-device
+ *                               coherent pool
+ * @dev:        device from which the memory was allocated
+ * @order:      the order of pages allocated
+ * @vaddr:      virtual address of allocated pages
+ *
+ * Checks whether @vaddr lies inside the device's coherent pool and, if
+ * it does, releases the corresponding region of the pool's bitmap.
+ *
+ * Returns 1 if the memory belonged to the pool and was released, or 0 if
+ * the caller should proceed with releasing memory from generic pools.
+ */
+int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
+{
+        struct dma_coherent_mem *pool = dev ? dev->dma_mem : NULL;
+        int first_page;
+        if (!pool)
+                return 0;
+        /* Addresses outside [virt_base, virt_base + pool bytes) are not ours. */
+        if (vaddr < pool->virt_base)
+                return 0;
+        if (vaddr >= (pool->virt_base + (pool->size << PAGE_SHIFT)))
+                return 0;
+        first_page = (vaddr - pool->virt_base) >> PAGE_SHIFT;
+        bitmap_release_region(pool->bitmap, first_page, order);
+        return 1;
+}
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a9e6bad9f706..0d407e886735 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality)
                                goto out;
        }
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
        read_unlock(&exec_domains_lock);
        request_module("personality-%ld", pers);
        read_lock(&exec_domains_lock);
@@ -168,7 +168,6 @@ __set_personality(u_long personality)
        current->personality = personality;
        oep = current_thread_info()->exec_domain;
        current_thread_info()->exec_domain = ep;
-        set_fs_altroot();
        module_put(oep->module);
        return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 93d2711b9381..38ec40630149 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -85,7 +86,6 @@ static void __exit_signal(struct task_struct *tsk)
        BUG_ON(!sig);
        BUG_ON(!atomic_read(&sig->count));
-        rcu_read_lock();
        sighand = rcu_dereference(tsk->sighand);
        spin_lock(&sighand->siglock);
@@ -121,6 +121,7 @@ static void __exit_signal(struct task_struct *tsk)
                sig->nivcsw += tsk->nivcsw;
                sig->inblock += task_io_get_inblock(tsk);
                sig->oublock += task_io_get_oublock(tsk);
+                task_io_accounting_add(&sig->ioac, &tsk->ioac);
                sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
                sig = NULL; /* Marker for below. */
        }
@@ -136,7 +137,6 @@ static void __exit_signal(struct task_struct *tsk)
        tsk->signal = NULL;
        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);
-        rcu_read_unlock();
        __cleanup_sighand(sighand);
        clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -152,27 +152,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
        put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
-/*
- * Do final ptrace-related cleanup of a zombie being reaped.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_release_task(struct task_struct *p)
-{
-        BUG_ON(!list_empty(&p->ptraced));
-        ptrace_unlink(p);
-        BUG_ON(!list_empty(&p->ptrace_entry));
-}
 void release_task(struct task_struct * p)
 {
        struct task_struct *leader;
        int zap_leader;
 repeat:
+        tracehook_prepare_release_task(p);
        atomic_dec(&p->user->processes);
        proc_flush_task(p);
        write_lock_irq(&tasklist_lock);
-        ptrace_release_task(p);
+        tracehook_finish_release_task(p);
        __exit_signal(p);
        /*
@@ -194,6 +184,13 @@ repeat:
                 * that case.
                 */
                zap_leader = task_detached(leader);
+                /*
+                 * This maintains the invariant that release_task()
+                 * only runs on a task in EXIT_DEAD, just for sanity.
+                 */
+                if (zap_leader)
+                        leader->exit_state = EXIT_DEAD;
        }
        write_unlock_irq(&tasklist_lock);
@@ -432,7 +429,7 @@ void daemonize(const char *name, ...)
         * We don't want to have TIF_FREEZE set if the system-wide hibernation
         * or suspend transition begins right now.
         */
-        current->flags |= PF_NOFREEZE;
+        current->flags |= (PF_NOFREEZE | PF_KTHREAD);
        if (current->nsproxy != &init_nsproxy) {
                get_nsproxy(&init_nsproxy);
@@ -557,8 +554,6 @@ void put_fs_struct(struct fs_struct *fs)
        if (atomic_dec_and_test(&fs->count)) {
                path_put(&fs->root);
                path_put(&fs->pwd);
-                if (fs->altroot.dentry)
-                        path_put(&fs->altroot);
                kmem_cache_free(fs_cachep, fs);
        }
 }
@@ -666,26 +661,40 @@ assign_new_owner:
 static void exit_mm(struct task_struct * tsk)
 {
        struct mm_struct *mm = tsk->mm;
+        struct core_state *core_state;
        mm_release(tsk, mm);
        if (!mm)
                return;
        /*
         * Serialize with any possible pending coredump.
-         * We must hold mmap_sem around checking core_waiters
+         * We must hold mmap_sem around checking core_state
         * and clearing tsk->mm.  The core-inducing thread
-         * will increment core_waiters for each thread in the
+         * will increment ->nr_threads for each thread in the
         * group with ->mm != NULL.
         */
        down_read(&mm->mmap_sem);
-        if (mm->core_waiters) {
+        core_state = mm->core_state;
+        if (core_state) {
+                struct core_thread self;
                up_read(&mm->mmap_sem);
-                down_write(&mm->mmap_sem);
-                if (!--mm->core_waiters)
-                        complete(mm->core_startup_done);
-                up_write(&mm->mmap_sem);
-                wait_for_completion(&mm->core_done);
+                self.task = tsk;
+                self.next = xchg(&core_state->dumper.next, &self);
+                /*
+                 * Implies mb(), the result of xchg() must be visible
+                 * to core_state->dumper.
+                 */
+                if (atomic_dec_and_test(&core_state->nr_threads))
+                        complete(&core_state->startup);
+                for (;;) {
+                        set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                        if (!self.task) /* see coredump_finish() */
+                                break;
+                        schedule();
+                }
+                __set_task_state(tsk, TASK_RUNNING);
                down_read(&mm->mmap_sem);
        }
        atomic_inc(&mm->mm_count);
@@ -863,7 +872,8 @@ static void forget_original_parent(struct task_struct *father)
 */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-        int state;
+        int signal;
+        void *cookie;
        /*
         * This does two things:
@@ -900,22 +910,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
            !capable(CAP_KILL))
                tsk->exit_signal = SIGCHLD;
-        /* If something other than our normal parent is ptracing us, then
+        signal = tracehook_notify_death(tsk, &cookie, group_dead);
-         * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+        if (signal >= 0)
-         * only has special meaning to our real parent.
+                signal = do_notify_parent(tsk, signal);
-         */
-        if (!task_detached(tsk) && thread_group_empty(tsk)) {
-                int signal = ptrace_reparented(tsk) ?
-                                SIGCHLD : tsk->exit_signal;
-                do_notify_parent(tsk, signal);
-        } else if (tsk->ptrace) {
-                do_notify_parent(tsk, SIGCHLD);
-        }
-        state = EXIT_ZOMBIE;
+        tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
-        if (task_detached(tsk) && likely(!tsk->ptrace))
-                state = EXIT_DEAD;
-        tsk->exit_state = state;
        /* mt-exec, de_thread() is waiting for us */
        if (thread_group_leader(tsk) &&
@@ -925,8 +924,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
        write_unlock_irq(&tasklist_lock);
+        tracehook_report_death(tsk, signal, cookie, group_dead);
        /* If the process is dead, release it - nobody will wait for it */
-        if (state == EXIT_DEAD)
+        if (signal == DEATH_REAP)
                release_task(tsk);
 }
@@ -1005,10 +1006,7 @@ NORET_TYPE void do_exit(long code)
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");
-        if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+        tracehook_report_exit(&code);
-                current->ptrace_message = code;
-                ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
-        }
        /*
         * We're taking recursive faults here in do_exit. Safest is to just
@@ -1354,6 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options,
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
+                task_io_accounting_add(&psig->ioac, &p->ioac);
+                task_io_accounting_add(&psig->ioac, &sig->ioac);
                spin_unlock_irq(&p->parent->sighand->siglock);
        }
diff --git a/kernel/fork.c b/kernel/fork.c
index adefc1131f27..7ce2ebe84796 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,15 +27,18 @@
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
+#include <linux/mmu_notifier.h>
 #include <linux/fs.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
+#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
@@ -92,6 +95,23 @@ int nr_processes(void)
 static struct kmem_cache *task_struct_cachep;
 #endif
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+/*
+ * Generic thread_info allocation, used unless the architecture defines
+ * __HAVE_ARCH_THREAD_INFO_ALLOCATOR: an order-THREAD_SIZE_ORDER request
+ * to __get_free_pages().  @tsk is not used by this generic version.
+ */
+static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+{
+        gfp_t mask = GFP_KERNEL;
+#ifdef CONFIG_DEBUG_STACK_USAGE
+        /* Stack-usage debugging asks for __GFP_ZERO on top of GFP_KERNEL. */
+        mask |= __GFP_ZERO;
+#endif
+        return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+}
+/* Counterpart of alloc_thread_info(): give the pages back via free_pages(). */
+static inline void free_thread_info(struct thread_info *ti)
+{
+        unsigned long addr = (unsigned long)ti;
+        free_pages(addr, THREAD_SIZE_ORDER);
+}
+#endif
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
@@ -307,6 +327,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                }
                /*
+                 * Clear hugetlb-related page reserves for children. This only
+                 * affects MAP_PRIVATE mappings. Faults generated by the child
+                 * are not guaranteed to succeed, even if read-only
+                 */
+                if (is_vm_hugetlb_page(tmp))
+                        reset_vma_resv_huge_pages(tmp);
+                /*
                 * Link in the new vma and copy the page table entries.
                 */
                *pprev = tmp;
@@ -374,7 +402,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        INIT_LIST_HEAD(&mm->mmlist);
        mm->flags = (current->mm) ? current->mm->flags
                                  : MMF_DUMP_FILTER_DEFAULT;
-        mm->core_waiters = 0;
+        mm->core_state = NULL;
        mm->nr_ptes = 0;
        set_mm_counter(mm, file_rss, 0);
        set_mm_counter(mm, anon_rss, 0);
@@ -387,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        if (likely(!mm_alloc_pgd(mm))) {
                mm->def_flags = 0;
+                mmu_notifier_mm_init(mm);
                return mm;
        }
@@ -419,6 +448,7 @@ void __mmdrop(struct mm_struct *mm)
        BUG_ON(mm == &init_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
+        mmu_notifier_mm_destroy(mm);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -448,7 +478,7 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
 * get_task_mm - acquire a reference to the task's mm
 *
- * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
@@ -461,7 +491,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
        task_lock(task);
        mm = task->mm;
        if (mm) {
-                if (task->flags & PF_BORROWED_MM)
+                if (task->flags & PF_KTHREAD)
                        mm = NULL;
                else
                        atomic_inc(&mm->mm_users);
@@ -630,13 +660,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
                path_get(&old->root);
                fs->pwd = old->pwd;
                path_get(&old->pwd);
-                if (old->altroot.dentry) {
-                        fs->altroot = old->altroot;
-                        path_get(&old->altroot);
-                } else {
-                        fs->altroot.mnt = NULL;
-                        fs->altroot.dentry = NULL;
-                }
                read_unlock(&old->lock);
        }
        return fs;
@@ -786,6 +809,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+        task_io_accounting_init(&sig->ioac);
        sig->sum_sched_runtime = 0;
        INIT_LIST_HEAD(&sig->cpu_timers[0]);
        INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -833,8 +857,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
        new_flags &= ~PF_SUPERPRIV;
        new_flags |= PF_FORKNOEXEC;
-        if (!(clone_flags & CLONE_PTRACE))
+        new_flags |= PF_STARTING;
-                p->ptrace = 0;
        p->flags = new_flags;
        clear_freeze_flag(p);
 }
@@ -875,7 +898,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                        struct pt_regs *regs,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
-                                        struct pid *pid)
+                                        struct pid *pid,
+                                        int trace)
 {
        int retval;
        struct task_struct *p;
@@ -968,13 +992,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->last_switch_timestamp = 0;
 #endif
-#ifdef CONFIG_TASK_XACCT
+        task_io_accounting_init(&p->ioac);
-        p->rchar = 0;           /* I/O counter: bytes read */
-        p->wchar = 0;           /* I/O counter: bytes written */
-        p->syscr = 0;           /* I/O counter: read syscalls */
-        p->syscw = 0;           /* I/O counter: write syscalls */
-#endif
-        task_io_accounting_init(p);
        acct_clear_integrals(p);
        p->it_virt_expires = cputime_zero;
@@ -1081,6 +1099,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (clone_flags & CLONE_THREAD)
                p->tgid = current->tgid;
+        if (current->nsproxy != p->nsproxy) {
+                retval = ns_cgroup_clone(p, pid);
+                if (retval)
+                        goto bad_fork_free_pid;
+        }
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
@@ -1125,8 +1149,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
-        INIT_LIST_HEAD(&p->ptrace_entry);
-        INIT_LIST_HEAD(&p->ptraced);
        /* Now that the task is set up, run cgroup callbacks if
         * necessary. We need to run them before the task is visible
@@ -1157,7 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                p->real_parent = current->real_parent;
        else
                p->real_parent = current;
-        p->parent = p->real_parent;
        spin_lock(&current->sighand->siglock);
@@ -1199,8 +1220,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (likely(p->pid)) {
                list_add_tail(&p->sibling, &p->real_parent->children);
-                if (unlikely(p->ptrace & PT_PTRACED))
+                tracehook_finish_clone(p, clone_flags, trace);
-                        __ptrace_link(p, current->parent);
                if (thread_group_leader(p)) {
                        if (clone_flags & CLONE_NEWPID)
@@ -1285,29 +1305,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
        struct pt_regs regs;
        task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-                                &init_struct_pid);
+                            &init_struct_pid, 0);
        if (!IS_ERR(task))
                init_idle(task, cpu);
        return task;
 }
-static int fork_traceflag(unsigned clone_flags)
-{
-        if (clone_flags & CLONE_UNTRACED)
-                return 0;
-        else if (clone_flags & CLONE_VFORK) {
-                if (current->ptrace & PT_TRACE_VFORK)
-                        return PTRACE_EVENT_VFORK;
-        } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-                if (current->ptrace & PT_TRACE_CLONE)
-                        return PTRACE_EVENT_CLONE;
-        } else if (current->ptrace & PT_TRACE_FORK)
-                return PTRACE_EVENT_FORK;
-        return 0;
-}
 /*
 *  Ok, this is the main fork-routine.
 *
@@ -1342,14 +1346,14 @@ long do_fork(unsigned long clone_flags,
                }
        }
-        if (unlikely(current->ptrace)) {
+        /*
-                trace = fork_traceflag (clone_flags);
+         * When called from kernel_thread, don't do user tracing stuff.
-                if (trace)
+         */
-                        clone_flags |= CLONE_PTRACE;
+        if (likely(user_mode(regs)))
-        }
+                trace = tracehook_prepare_clone(clone_flags);
        p = copy_process(clone_flags, stack_start, regs, stack_size,
-                        child_tidptr, NULL);
+                         child_tidptr, NULL, trace);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
@@ -1367,32 +1371,35 @@ long do_fork(unsigned long clone_flags,
                        init_completion(&vfork);
                }
-                if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+                tracehook_report_clone(trace, regs, clone_flags, nr, p);
+                /*
+                 * We set PF_STARTING at creation in case tracing wants to
+                 * use this to distinguish a fully live task from one that
+                 * hasn't gotten to tracehook_report_clone() yet.  Now we
+                 * clear it and set the child going.
+                 */
+                p->flags &= ~PF_STARTING;
+                if (unlikely(clone_flags & CLONE_STOPPED)) {
                        /*
                         * We'll start up with an immediate SIGSTOP.
                         */
                        sigaddset(&p->pending.signal, SIGSTOP);
                        set_tsk_thread_flag(p, TIF_SIGPENDING);
-                }
-                if (!(clone_flags & CLONE_STOPPED))
-                        wake_up_new_task(p, clone_flags);
-                else
                        __set_task_state(p, TASK_STOPPED);
+                } else {
-                if (unlikely (trace)) {
+                        wake_up_new_task(p, clone_flags);
-                        current->ptrace_message = nr;
-                        ptrace_notify ((trace << 8) | SIGTRAP);
                }
+                tracehook_report_clone_complete(trace, regs,
+                                                clone_flags, nr, p);
                if (clone_flags & CLONE_VFORK) {
                        freezer_do_not_count();
                        wait_for_completion(&vfork);
                        freezer_count();
-                        if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+                        tracehook_report_vfork_done(p, nr);
-                                current->ptrace_message = nr;
-                                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-                        }
                }
        } else {
                nr = PTR_ERR(p);
@@ -1404,7 +1411,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
 {
        struct sighand_struct *sighand = data;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa2..3cd441ebf5d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq)
        unsigned long flags;
        if (irq >= NR_IRQS) {
-                printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
+                WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
-                WARN_ON(1);
                return;
        }
@@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
        unsigned long flags;
        if (irq >= NR_IRQS) {
-                printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
+                WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
-                WARN_ON(1);
                return;
        }
@@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq)
        spin_lock_irqsave(&desc->lock, flags);
        if (desc->action) {
                spin_unlock_irqrestore(&desc->lock, flags);
-                printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+                WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
                        irq);
-                WARN_ON(1);
                return;
        }
        desc->msi_desc = NULL;
@@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
        unsigned long flags;
        if (irq >= NR_IRQS) {
-                printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+                WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
-                WARN_ON(1);
                return -EINVAL;
        }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 77a51be36010..0314074fa232 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -177,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
 {
        switch (desc->depth) {
        case 0:
-                printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+                WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
-                WARN_ON(1);
                break;
        case 1: {
                unsigned int status = desc->status & ~IRQ_DISABLED;
@@ -217,6 +216,17 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
+/*
+ * Forward a wakeup enable/disable request for @irq to the irq chip.
+ * Returns -ENXIO when the chip has no ->set_wake() hook, otherwise
+ * whatever the hook returns.
+ */
+int set_irq_wake_real(unsigned int irq, unsigned int on)
+{
+        struct irq_desc *desc = &irq_desc[irq];
+        if (!desc->chip->set_wake)
+                return -ENXIO;
+        return desc->chip->set_wake(irq, on);
+}
 /**
 *      set_irq_wake - control irq power management wakeup
 *      @irq:   interrupt to control
@@ -233,30 +243,32 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 {
        struct irq_desc *desc = irq_desc + irq;
        unsigned long flags;
-        int ret = -ENXIO;
+        int ret = 0;
-        int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
        /* wakeup-capable irqs can be shared between drivers that
         * don't need to have the same sleep mode behaviors.
         */
        spin_lock_irqsave(&desc->lock, flags);
        if (on) {
-                if (desc->wake_depth++ == 0)
+                if (desc->wake_depth++ == 0) {
-                        desc->status |= IRQ_WAKEUP;
+                        ret = set_irq_wake_real(irq, on);
-                else
+                        if (ret)
-                        set_wake = NULL;
+                                desc->wake_depth = 0;
+                        else
+                                desc->status |= IRQ_WAKEUP;
+                }
        } else {
                if (desc->wake_depth == 0) {
-                        printk(KERN_WARNING "Unbalanced IRQ %d "
+                        WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
-                                        "wake disable\n", irq);
+                } else if (--desc->wake_depth == 0) {
-                        WARN_ON(1);
+                        ret = set_irq_wake_real(irq, on);
-                } else if (--desc->wake_depth == 0)
+                        if (ret)
-                        desc->status &= ~IRQ_WAKEUP;
+                                desc->wake_depth = 1;
-                else
+                        else
-                        set_wake = NULL;
+                                desc->status &= ~IRQ_WAKEUP;
+                }
        }
-        if (set_wake)
-                ret = desc->chip->set_wake(irq, on);
        spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
@@ -293,6 +305,31 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
                desc->handle_irq = NULL;
 }
+/*
+ * Program the trigger type selected by the IRQF_TRIGGER_MASK bits of
+ * @flags through @chip->set_type().  A missing chip or set_type hook is
+ * only warned about and reported as success (0); a failing hook is
+ * logged and its error code returned.
+ */
+static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
+                unsigned long flags)
+{
+        unsigned long type = flags & IRQF_TRIGGER_MASK;
+        int ret;
+        if (chip && chip->set_type) {
+                ret = chip->set_type(irq, type);
+                if (ret)
+                        pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
+                                        (int)type, irq, chip->set_type);
+                return ret;
+        }
+        /*
+         * IRQF_TRIGGER_* was requested, but the PIC does not support
+         * multiple flow-types?
+         */
+        pr_warning("No set_type function for IRQ %d (%s)\n", irq,
+                        chip ? (chip->name ? : "unknown") : "unknown");
+        return 0;
+}
 /*
 * Internal function to register an irqaction - typically used to
 * allocate special interrupts that are part of the architecture.
@@ -304,6 +341,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
        const char *old_name = NULL;
        unsigned long flags;
        int shared = 0;
+        int ret;
        if (irq >= NR_IRQS)
                return -EINVAL;
@@ -361,35 +399,23 @@ int setup_irq(unsigned int irq, struct irqaction *new)
                shared = 1;
        }
-        *p = new;
-        /* Exclude IRQ from balancing */
-        if (new->flags & IRQF_NOBALANCING)
-                desc->status |= IRQ_NO_BALANCING;
        if (!shared) {
                irq_chip_set_defaults(desc->chip);
-#if defined(CONFIG_IRQ_PER_CPU)
-                if (new->flags & IRQF_PERCPU)
-                        desc->status |= IRQ_PER_CPU;
-#endif
                /* Setup the type (level, edge polarity) if configured: */
                if (new->flags & IRQF_TRIGGER_MASK) {
-                        if (desc->chip->set_type)
+                        ret = __irq_set_trigger(desc->chip, irq, new->flags);
-                                desc->chip->set_type(irq,
-                                                new->flags & IRQF_TRIGGER_MASK);
+                        if (ret) {
-                        else
+                                spin_unlock_irqrestore(&desc->lock, flags);
-                                /*
+                                return ret;
-                                 * IRQF_TRIGGER_* but the PIC does not support
+                        }
-                                 * multiple flow-types?
-                                 */
-                                printk(KERN_WARNING "No IRQF_TRIGGER set_type "
-                                       "function for IRQ %d (%s)\n", irq,
-                                       desc->chip->name);
                } else
                        compat_irq_chip_set_default_handler(desc);
+#if defined(CONFIG_IRQ_PER_CPU)
+                if (new->flags & IRQF_PERCPU)
+                        desc->status |= IRQ_PER_CPU;
+#endif
                desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
                                  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
@@ -408,6 +434,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
                /* Set default affinity mask once everything is setup */
                irq_select_affinity(irq);
        }
+        *p = new;
+        /* Exclude IRQ from balancing */
+        if (new->flags & IRQF_NOBALANCING)
+                desc->status |= IRQ_NO_BALANCING;
        /* Reset broken irq detection when installing new handler */
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..38fc10ac7541 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
        high = kallsyms_num_syms;
        while (high - low > 1) {
-                mid = (low + high) / 2;
+                mid = low + (high - low) / 2;
                if (kallsyms_addresses[mid] <= addr)
                        low = mid;
                else
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf33..c8a4370e2a34 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -24,6 +24,12 @@
 #include <linux/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -242,6 +248,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
                goto out;
        }
+        image->swap_page = kimage_alloc_control_pages(image, 0);
+        if (!image->swap_page) {
+                printk(KERN_ERR "Could not allocate swap buffer\n");
+                goto out;
+        }
        result = 0;
 out:
        if (result == 0)
@@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image)
        kimage_free_page_list(&image->unuseable_pages);
 }
-static int kimage_terminate(struct kimage *image)
+static void kimage_terminate(struct kimage *image)
 {
        if (*image->entry != 0)
                image->entry++;
        *image->entry = IND_DONE;
-        return 0;
 }
 #define for_each_kimage_entry(image, ptr, entry) \
@@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                if (result)
                        goto out;
+                if (flags & KEXEC_PRESERVE_CONTEXT)
+                        image->preserve_context = 1;
                result = machine_kexec_prepare(image);
                if (result)
                        goto out;
@@ -997,9 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                        if (result)
                                goto out;
                }
-                result = kimage_terminate(image);
+                kimage_terminate(image);
-                if (result)
-                        goto out;
        }
        /* Install the new kernel, and  Uninstall the old */
        image = xchg(dest_image, image);
@@ -1415,3 +1425,85 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 module_init(crash_save_vmcoreinfo_init)
+/**
+ *      kernel_kexec - reboot the system
+ *
+ *      Move into place and start executing a preloaded standalone
+ *      executable.  If nothing was preloaded return an error.
+ *
+ *      Only one caller may run at a time: kexec_lock is taken with
+ *      xchg() and, once taken, released again before returning.
+ *
+ *      If the loaded image has preserve_context set (and
+ *      CONFIG_KEXEC_JUMP is enabled), processes are frozen, the console
+ *      and devices are suspended, non-boot CPUs and local interrupts are
+ *      disabled, devices are powered down and the processor state is
+ *      saved before machine_kexec() is called.  The same steps are
+ *      undone in reverse order once machine_kexec() returns, or from
+ *      the point where one of them failed.  Otherwise the reboot
+ *      notifiers are run with SYS_RESTART and devices, sysdevs and the
+ *      machine are shut down before machine_kexec() is called.
+ *
+ *      Returns -EBUSY if kexec_lock is already held or processes could
+ *      not be frozen, -EINVAL if no image is loaded, the error of the
+ *      failing suspend step, or 0 otherwise.
+ */
+int kernel_kexec(void)
+{
+        int error = 0;
+        if (xchg(&kexec_lock, 1))
+                return -EBUSY;
+        if (!kexec_image) {
+                error = -EINVAL;
+                goto Unlock;
+        }
+        if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+                mutex_lock(&pm_mutex);
+                pm_prepare_console();
+                error = freeze_processes();
+                if (error) {
+                        error = -EBUSY;
+                        goto Restore_console;
+                }
+                suspend_console();
+                error = device_suspend(PMSG_FREEZE);
+                if (error)
+                        goto Resume_console;
+                error = disable_nonboot_cpus();
+                if (error)
+                        goto Resume_devices;
+                local_irq_disable();
+                /* At this point, device_suspend() has been called,
+                 * but *not* device_power_down(). We *must*
+                 * device_power_down() now.  Otherwise, drivers for
+                 * some devices (e.g. interrupt controllers) become
+                 * desynchronized with the actual state of the
+                 * hardware at resume time, and evil weirdness ensues.
+                 */
+                error = device_power_down(PMSG_FREEZE);
+                if (error)
+                        goto Enable_irqs;
+                save_processor_state();
+#endif
+        } else {
+                blocking_notifier_call_chain(&reboot_notifier_list,
+                                             SYS_RESTART, NULL);
+                system_state = SYSTEM_RESTART;
+                device_shutdown();
+                sysdev_shutdown();
+                printk(KERN_EMERG "Starting new kernel\n");
+                machine_shutdown();
+        }
+        machine_kexec(kexec_image);
+        if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+                restore_processor_state();
+                device_power_up(PMSG_RESTORE);
+ Enable_irqs:
+                local_irq_enable();
+                enable_nonboot_cpus();
+ Resume_devices:
+                device_resume(PMSG_RESTORE);
+ Resume_console:
+                resume_console();
+                thaw_processes();
+ Restore_console:
+                pm_restore_console();
+                mutex_unlock(&pm_mutex);
+#endif
+        }
+ Unlock:
+        xchg(&kexec_lock, 0);
+        return error;
+}
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97f..eaa21fc9ad1d 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -56,12 +56,14 @@
 static int kgdb_break_asap;
+#define KGDB_MAX_THREAD_QUERY 17
 struct kgdb_state {
        int                     ex_vector;
        int                     signo;
        int                     err_code;
        int                     cpu;
        int                     pass_exception;
+        unsigned long           thr_query;
        unsigned long           threadid;
        long                    kgdb_usethreadid;
        struct pt_regs          *linux_regs;
@@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
 * Weak aliases for breakpoint management,
 * can be overriden by architectures when needed:
 */
-int __weak kgdb_validate_break_address(unsigned long addr)
-{
-        char tmp_variable[BREAK_INSTR_SIZE];
-        return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
-}
 int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
 {
        int err;
@@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
                                  (char *)bundle, BREAK_INSTR_SIZE);
 }
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+        char tmp_variable[BREAK_INSTR_SIZE];
+        int err;
+        /* Validate the address by setting a breakpoint and then removing
+         * it again.  If the remove fails, the kernel must report it loudly:
+         * we are in deep trouble because the original instruction could
+         * not be put back the way we found it.
+         */
+        err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+        if (err)
+                return err;
+        err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+        if (err)
+                printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+                   "memory destroyed at: %lx\n", addr);
+        return err;
+}
 unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
 {
        return instruction_pointer(regs);
@@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
 {
        int hex_val;
        int num = 0;
+        int negate = 0;
        *long_val = 0;
+        if (**ptr == '-') {
+                negate = 1;
+                (*ptr)++;
+        }
        while (**ptr) {
                hex_val = hex(**ptr);
                if (hex_val < 0)
@@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
                (*ptr)++;
        }
+        if (negate)
+                *long_val = -*long_val;
        return num;
 }
@@ -515,10 +537,19 @@ static void int_to_threadref(unsigned char *id, int value)
 static struct task_struct *getthread(struct pt_regs *regs, int tid)
 {
         /*
-         * Non-positive TIDs are remapped idle tasks:
+         * Non-positive TIDs are remapped to the cpu shadow information
          */
-        if (tid <= 0)
+        if (tid == 0 || tid == -1)
-                return idle_task(-tid);
+                tid = -atomic_read(&kgdb_active) - 2;
+        if (tid < 0) {
+                /* Shadow id is -cpu - 2; refuse ids outside 0..NR_CPUS-1 */
+                if (tid == -1 || tid < -NR_CPUS - 1)
+                        return NULL;
+                if (kgdb_info[-tid - 2].task)
+                        return kgdb_info[-tid - 2].task;
+                else
+                        return idle_task(-tid - 2);
+        }
         /*
          * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -725,14 +753,15 @@ setundefined:
 }
 /*
- * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs:
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
 */
 static inline int shadow_pid(int realpid)
 {
        if (realpid)
                return realpid;
-        return -1-raw_smp_processor_id();
+        return -raw_smp_processor_id() - 2;
 }
 static char gdbmsgbuf[BUFMAX + 1];
@@ -826,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
                local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
        } else {
                local_debuggerinfo = NULL;
-                for (i = 0; i < NR_CPUS; i++) {
+                for_each_online_cpu(i) {
                        /*
                         * Try to find the task on some other
                         * or possibly this node if we do not
@@ -960,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
 /* Handle the 'q' query packets */
 static void gdb_cmd_query(struct kgdb_state *ks)
 {
-        struct task_struct *thread;
+        struct task_struct *g;
+        struct task_struct *p;
        unsigned char thref[8];
        char *ptr;
        int i;
+        int cpu;
+        int finished = 0;
        switch (remcom_in_buffer[1]) {
        case 's':
@@ -973,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks)
                        break;
                }
-                if (remcom_in_buffer[1] == 'f')
+                i = 0;
-                        ks->threadid = 1;
                remcom_out_buffer[0] = 'm';
                ptr = remcom_out_buffer + 1;
+                if (remcom_in_buffer[1] == 'f') {
-                for (i = 0; i < 17; ks->threadid++) {
+                        /* Each cpu is a shadow thread */
-                        thread = getthread(ks->linux_regs, ks->threadid);
+                        for_each_online_cpu(cpu) {
-                        if (thread) {
+                                ks->thr_query = 0;
-                                int_to_threadref(thref, ks->threadid);
+                                int_to_threadref(thref, -cpu - 2);
                                pack_threadid(ptr, thref);
                                ptr += BUF_THREAD_ID_SIZE;
                                *(ptr++) = ',';
                                i++;
                        }
                }
+                do_each_thread(g, p) {
+                        if (i >= ks->thr_query && !finished) {
+                                int_to_threadref(thref, p->pid);
+                                pack_threadid(ptr, thref);
+                                ptr += BUF_THREAD_ID_SIZE;
+                                *(ptr++) = ',';
+                                ks->thr_query++;
+                                if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+                                        finished = 1;
+                        }
+                        i++;
+                } while_each_thread(g, p);
                *(--ptr) = '\0';
                break;
@@ -1011,15 +1055,15 @@ static void gdb_cmd_query(struct kgdb_state *ks)
                        error_packet(remcom_out_buffer, -EINVAL);
                        break;
                }
-                if (ks->threadid > 0) {
+                if ((int)ks->threadid > 0) {
                        kgdb_mem2hex(getthread(ks->linux_regs,
                                        ks->threadid)->comm,
                                        remcom_out_buffer, 16);
                } else {
                        static char tmpstr[23 + BUF_THREAD_ID_SIZE];
-                        sprintf(tmpstr, "Shadow task %d for pid 0",
+                        sprintf(tmpstr, "shadowCPU%d",
-                                        (int)(-ks->threadid-1));
+                                        (int)(-ks->threadid - 2));
                        kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
                }
                break;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8df97d3dfda8..2456d1a0befb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -42,7 +42,7 @@ extern int max_threads;
 static struct workqueue_struct *khelper_wq;
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 /*
        modprobe_path is set via /proc/sys.
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
 *
 * Returns either %NULL on allocation failure, or a subprocess_info
 * structure.  This should be passed to call_usermodehelper_exec to
 * exec the process and free the structure.
 */
-struct subprocess_info *call_usermodehelper_setup(char *path,
+struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
-                                                  char **argv, char **envp)
+                                                  char **envp, gfp_t gfp_mask)
 {
        struct subprocess_info *sub_info;
-        sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
+        sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
        if (!sub_info)
                goto out;
@@ -417,12 +418,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
 {
        struct file *f;
-        f = create_write_pipe();
+        f = create_write_pipe(0);
        if (IS_ERR(f))
                return PTR_ERR(f);
        *filp = f;
-        f = create_read_pipe(f);
+        f = create_read_pipe(f, 0);
        if (IS_ERR(f)) {
                free_write_pipe(*filp);
                return PTR_ERR(f);
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
        struct subprocess_info *sub_info;
        int ret;
-        sub_info = call_usermodehelper_setup(path, argv, envp);
+        sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
        if (sub_info == NULL)
                return -ENOMEM;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e00..75bc2cd9ebc6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
        addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
 #endif
+static int kprobes_initialized;
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 static bool kprobe_enabled;
 DEFINE_MUTEX(kprobe_mutex);             /* Protects kprobe_table */
-DEFINE_SPINLOCK(kretprobe_lock);        /* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+        spinlock_t lock ____cacheline_aligned;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+        return &(kretprobe_table_locks[hash].lock);
+}
 /*
 * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
        return;
 }
-/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
                                struct hlist_head *head)
 {
+        struct kretprobe *rp = ri->rp;
        /* remove rp inst off the rprobe_inst_table */
        hlist_del(&ri->hlist);
-        if (ri->rp) {
+        INIT_HLIST_NODE(&ri->hlist);
-                /* remove rp inst off the used list */
+        if (likely(rp)) {
-                hlist_del(&ri->uflist);
+                spin_lock(&rp->lock);
-                /* put rp inst back onto the free list */
+                hlist_add_head(&ri->hlist, &rp->free_instances);
-                INIT_HLIST_NODE(&ri->uflist);
+                spin_unlock(&rp->lock);
-                hlist_add_head(&ri->uflist, &ri->rp->free_instances);
        } else
                /* Unregistering */
                hlist_add_head(&ri->hlist, head);
 }
-struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
+void kretprobe_hash_lock(struct task_struct *tsk,
+                         struct hlist_head **head, unsigned long *flags)
 {
-        return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+        unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+        spinlock_t *hlist_lock;
+        *head = &kretprobe_inst_table[hash];
+        hlist_lock = kretprobe_table_lock_ptr(hash);
+        spin_lock_irqsave(hlist_lock, *flags);
+}
+void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
+{
+        spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+        spin_lock_irqsave(hlist_lock, *flags);
+}
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+{
+        unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+        spinlock_t *hlist_lock;
+        hlist_lock = kretprobe_table_lock_ptr(hash);
+        spin_unlock_irqrestore(hlist_lock, *flags);
+}
+void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+{
+        spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+        spin_unlock_irqrestore(hlist_lock, *flags);
 }
 /*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
         struct kretprobe_instance *ri;
         struct hlist_head *head, empty_rp;
         struct hlist_node *node, *tmp;
-        unsigned long flags = 0;
+        unsigned long hash, flags = 0;
-        INIT_HLIST_HEAD(&empty_rp);
+        if (unlikely(!kprobes_initialized))
-        spin_lock_irqsave(&kretprobe_lock, flags);
+                /* Early boot.  kretprobe_table_locks not yet initialized. */
-        head = kretprobe_inst_table_head(tk);
+                return;
+        INIT_HLIST_HEAD(&empty_rp);     /* before recycle_rp_inst() fills it */
+        hash = hash_ptr(tk, KPROBE_HASH_BITS);
+        head = &kretprobe_inst_table[hash];
+        kretprobe_table_lock(hash, &flags);
         hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                 if (ri->task == tk)
                         recycle_rp_inst(ri, &empty_rp);
         }
-        spin_unlock_irqrestore(&kretprobe_lock, flags);
+        kretprobe_table_unlock(hash, &flags);
         hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
                 hlist_del(&ri->hlist);
                 kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
        struct kretprobe_instance *ri;
        struct hlist_node *pos, *next;
-        hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
+        hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
-                hlist_del(&ri->uflist);
+                hlist_del(&ri->hlist);
                kfree(ri);
        }
 }
 static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
 {
-        unsigned long flags;
+        unsigned long flags, hash;
        struct kretprobe_instance *ri;
        struct hlist_node *pos, *next;
+        struct hlist_head *head;
        /* No race here */
-        spin_lock_irqsave(&kretprobe_lock, flags);
+        for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
-        hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
+                kretprobe_table_lock(hash, &flags);
-                ri->rp = NULL;
+                head = &kretprobe_inst_table[hash];
-                hlist_del(&ri->uflist);
+                hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+                        if (ri->rp == rp)
+                                ri->rp = NULL;
+                }
+                kretprobe_table_unlock(hash, &flags);
        }
-        spin_unlock_irqrestore(&kretprobe_lock, flags);
        free_rp_inst(rp);
 }
@@ -831,32 +875,40 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
                                            struct pt_regs *regs)
 {
         struct kretprobe *rp = container_of(p, struct kretprobe, kp);
-        unsigned long flags = 0;
+        unsigned long hash, flags = 0;
+        struct kretprobe_instance *ri;
         /*TODO: consider to only swap the RA after the last pre_handler fired */
-        spin_lock_irqsave(&kretprobe_lock, flags);
+        hash = hash_ptr(current, KPROBE_HASH_BITS);
+        spin_lock_irqsave(&rp->lock, flags);
         if (!hlist_empty(&rp->free_instances)) {
-                struct kretprobe_instance *ri;
                 ri = hlist_entry(rp->free_instances.first,
-                                 struct kretprobe_instance, uflist);
+                                struct kretprobe_instance, hlist);
+                hlist_del(&ri->hlist);
+                spin_unlock_irqrestore(&rp->lock, flags);
                 ri->rp = rp;
                 ri->task = current;
                 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-                        spin_unlock_irqrestore(&kretprobe_lock, flags);
+                        /* rp->lock was dropped above; hand ri back unused */
+                        spin_lock_irqsave(&rp->lock, flags);
+                        hlist_add_head(&ri->hlist, &rp->free_instances);
+                        spin_unlock_irqrestore(&rp->lock, flags);
                         return 0;
                 }
                 arch_prepare_kretprobe(ri, regs);
                 /* XXX(hch): why is there no hlist_move_head? */
-                hlist_del(&ri->uflist);
+                INIT_HLIST_NODE(&ri->hlist);
-                hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+                kretprobe_table_lock(hash, &flags);
-                hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
+                hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
-        } else
+                kretprobe_table_unlock(hash, &flags);
+        } else {
                 rp->nmissed++;
-        spin_unlock_irqrestore(&kretprobe_lock, flags);
+                spin_unlock_irqrestore(&rp->lock, flags);
+        }
         return 0;
 }
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
                rp->maxactive = NR_CPUS;
 #endif
        }
-        INIT_HLIST_HEAD(&rp->used_instances);
+        spin_lock_init(&rp->lock);
        INIT_HLIST_HEAD(&rp->free_instances);
        for (i = 0; i < rp->maxactive; i++) {
                inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
                        free_rp_inst(rp);
                        return -ENOMEM;
                }
-                INIT_HLIST_NODE(&inst->uflist);
+                INIT_HLIST_NODE(&inst->hlist);
-                hlist_add_head(&inst->uflist, &rp->free_instances);
+                hlist_add_head(&inst->hlist, &rp->free_instances);
        }
        rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                INIT_HLIST_HEAD(&kprobe_table[i]);
                INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+                spin_lock_init(&(kretprobe_table_locks[i].lock));
        }
        /*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
        err = arch_init_kprobes();
        if (!err)
                err = register_die_notifier(&kprobe_exceptions_nb);
+        kprobes_initialized = (err == 0);
        if (!err)
                init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
 EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(register_jprobes);
 EXPORT_SYMBOL_GPL(unregister_jprobes);
-#ifdef CONFIG_KPROBES
 EXPORT_SYMBOL_GPL(jprobe_return);
-#endif
-#ifdef CONFIG_KPROBES
 EXPORT_SYMBOL_GPL(register_kretprobe);
 EXPORT_SYMBOL_GPL(unregister_kretprobe);
 EXPORT_SYMBOL_GPL(register_kretprobes);
 EXPORT_SYMBOL_GPL(unregister_kretprobes);
-#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ac3fb7326641..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create)
                 */
                sched_setscheduler(create->result, SCHED_NORMAL, &param);
                set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-                set_cpus_allowed(create->result, CPU_MASK_ALL);
+                set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
        }
        complete(&create->done);
 }
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
                return;
        }
        /* Must have done schedule() in kthread() before we set_task_cpu */
-        wait_task_inactive(k);
+        wait_task_inactive(k, 0);
        set_task_cpu(k, cpu);
        k->cpus_allowed = cpumask_of_cpu(cpu);
        k->rt.nr_cpus_allowed = 1;
@@ -233,7 +233,7 @@ int kthreadd(void *unused)
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_user_nice(tsk, KTHREAD_NICE_LEVEL);
-        set_cpus_allowed(tsk, CPU_MASK_ALL);
+        set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
        current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/marker.c b/kernel/marker.c
index 1abfb923b761..7d1faecd7a51 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
                struct marker_probe_closure *multi;
                int i;
                /*
+                 * Read mdata->ptype before mdata->multi.
+                 */
+                smp_rmb();
+                multi = mdata->multi;
+                /*
                 * multi points to an array, therefore accessing the array
                 * depends on reading multi. However, even in this case,
                 * we must insure that the pointer is read _before_ the array
@@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
                 * in the fast path, so put the explicit barrier here.
                 */
                smp_read_barrier_depends();
-                multi = mdata->multi;
                for (i = 0; multi[i].func; i++) {
                        va_start(args, call_private);
                        multi[i].func(multi[i].probe_private, call_private,
@@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
                struct marker_probe_closure *multi;
                int i;
                /*
+                 * Read mdata->ptype before mdata->multi.
+                 */
+                smp_rmb();
+                multi = mdata->multi;
+                /*
                 * multi points to an array, therefore accessing the array
                 * depends on reading multi. However, even in this case,
                 * we must insure that the pointer is read _before_ the array
@@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
                 * in the fast path, so put the explicit barrier here.
                 */
                smp_read_barrier_depends();
-                multi = mdata->multi;
                for (i = 0; multi[i].func; i++)
                        multi[i].func(multi[i].probe_private, call_private,
                                mdata->format, &args);
@@ -441,7 +449,7 @@ static int remove_marker(const char *name)
        hlist_del(&e->hlist);
        /* Make sure the call_rcu has been executed */
        if (e->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        kfree(e);
        return 0;
 }
@@ -476,7 +484,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
        hlist_del(&(*entry)->hlist);
        /* Make sure the call_rcu has been executed */
        if ((*entry)->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        kfree(*entry);
        *entry = e;
        trace_mark(core_marker_format, "name %s format %s",
@@ -655,7 +663,7 @@ int marker_probe_register(const char *name, const char *format,
         * make sure it's executed now.
         */
        if (entry->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        old = marker_entry_add_probe(entry, probe, probe_private);
        if (IS_ERR(old)) {
                ret = PTR_ERR(old);
@@ -670,10 +678,7 @@ int marker_probe_register(const char *name, const char *format,
        entry->rcu_pending = 1;
        /* write rcu_pending before calling the RCU callback */
        smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
+        call_rcu_sched(&entry->rcu, free_old_closure);
-        synchronize_sched();    /* Until we have the call_rcu_sched() */
-#endif
-        call_rcu(&entry->rcu, free_old_closure);
 end:
        mutex_unlock(&markers_mutex);
        return ret;
@@ -704,7 +709,7 @@ int marker_probe_unregister(const char *name,
        if (!entry)
                goto end;
        if (entry->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        old = marker_entry_remove_probe(entry, probe, probe_private);
        mutex_unlock(&markers_mutex);
        marker_update_probes();         /* may update entry */
@@ -716,10 +721,7 @@ int marker_probe_unregister(const char *name,
        entry->rcu_pending = 1;
        /* write rcu_pending before calling the RCU callback */
        smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
+        call_rcu_sched(&entry->rcu, free_old_closure);
-        synchronize_sched();    /* Until we have the call_rcu_sched() */
-#endif
-        call_rcu(&entry->rcu, free_old_closure);
        remove_marker(name);    /* Ignore busy error message */
        ret = 0;
 end:
@@ -786,7 +788,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
                goto end;
        }
        if (entry->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        old = marker_entry_remove_probe(entry, NULL, probe_private);
        mutex_unlock(&markers_mutex);
        marker_update_probes();         /* may update entry */
@@ -797,10 +799,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
        entry->rcu_pending = 1;
        /* write rcu_pending before calling the RCU callback */
        smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
+        call_rcu_sched(&entry->rcu, free_old_closure);
-        synchronize_sched();    /* Until we have the call_rcu_sched() */
-#endif
-        call_rcu(&entry->rcu, free_old_closure);
        remove_marker(entry->name);     /* Ignore busy error message */
 end:
        mutex_unlock(&markers_mutex);
diff --git a/kernel/module.c b/kernel/module.c
index 5f80478b746d..61d212120df4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+/* Bounds of module allocation, for speeding __module_text_address */
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
 int register_module_notifier(struct notifier_block * nb)
 {
        return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -134,17 +137,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
 extern const unsigned long __start___kcrctab_gpl_future[];
+#ifdef CONFIG_UNUSED_SYMBOLS
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
 extern const unsigned long __start___kcrctab_unused[];
 extern const unsigned long __start___kcrctab_unused_gpl[];
+#endif
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -152,152 +157,170 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
-        const struct kernel_symbol *start,
-        const struct kernel_symbol *stop)
-{
-        const struct kernel_symbol *ks = start;
-        for (; ks < stop; ks++)
-                if (strcmp(ks->name, name) == 0)
-                        return ks;
-        return NULL;
-}
-static bool always_ok(bool gplok, bool warn, const char *name)
-{
-        return true;
-}
-static bool printk_unused_warning(bool gplok, bool warn, const char *name)
-{
-        if (warn) {
-                printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
-                       "however this module is using it.\n", name);
-                printk(KERN_WARNING
-                       "This symbol will go away in the future.\n");
-                printk(KERN_WARNING
-                       "Please evalute if this is the right api to use and if "
-                       "it really is, submit a report the linux kernel "
-                       "mailinglist together with submitting your code for "
-                       "inclusion.\n");
-        }
-        return true;
-}
-static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
-{
-        if (!gplok)
-                return false;
-        return printk_unused_warning(gplok, warn, name);
-}
-static bool gpl_only(bool gplok, bool warn, const char *name)
-{
-        return gplok;
-}
-static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
-{
-        if (!gplok && warn) {
-                printk(KERN_WARNING "Symbol %s is being used "
-                       "by a non-GPL module, which will not "
-                       "be allowed in the future\n", name);
-                printk(KERN_WARNING "Please see the file "
-                       "Documentation/feature-removal-schedule.txt "
-                       "in the kernel source tree for more details.\n");
-        }
-        return true;
-}
 struct symsearch {
        const struct kernel_symbol *start, *stop;
        const unsigned long *crcs;
-        bool (*check)(bool gplok, bool warn, const char *name);
+        enum {
+                NOT_GPL_ONLY,
+                GPL_ONLY,
+                WILL_BE_GPL_ONLY,
+        } licence;
+        bool unused;
 };
-/* Look through this array of symbol tables for a symbol match which
+static bool each_symbol_in_section(const struct symsearch *arr,
- * passes the check function. */
+                                   unsigned int arrsize,
-static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
+                                   struct module *owner,
-                                                    unsigned int num,
+                                   bool (*fn)(const struct symsearch *syms,
-                                                    const char *name,
+                                              struct module *owner,
-                                                    bool gplok,
+                                              unsigned int symnum, void *data),
-                                                    bool warn,
+                                   void *data)
-                                                    const unsigned long **crc)
 {
-        unsigned int i;
+        unsigned int i, j;
-        const struct kernel_symbol *ks;
-        for (i = 0; i < num; i++) {
-                ks = lookup_symbol(name, arr[i].start, arr[i].stop);
-                if (!ks || !arr[i].check(gplok, warn, name))
-                        continue;
-                if (crc)
+        for (j = 0; j < arrsize; j++) {
-                        *crc = symversion(arr[i].crcs, ks - arr[i].start);
+                for (i = 0; i < arr[j].stop - arr[j].start; i++)
-                return ks;
+                        if (fn(&arr[j], owner, i, data))
+                                return true;
        }
-        return NULL;
+        return false;
 }
-/* Find a symbol, return value, (optional) crc and (optional) module
+/* Returns true as soon as fn returns true, otherwise false. */
- * which owns it */
+static bool each_symbol(bool (*fn)(const struct symsearch *arr,
-static unsigned long find_symbol(const char *name,
+                                   struct module *owner,
-                                 struct module **owner,
+                                   unsigned int symnum, void *data),
-                                 const unsigned long **crc,
+                        void *data)
-                                 bool gplok,
-                                 bool warn)
 {
        struct module *mod;
-        const struct kernel_symbol *ks;
        const struct symsearch arr[] = {
                { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
-                  always_ok },
+                  NOT_GPL_ONLY, false },
                { __start___ksymtab_gpl, __stop___ksymtab_gpl,
-                  __start___kcrctab_gpl, gpl_only },
+                  __start___kcrctab_gpl,
+                  GPL_ONLY, false },
                { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
-                  __start___kcrctab_gpl_future, warn_if_not_gpl },
+                  __start___kcrctab_gpl_future,
+                  WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
                { __start___ksymtab_unused, __stop___ksymtab_unused,
-                  __start___kcrctab_unused, printk_unused_warning },
+                  __start___kcrctab_unused,
+                  NOT_GPL_ONLY, true },
                { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
-                  __start___kcrctab_unused_gpl, gpl_only_unused_warning },
+                  __start___kcrctab_unused_gpl,
+                  GPL_ONLY, true },
+#endif
        };
-        /* Core kernel first. */
+        if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
-        ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
+                return true;
-        if (ks) {
-                if (owner)
-                        *owner = NULL;
-                return ks->value;
-        }
-        /* Now try modules. */
        list_for_each_entry(mod, &modules, list) {
                struct symsearch arr[] = {
                        { mod->syms, mod->syms + mod->num_syms, mod->crcs,
-                          always_ok },
+                          NOT_GPL_ONLY, false },
                        { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
-                          mod->gpl_crcs, gpl_only },
+                          mod->gpl_crcs,
+                          GPL_ONLY, false },
                        { mod->gpl_future_syms,
                          mod->gpl_future_syms + mod->num_gpl_future_syms,
-                          mod->gpl_future_crcs, warn_if_not_gpl },
+                          mod->gpl_future_crcs,
+                          WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
                        { mod->unused_syms,
                          mod->unused_syms + mod->num_unused_syms,
-                          mod->unused_crcs, printk_unused_warning },
+                          mod->unused_crcs,
+                          NOT_GPL_ONLY, true },
                        { mod->unused_gpl_syms,
                          mod->unused_gpl_syms + mod->num_unused_gpl_syms,
-                          mod->unused_gpl_crcs, gpl_only_unused_warning },
+                          mod->unused_gpl_crcs,
+                          GPL_ONLY, true },
+#endif
                };
-                ks = search_symarrays(arr, ARRAY_SIZE(arr),
+                if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
-                                      name, gplok, warn, crc);
+                        return true;
-                if (ks) {
+        }
-                        if (owner)
+        return false;
-                                *owner = mod;
+}
-                        return ks->value;
+struct find_symbol_arg {
+        /* Input: the symbol name to match, whether GPL-only symbols may
+         * be returned, and whether to printk warnings about the match. */
+        const char *name;
+        bool gplok;
+        bool warn;
+        /* Output: written by find_symbol_in_section() when it accepts a
+         * symbol; left untouched otherwise. */
+        struct module *owner;
+        const unsigned long *crc;
+        unsigned long value;
+};
+static bool find_symbol_in_section(const struct symsearch *syms,
+                                   struct module *owner,
+                                   unsigned int symnum, void *data)
+{
+        struct find_symbol_arg *fsa = data;
+        if (strcmp(syms->start[symnum].name, fsa->name) != 0)
+                return false;
+        if (!fsa->gplok) {
+                if (syms->licence == GPL_ONLY)
+                        return false;
+                if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+                        printk(KERN_WARNING "Symbol %s is being used "
+                               "by a non-GPL module, which will not "
+                               "be allowed in the future\n", fsa->name);
+                        printk(KERN_WARNING "Please see the file "
+                               "Documentation/feature-removal-schedule.txt "
+                               "in the kernel source tree for more details.\n");
                }
        }
+#ifdef CONFIG_UNUSED_SYMBOLS
+        if (syms->unused && fsa->warn) {
+                printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+                       "however this module is using it.\n", fsa->name);
+                printk(KERN_WARNING
+                       "This symbol will go away in the future.\n");
+                printk(KERN_WARNING
+                       "Please evalute if this is the right api to use and if "
+                       "it really is, submit a report the linux kernel "
+                       "mailinglist together with submitting your code for "
+                       "inclusion.\n");
+        }
+#endif
+        fsa->owner = owner;
+        fsa->crc = symversion(syms->crcs, symnum);
+        fsa->value = syms->start[symnum].value;
+        return true;
+}
+/* Find a symbol, return value, (optional) crc and (optional) module
+ * which owns it.  @owner and @crc may be NULL; they are only written
+ * when the symbol is found.  If no symbol is accepted, -ENOENT is
+ * returned (converted to unsigned long). */
+static unsigned long find_symbol(const char *name,
+                                 struct module **owner,
+                                 const unsigned long **crc,
+                                 bool gplok,
+                                 bool warn)
+{
+        struct find_symbol_arg fsa;
+        fsa.name = name;
+        fsa.gplok = gplok;
+        fsa.warn = warn;
+        if (each_symbol(find_symbol_in_section, &fsa)) {
+                if (owner)
+                        *owner = fsa.owner;
+                if (crc)
+                        *crc = fsa.crc;
+                return fsa.value;
+        }
        DEBUGP("Failed to find symbol %s\n", name);
        return -ENOENT;
 }
@@ -639,8 +662,8 @@ static int __try_stop_module(void *_sref)
 {
        struct stopref *sref = _sref;
-        /* If it's not unused, quit unless we are told to block. */
+        /* If it's not unused, quit unless we're forcing. */
-        if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
+        if (module_refcount(sref->mod) != 0) {
                if (!(*sref->forced = try_force_unload(sref->flags)))
                        return -EWOULDBLOCK;
        }
@@ -652,9 +675,16 @@ static int __try_stop_module(void *_sref)
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-        struct stopref sref = { mod, flags, forced };
+        if (flags & O_NONBLOCK) {
+                struct stopref sref = { mod, flags, forced };
-        return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+                return stop_machine(__try_stop_module, &sref, NULL);
+        } else {
+                /* We don't need to stop the machine for this. */
+                mod->state = MODULE_STATE_GOING;
+                synchronize_sched();
+                return 0;
+        }
 }
 unsigned int module_refcount(struct module *mod)
@@ -1386,7 +1416,7 @@ static int __unlink_module(void *_mod)
 static void free_module(struct module *mod)
 {
        /* Delete from various lists */
-        stop_machine_run(__unlink_module, mod, NR_CPUS);
+        stop_machine(__unlink_module, mod, NULL);
        remove_notes_attrs(mod);
        remove_sect_attrs(mod);
        mod_kobject_remove(mod);
@@ -1445,8 +1475,10 @@ static int verify_export_symbols(struct module *mod)
                { mod->syms, mod->num_syms },
                { mod->gpl_syms, mod->num_gpl_syms },
                { mod->gpl_future_syms, mod->num_gpl_future_syms },
+#ifdef CONFIG_UNUSED_SYMBOLS
                { mod->unused_syms, mod->num_unused_syms },
                { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+#endif
        };
        for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -1526,7 +1558,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 }
 /* Update size with this section: return offset. */
-static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
+static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
 {
        long ret;
@@ -1659,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 }
 #ifdef CONFIG_KALLSYMS
+/* Linear scan of the kernel_symbol range [start, stop) for an entry
+ * whose name equals @name.  Returns the first match, or NULL if the
+ * range holds no such entry. */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+        const struct kernel_symbol *start,
+        const struct kernel_symbol *stop)
+{
+        const struct kernel_symbol *sym;
+        for (sym = start; sym < stop; sym++) {
+                if (!strcmp(sym->name, name))
+                        return sym;
+        }
+        return NULL;
+}
 static int is_exported(const char *name, const struct module *mod)
 {
        if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
@@ -1738,6 +1783,20 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
+/* Allocate @size bytes with module_alloc() and, when that yields a
+ * non-NULL region, widen [module_addr_min, module_addr_max] so that it
+ * covers the new region.  Returns whatever module_alloc() returned. */
+static void *module_alloc_update_bounds(unsigned long size)
+{
+        void *region = module_alloc(size);
+        unsigned long lo = (unsigned long)region;
+        unsigned long hi = lo + size;
+        if (!region)
+                return NULL;
+        /* The window is only ever widened here, never shrunk. */
+        if (lo < module_addr_min)
+                module_addr_min = lo;
+        if (hi > module_addr_max)
+                module_addr_max = hi;
+        return region;
+}
 /* Allocate and load the module: note that size of section 0 is always
   zero, and we rely on this for optional sections. */
 static struct module *load_module(void __user *umod,
@@ -1764,10 +1823,12 @@ static struct module *load_module(void __user *umod,
        unsigned int gplfutureindex;
        unsigned int gplfuturecrcindex;
        unsigned int unwindex = 0;
+#ifdef CONFIG_UNUSED_SYMBOLS
        unsigned int unusedindex;
        unsigned int unusedcrcindex;
        unsigned int unusedgplindex;
        unsigned int unusedgplcrcindex;
+#endif
        unsigned int markersindex;
        unsigned int markersstringsindex;
        struct module *mod;
@@ -1850,13 +1911,15 @@ static struct module *load_module(void __user *umod,
        exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
        gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
        gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
-        unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
-        unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
        crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
        gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
        gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
+#ifdef CONFIG_UNUSED_SYMBOLS
+        unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
+        unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
        unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
        unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
+#endif
        setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
        exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
        obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1935,7 +1998,7 @@ static struct module *load_module(void __user *umod,
        layout_sections(mod, hdr, sechdrs, secstrings);
        /* Do the allocs. */
-        ptr = module_alloc(mod->core_size);
+        ptr = module_alloc_update_bounds(mod->core_size);
        if (!ptr) {
                err = -ENOMEM;
                goto free_percpu;
@@ -1943,7 +2006,7 @@ static struct module *load_module(void __user *umod,
        memset(ptr, 0, mod->core_size);
        mod->module_core = ptr;
-        ptr = module_alloc(mod->init_size);
+        ptr = module_alloc_update_bounds(mod->init_size);
        if (!ptr && mod->init_size) {
                err = -ENOMEM;
                goto free_core;
@@ -2018,14 +2081,15 @@ static struct module *load_module(void __user *umod,
                mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
        mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
                                        sizeof(*mod->gpl_future_syms);
-        mod->num_unused_syms = sechdrs[unusedindex].sh_size /
-                                        sizeof(*mod->unused_syms);
-        mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
-                                        sizeof(*mod->unused_gpl_syms);
        mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
        if (gplfuturecrcindex)
                mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
+#ifdef CONFIG_UNUSED_SYMBOLS
+        mod->num_unused_syms = sechdrs[unusedindex].sh_size /
+                                        sizeof(*mod->unused_syms);
+        mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
+                                        sizeof(*mod->unused_gpl_syms);
        mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
        if (unusedcrcindex)
                mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
@@ -2033,13 +2097,17 @@ static struct module *load_module(void __user *umod,
        if (unusedgplcrcindex)
                mod->unused_gpl_crcs
                        = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+#endif
 #ifdef CONFIG_MODVERSIONS
-        if ((mod->num_syms && !crcindex) ||
+        if ((mod->num_syms && !crcindex)
-            (mod->num_gpl_syms && !gplcrcindex) ||
+            || (mod->num_gpl_syms && !gplcrcindex)
-            (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
+            || (mod->num_gpl_future_syms && !gplfuturecrcindex)
-            (mod->num_unused_syms && !unusedcrcindex) ||
+#ifdef CONFIG_UNUSED_SYMBOLS
-            (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
+            || (mod->num_unused_syms && !unusedcrcindex)
+            || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
+#endif
+                ) {
                printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
                err = try_to_force_load(mod, "nocrc");
                if (err)
@@ -2129,7 +2197,7 @@ static struct module *load_module(void __user *umod,
        /* Now sew it into the lists so we can get lockdep and oops
         * info during argument parsing.  Noone should access us, since
         * strong_try_module_get() will fail. */
-        stop_machine_run(__link_module, mod, NR_CPUS);
+        stop_machine(__link_module, mod, NULL);
        /* Size of section 0 is 0, so this works well if no params */
        err = parse_args(mod->name, mod->args,
@@ -2163,7 +2231,7 @@ static struct module *load_module(void __user *umod,
        return mod;
 unlink:
-        stop_machine_run(__unlink_module, mod, NR_CPUS);
+        stop_machine(__unlink_module, mod, NULL);
        module_arch_cleanup(mod);
 cleanup:
        kobject_del(&mod->mkobj.kobj);
@@ -2512,7 +2580,7 @@ static int m_show(struct seq_file *m, void *p)
        struct module *mod = list_entry(p, struct module, list);
        char buf[8];
-        seq_printf(m, "%s %lu",
+        seq_printf(m, "%s %u",
                   mod->name, mod->init_size + mod->core_size);
        print_unload_info(m, mod);
@@ -2595,6 +2663,9 @@ struct module *__module_text_address(unsigned long addr)
 {
        struct module *mod;
+        if (addr < module_addr_min || addr > module_addr_max)
+                return NULL;
        list_for_each_entry(mod, &modules, list)
                if (within(addr, mod->module_init, mod->init_text_size)
                    || within(addr, mod->module_core, mod->core_text_size))
diff --git a/kernel/mutex.c b/kernel/mutex.c
index bcdc9ac8ef60..12c779dc65d4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -34,6 +34,7 @@
 /***
 * mutex_init - initialize the mutex
 * @lock: the mutex to be initialized
+ * @key: the lock_class_key for the class; used by mutex lock debugging
 *
 * Initialize the mutex to unlocked state.
 *
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
                            struct ns_cgroup, css);
 }
-int ns_cgroup_clone(struct task_struct *task)
+int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 {
-        return cgroup_clone(task, &ns_subsys);
+        char name[PROC_NUMBUF];
+        snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
+        return cgroup_clone(task, &ns_subsys, name);
 }
 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..21575fc46d05 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
                goto out;
        }
-        err = ns_cgroup_clone(tsk);
-        if (err) {
-                put_nsproxy(new_ns);
-                goto out;
-        }
        tsk->nsproxy = new_ns;
 out:
@@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
                goto out;
        }
-        err = ns_cgroup_clone(current);
+        err = ns_cgroup_clone(current, task_pid(current));
        if (err)
                put_nsproxy(*new_nsp);
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..12c5a0a6c89b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line)
        add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_on_slowpath);
+/* Out-of-line warning reporter that takes a printf-style message.
+ * Emits the "cut here" banner and a "WARNING: at file:line sym()" header
+ * (sym is derived from this call's return address via sprint_symbol()),
+ * then the formatted message, then calls print_modules(), dump_stack()
+ * and print_oops_end_marker(), and finally sets TAINT_WARN. */
+void warn_slowpath(const char *file, int line, const char *fmt, ...)
+{
+        char sym[KSYM_SYMBOL_LEN];
+        va_list ap;
+        sprint_symbol(sym, (unsigned long)__builtin_return_address(0));
+        printk(KERN_WARNING "------------[ cut here ]------------\n");
+        printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, sym);
+        va_start(ap, fmt);
+        vprintk(fmt, ap);
+        va_end(ap);
+        print_modules();
+        dump_stack();
+        print_oops_end_marker();
+        add_taint(TAINT_WARN);
+}
+EXPORT_SYMBOL(warn_slowpath);
 #endif
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4b2ac7..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
-struct pid *find_pid(int nr)
-{
-        return find_pid_ns(nr, &init_pid_ns);
-}
-EXPORT_SYMBOL_GPL(find_pid);
 /*
 * attach_pid() must be called with the tasklist_lock write-held.
 */
@@ -435,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
        return pid;
 }
+EXPORT_SYMBOL_GPL(find_get_pid);
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 {
@@ -482,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
 /*
 * Used by proc to find the first pid that is greater then or equal to nr.
 *
- * If there is a pid at nr this function is exactly the same as find_pid.
+ * If there is a pid at nr this function is exactly the same as find_pid_ns.
 */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
@@ -497,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
        return pid;
 }
-EXPORT_SYMBOL_GPL(find_get_pid);
 /*
 * The pid hash table is scaled according to the amount of memory in the
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..ea567b78d1aa 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
+#include <linux/acct.h>
 #define BITS_PER_PAGE           (PAGE_SIZE*8)
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
        struct pid_namespace *ns;
        int i;
-        ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
+        ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
        if (ns == NULL)
                goto out;
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
                goto out_free_map;
        kref_init(&ns->kref);
-        ns->last_pid = 0;
-        ns->child_reaper = NULL;
        ns->level = level;
        set_bit(0, ns->pidmap[0].page);
        atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-        for (i = 1; i < PIDMAP_ENTRIES; i++) {
+        for (i = 1; i < PIDMAP_ENTRIES; i++)
-                ns->pidmap[i].page = NULL;
                atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-        }
        return ns;
@@ -185,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
        /* Child reaper for the pid namespace is going away */
        pid_ns->child_reaper = NULL;
+        acct_exit_ns(pid_ns);
        return;
 }
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 8cb757026386..da9c2dda6a4e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -24,7 +24,7 @@
 * requirement that the application has is cleaned up when closes the file
 * pointer or exits the pm_qos_object will get an opportunity to clean up.
 *
- * mark gross mgross@linux.intel.com
+ * Mark Gross <mgross@linux.intel.com>
 */
 #include <linux/pm_qos_params.h>
@@ -211,8 +211,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement);
 * @value: defines the qos request
 *
 * This function inserts a new entry in the pm_qos_class list of requested qos
- * performance charactoistics.  It recomputes the agregate QoS expectations for
+ * performance characteristics.  It recomputes the aggregate QoS expectations
- * the pm_qos_class of parrameters.
+ * for the pm_qos_class of parameters.
 */
 int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
 {
@@ -250,10 +250,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
 * @name: identifies the request
 * @value: defines the qos request
 *
- * Updates an existing qos requierement for the pm_qos_class of parameters along
+ * Updates an existing qos requirement for the pm_qos_class of parameters along
 * with updating the target pm_qos_class value.
 *
- * If the named request isn't in the lest then no change is made.
+ * If the named request isn't in the list then no change is made.
 */
 int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
 {
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
 * @pm_qos_class: identifies which list of qos request to us
 * @name: identifies the request
 *
- * Will remove named qos request from pm_qos_class list of parrameters and
+ * Will remove named qos request from pm_qos_class list of parameters and
 * recompute the current target value for the pm_qos_class.
 */
 void pm_qos_remove_requirement(int pm_qos_class, char *name)
@@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
 * @notifier: notifier block managed by caller.
 *
 * will register the notifier into a notification chain that gets called
- * uppon changes to the pm_qos_class target value.
+ * upon changes to the pm_qos_class target value.
 */
 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
 {
@@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
 * @notifier: notifier block to be removed.
 *
 * will remove the notifier from the notification chain that gets called
- * uppon changes to the pm_qos_class target value.
+ * upon changes to the pm_qos_class target value.
 */
 int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..9a21681aa80f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
                spin_unlock_irqrestore(&idr_lock, flags);
        }
        sigqueue_free(tmr->sigq);
-        if (unlikely(tmr->it_process) &&
-            tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                put_task_struct(tmr->it_process);
        kmem_cache_free(posix_timers_cache, tmr);
 }
@@ -856,11 +853,10 @@ retry_delete:
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
-        if (timer->it_process) {
+        if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+                put_task_struct(timer->it_process);
-                        put_task_struct(timer->it_process);
+        timer->it_process = NULL;
-                timer->it_process = NULL;
-        }
        unlock_timer(timer, flags);
        release_posix_timer(timer, IT_ID_SET);
        return 0;
@@ -885,11 +881,10 @@ retry_delete:
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
-        if (timer->it_process) {
+        if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+                put_task_struct(timer->it_process);
-                        put_task_struct(timer->it_process);
+        timer->it_process = NULL;
-                timer->it_process = NULL;
-        }
        unlock_timer(timer, flags);
        release_posix_timer(timer, IT_ID_SET);
 }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..dcd165f92a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
 config PM_SLEEP
        bool
-        depends on SUSPEND || HIBERNATION
+        depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
        default y
 config SUSPEND
@@ -94,6 +94,17 @@ config SUSPEND
          powered and thus its contents are preserved, such as the
          suspend-to-RAM state (e.g. the ACPI S3 state).
+config PM_TEST_SUSPEND
+        bool "Test suspend/resume and wakealarm during bootup"
+        depends on SUSPEND && PM_DEBUG && RTC_LIB=y
+        ---help---
+        This option will let you suspend your machine during bootup, and
+        make it wake up a few seconds later using an RTC wakeup alarm.
+        Enable this with a kernel parameter like "test_suspend=mem".
+        You probably want to have your system's RTC driver statically
+        linked, ensuring that it's available when this test runs.
 config SUSPEND_FREEZER
        bool "Enable freezer for suspend to RAM/standby" \
                if ARCH_WANTS_FREEZER_CONTROL || BROKEN
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3398f4651aa1..0b7476f5d2a6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; }
 #ifdef CONFIG_SUSPEND
+#ifdef CONFIG_PM_TEST_SUSPEND
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending.  Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS    5
+static unsigned long suspend_test_start_time;
+/* Record the current jiffies value as the start of a timed test phase. */
+static void suspend_test_start(void)
+{
+        /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+         * What we want is a hardware counter that will work correctly even
+         * during the irqs-are-off stages of the suspend/resume cycle...
+         */
+        suspend_test_start_time = jiffies;
+}
+/*
+ * Report how long the phase named by @label took since the matching
+ * suspend_test_start() call, and WARN if it exceeded the RTC alarm period.
+ */
+static void suspend_test_finish(const char *label)
+{
+        long elapsed_jiffies = jiffies - suspend_test_start_time;
+        unsigned elapsed_ms = jiffies_to_msecs(abs(elapsed_jiffies));
+        unsigned whole_secs = elapsed_ms / 1000;
+        unsigned frac_ms = elapsed_ms % 1000;
+        pr_info("PM: %s took %d.%03d seconds\n", label, whole_secs, frac_ms);
+        /* A warning while suspending means the RTC alarm period is too
+         * short: the system was so slow to suspend that the alarm
+         * (should have) fired before the system went to sleep!
+         *
+         * A warning on either suspend or resume also points at a
+         * performance problem.  The stack dump from WARN_ON is more
+         * likely to get attention than a plain printk...
+         */
+        WARN_ON((TEST_SUSPEND_SECONDS * 1000) < elapsed_ms);
+}
+#else
+/* CONFIG_PM_TEST_SUSPEND is not set: the timing hooks are empty stubs. */
+static void suspend_test_start(void)
+{
+}
+static void suspend_test_finish(const char *label)
+{
+}
+#endif
 /* This is just an arbitrary number */
 #define FREE_PAGE_NUMBER (100)
@@ -266,12 +321,13 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
+        suspend_test_start();
        error = device_suspend(PMSG_SUSPEND);
        if (error) {
                printk(KERN_ERR "PM: Some devices failed to suspend\n");
                goto Recover_platform;
        }
+        suspend_test_finish("suspend devices");
        if (suspend_test(TEST_DEVICES))
                goto Recover_platform;
@@ -293,7 +349,9 @@ int suspend_devices_and_enter(suspend_state_t state)
        if (suspend_ops->finish)
                suspend_ops->finish();
 Resume_devices:
+        suspend_test_start();
        device_resume(PMSG_RESUME);
+        suspend_test_finish("resume devices");
        resume_console();
 Close:
        if (suspend_ops->end)
@@ -521,3 +579,144 @@ static int __init pm_init(void)
 }
 core_initcall(pm_init);
+#ifdef CONFIG_PM_TEST_SUSPEND
+#include <linux/rtc.h>
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system.  RTC wake alarms are a common self-contained mechanism.
+ */
+/*
+ * test_wakealarm - arm an RTC alarm, then suspend so the alarm can wake us
+ * @rtc: RTC device used to read the time and to set the alarm
+ * @state: suspend state to test
+ *
+ * Reads the current RTC time, programs an alarm TEST_SUSPEND_SECONDS later
+ * and calls pm_suspend().  If a PM_SUSPEND_MEM attempt returns -ENODEV,
+ * PM_SUSPEND_STANDBY is tried instead.  Failures are only reported through
+ * printk(); nothing is returned.  The alarm is disabled again on the way out.
+ */
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+        static char err_readtime[] __initdata =
+                KERN_ERR "PM: can't read %s time, err %d\n";
+        static char err_wakealarm [] __initdata =
+                KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+        static char err_suspend[] __initdata =
+                KERN_ERR "PM: suspend test failed, error %d\n";
+        static char info_test[] __initdata =
+                KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+        unsigned long           now;
+        struct rtc_wkalrm       alm;
+        int                     status;
+        /* this may fail if the RTC hasn't been initialized */
+        status = rtc_read_time(rtc, &alm.time);
+        if (status < 0) {
+                printk(err_readtime, rtc->dev.bus_id, status);
+                return;
+        }
+        rtc_tm_to_time(&alm.time, &now);
+        /* alm.time was only scratch space for the read above; build the
+         * alarm from a zeroed structure.
+         */
+        memset(&alm, 0, sizeof alm);
+        rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+        alm.enabled = true;
+        status = rtc_set_alarm(rtc, &alm);
+        if (status < 0) {
+                printk(err_wakealarm, rtc->dev.bus_id, status);
+                return;
+        }
+        if (state == PM_SUSPEND_MEM) {
+                printk(info_test, pm_states[state]);
+                status = pm_suspend(state);
+                /* fall back to standby when "mem" reports -ENODEV */
+                if (status == -ENODEV)
+                        state = PM_SUSPEND_STANDBY;
+        }
+        if (state == PM_SUSPEND_STANDBY) {
+                printk(info_test, pm_states[state]);
+                status = pm_suspend(state);
+        }
+        if (status < 0)
+                printk(err_suspend, status);
+        /* Some platforms can't detect that the alarm triggered the
+         * wakeup, or (accordingly) disable it afterwards.
+         * It's supposed to give oneshot behavior; cope.
+         */
+        alm.enabled = false;
+        rtc_set_alarm(rtc, &alm);
+}
+/*
+ * Match callback handed to class_find_device(): accept an RTC only if its
+ * ops provide set_alarm and its parent device may wake the system.  On a
+ * match the device's bus_id is stored through @name_ptr (a char **) and 1
+ * is returned; otherwise 0.
+ */
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+        struct rtc_device *rtc = to_rtc_device(dev);
+        char **name_out = name_ptr;
+        if (!rtc->ops->set_alarm || !device_may_wakeup(rtc->dev.parent))
+                return 0;
+        *name_out = dev->bus_id;
+        return 1;
+}
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time.  They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static char warn_bad_state[] __initdata =
+        KERN_WARNING "PM: can't test '%s' suspend state\n";
+/*
+ * Handle the "test_suspend=<state>" kernel parameter.
+ * @value: text following "test_suspend" on the command line ("=mem", ...)
+ *
+ * Looks @value up in pm_states[] and records the matching index in
+ * test_state; an unknown name only produces a warning.  Always returns 1:
+ * a __setup() handler returns nonzero to say the option was consumed, so
+ * it is not treated as an unknown boot option.
+ */
+static int __init setup_test_suspend(char *value)
+{
+        unsigned i;
+        /* "=mem" ==> "mem"; a bare "test_suspend" has no '=' to step over,
+         * and stepping anyway would move past the terminating NUL.
+         */
+        if (*value == '=')
+                value++;
+        for (i = 0; i < PM_SUSPEND_MAX; i++) {
+                if (!pm_states[i])
+                        continue;
+                if (strcmp(pm_states[i], value) != 0)
+                        continue;
+                test_state = (__force suspend_state_t) i;
+                return 1;
+        }
+        printk(warn_bad_state, value);
+        return 1;
+}
+__setup("test_suspend", setup_test_suspend);
+/*
+ * Late initcall: if a state was requested with "test_suspend=", find an RTC
+ * that can raise a wake alarm and run one suspend/resume cycle with it.
+ * Always returns 0; problems are reported with printk() only.
+ */
+static int __init test_suspend(void)
+{
+        static char             warn_no_rtc[] __initdata =
+                KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+        char                    *pony = NULL;
+        struct device           *dev;
+        struct rtc_device       *rtc = NULL;
+        /* PM is initialized by now; is that state testable? */
+        if (test_state == PM_SUSPEND_ON)
+                goto done;
+        if (!valid_state(test_state)) {
+                printk(warn_bad_state, pm_states[test_state]);
+                goto done;
+        }
+        /* RTCs have initialized by now too ... can we use one? */
+        dev = class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+        if (pony)
+                rtc = rtc_class_open(pony);
+        /* class_find_device() hands back a referenced device.  "pony"
+         * points into it, so drop the reference only after the name
+         * has been used.
+         */
+        if (dev)
+                put_device(dev);
+        if (!rtc) {
+                printk(warn_no_rtc);
+                goto done;
+        }
+        /* go for it */
+        test_wakealarm(rtc, test_state);
+        rtc_class_close(rtc);
+done:
+        return 0;
+}
+late_initcall(test_suspend);
+#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..acc0c101dbd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
 extern int pfn_is_nosave(unsigned long);
-extern struct mutex pm_mutex;
 #define power_attr(_name) \
 static struct kobj_attribute _name##_attr = {   \
        .attr   = {                             \
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 678ec736076b..72016f051477 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -10,6 +10,7 @@
 #include <linux/pm.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
+#include <linux/cpumask.h>
 /*
 * When the user hits Sys-Rq o to power down the machine this is the
@@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
-        schedule_work(&poweroff_work);
+        /* run sysrq poweroff on boot cpu */
+        schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
 }
 static struct sysrq_key_op      sysrq_poweroff_op = {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5fb87652f214..278946aecaf0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -149,7 +149,7 @@ static int try_to_freeze_tasks(bool sig_only)
        unsigned long end_time;
        unsigned int todo;
        struct timeval start, end;
-        s64 elapsed_csecs64;
+        u64 elapsed_csecs64;
        unsigned int elapsed_csecs;
        do_gettimeofday(&start);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5f91a07c4eac..5d2ab836e998 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
 *      objects.  The main list's elements are of type struct zone_bitmap
 *      and each of them corresonds to one zone.  For each zone bitmap
 *      object there is a list of objects of type struct bm_block that
- *      represent each blocks of bit chunks in which information is
+ *      represent each block of the bitmap in which information is stored.
- *      stored.
 *
 *      struct memory_bitmap contains a pointer to the main list of zone
 *      bitmap objects, a struct bm_position used for browsing the bitmap,
@@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
 *      pfns that correspond to the start and end of the represented zone.
 *
 *      struct bm_block contains a pointer to the memory page in which
- *      information is stored (in the form of a block of bit chunks
+ *      information is stored (in the form of a block of bitmap).
- *      of type unsigned long each).  It also contains the pfns that
+ *      It also contains the pfns that correspond to the start and end of
- *      correspond to the start and end of the represented memory area and
+ *      the represented memory area.
- *      the number of bit chunks in the block.
 */
 #define BM_END_OF_MAP   (~0UL)
-#define BM_CHUNKS_PER_BLOCK     (PAGE_SIZE / sizeof(long))
-#define BM_BITS_PER_CHUNK       (sizeof(long) << 3)
 #define BM_BITS_PER_BLOCK       (PAGE_SIZE << 3)
 struct bm_block {
        struct bm_block *next;          /* next element of the list */
        unsigned long start_pfn;        /* pfn represented by the first bit */
        unsigned long end_pfn;  /* pfn represented by the last bit plus 1 */
-        unsigned int size;      /* number of bit chunks */
+        unsigned long *data;    /* bitmap representing pages */
-        unsigned long *data;    /* chunks of bits representing pages */
 };
+/* Number of bits (one per pfn) in block @bb: end_pfn minus start_pfn. */
+static inline unsigned long bm_block_bits(struct bm_block *bb)
+{
+        return bb->end_pfn - bb->start_pfn;
+}
 struct zone_bitmap {
        struct zone_bitmap *next;       /* next element of the list */
        unsigned long start_pfn;        /* minimal pfn in this zone */
@@ -257,7 +257,6 @@ struct zone_bitmap {
 struct bm_position {
        struct zone_bitmap *zone_bm;
        struct bm_block *block;
-        int chunk;
        int bit;
 };
@@ -272,12 +271,6 @@ struct memory_bitmap {
 /* Functions that operate on memory bitmaps */
-static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
-{
-        bm->cur.chunk = 0;
-        bm->cur.bit = -1;
-}
 static void memory_bm_position_reset(struct memory_bitmap *bm)
 {
        struct zone_bitmap *zone_bm;
@@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
        zone_bm = bm->zone_bm_list;
        bm->cur.zone_bm = zone_bm;
        bm->cur.block = zone_bm->bm_blocks;
-        memory_bm_reset_chunk(bm);
+        bm->cur.bit = 0;
 }
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
@@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
                        bb->start_pfn = pfn;
                        if (nr >= BM_BITS_PER_BLOCK) {
                                pfn += BM_BITS_PER_BLOCK;
-                                bb->size = BM_CHUNKS_PER_BLOCK;
                                nr -= BM_BITS_PER_BLOCK;
                        } else {
                                /* This is executed only once in the loop */
                                pfn += nr;
-                                bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
                        }
                        bb->end_pfn = pfn;
                        bb = bb->next;
@@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
        }
        zone_bm->cur_block = bb;
        pfn -= bb->start_pfn;
-        *bit_nr = pfn % BM_BITS_PER_CHUNK;
+        *bit_nr = pfn;
-        *addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+        *addr = bb->data;
        return 0;
 }
@@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
        return test_bit(bit, addr);
 }
-/* Two auxiliary functions for memory_bm_next_pfn */
-/* Find the first set bit in the given chunk, if there is one */
-static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
-{
-        bit++;
-        while (bit < BM_BITS_PER_CHUNK) {
-                if (test_bit(bit, chunk_p))
-                        return bit;
-                bit++;
-        }
-        return -1;
-}
-/* Find a chunk containing some bits set in given block of bits */
-static inline int next_chunk_in_block(int n, struct bm_block *bb)
-{
-        n++;
-        while (n < bb->size) {
-                if (bb->data[n])
-                        return n;
-                n++;
-        }
-        return -1;
-}
 /**
 *      memory_bm_next_pfn - find the pfn that corresponds to the next set bit
 *      in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
@@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
        struct zone_bitmap *zone_bm;
        struct bm_block *bb;
-        int chunk;
        int bit;
        do {
                bb = bm->cur.block;
                do {
-                        chunk = bm->cur.chunk;
                        bit = bm->cur.bit;
-                        do {
+                        bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
-                                bit = next_bit_in_chunk(bit, bb->data + chunk);
+                        if (bit < bm_block_bits(bb))
-                                if (bit >= 0)
+                                goto Return_pfn;
-                                        goto Return_pfn;
-                                chunk = next_chunk_in_block(chunk, bb);
-                                bit = -1;
-                        } while (chunk >= 0);
                        bb = bb->next;
                        bm->cur.block = bb;
-                        memory_bm_reset_chunk(bm);
+                        bm->cur.bit = 0;
                } while (bb);
                zone_bm = bm->cur.zone_bm->next;
                if (zone_bm) {
                        bm->cur.zone_bm = zone_bm;
                        bm->cur.block = zone_bm->bm_blocks;
-                        memory_bm_reset_chunk(bm);
+                        bm->cur.bit = 0;
                }
        } while (zone_bm);
        memory_bm_position_reset(bm);
        return BM_END_OF_MAP;
 Return_pfn:
-        bm->cur.chunk = chunk;
+        bm->cur.bit = bit + 1;
-        bm->cur.bit = bit;
+        return bb->start_pfn + bit;
-        return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
 }
 /**
diff --git a/kernel/printk.c b/kernel/printk.c
index 07ad9e7f7a66..b51b1567bb55 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -933,7 +933,7 @@ void suspend_console(void)
 {
        if (!console_suspend_enabled)
                return;
-        printk("Suspending console(s)\n");
+        printk("Suspending console(s) (use no_console_suspend to debug)\n");
        acquire_console_sem();
        console_suspended = 1;
 }
@@ -1308,29 +1308,18 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 }
 #if defined CONFIG_PRINTK
 /*
 * printk rate limiting, lifted from the networking subsystem.
 *
- * This enforces a rate limit: not more than one kernel message
+ * This enforces a rate limit: not more than 10 kernel messages
- * every printk_ratelimit_jiffies to make a denial-of-service
+ * every 5s to make a denial-of-service attack impossible.
- * attack impossible.
 */
-int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
-{
-        return __ratelimit(ratelimit_jiffies, ratelimit_burst);
-}
-EXPORT_SYMBOL(__printk_ratelimit);
-/* minimum time in jiffies between messages */
-int printk_ratelimit_jiffies = 5 * HZ;
-/* number of messages we send before ratelimiting */
-int printk_ratelimit_burst = 10;
 int printk_ratelimit(void)
 {
-        return __printk_ratelimit(printk_ratelimit_jiffies,
+        return __ratelimit(&printk_ratelimit_state);
-                                printk_ratelimit_burst);
 }
 EXPORT_SYMBOL(printk_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
index 58926411eb2a..cd26bed4cc26 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
 /* Profile event notifications */
-#ifdef CONFIG_PROFILING
 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
 }
 EXPORT_SYMBOL_GPL(unregister_timer_hook);
-#endif /* CONFIG_PROFILING */
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..082b3fcb32a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
        read_unlock(&tasklist_lock);
        if (!ret && !kill)
-                wait_task_inactive(child);
+                ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
        /* All systems go.. */
        return ret;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index d3553ee55f64..d4271146a9bd 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -93,8 +93,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
                 * rdp->cpu is the current cpu.
                 *
                 * cpu_online_map is updated by the _cpu_down()
-                 * using stop_machine_run(). Since we're in irqs disabled
+                 * using __stop_machine(). Since we're in irqs disabled
-                 * section, stop_machine_run() is not exectuting, hence
+                 * section, __stop_machine() is not executing, hence
                 * the cpu_online_map is stable.
                 *
                 * However,  a cpu might have been offlined _just_ before
@@ -108,7 +108,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
                 */
                cpus_and(cpumask, rcp->cpumask, cpu_online_map);
                cpu_clear(rdp->cpu, cpumask);
-                for_each_cpu_mask(cpu, cpumask)
+                for_each_cpu_mask_nr(cpu, cpumask)
                        smp_send_reschedule(cpu);
        }
 }
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 6f62b77d93c4..27827931ca0d 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -756,7 +756,7 @@ rcu_try_flip_idle(void)
        /* Now ask each CPU for acknowledgement of the flip. */
-        for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
                dyntick_save_progress_counter(cpu);
        }
@@ -774,7 +774,7 @@ rcu_try_flip_waitack(void)
        int cpu;
        RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-        for_each_cpu_mask(cpu, rcu_cpu_online_map)
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitack_needed(cpu) &&
                    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -806,7 +806,7 @@ rcu_try_flip_waitzero(void)
        /* Check to see if the sum of the "last" counters is zero. */
        RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-        for_each_cpu_mask(cpu, rcu_cpu_online_map)
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
        if (sum != 0) {
                RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -821,7 +821,7 @@ rcu_try_flip_waitzero(void)
        smp_mb();  /*  ^^^^^^^^^^^^ */
        /* Call for a memory barrier from each CPU. */
-        for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
                dyntick_save_progress_counter(cpu);
        }
@@ -841,7 +841,7 @@ rcu_try_flip_waitmb(void)
        int cpu;
        RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-        for_each_cpu_mask(cpu, rcu_cpu_online_map)
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitmb_needed(cpu) &&
                    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..8d13a7855c08 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_reset);
+/*
+ * Attach @dentry to @buf and set the file's inode size to buf->early_bytes.
+ */
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+                                        struct dentry *dentry)
+{
+        buf->dentry = dentry;
+        buf->dentry->d_inode->i_size = buf->early_bytes;
+}
+/*
+ * Build the per-cpu file name "<base_filename><cpu>" and pass it to the
+ * client's create_buf_file() callback.  Returns the callback's dentry, or
+ * NULL if the temporary name buffer could not be allocated.
+ */
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+                                            struct rchan_buf *buf,
+                                            unsigned int cpu)
+{
+        struct dentry *file = NULL;
+        char *name = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+        if (name) {
+                snprintf(name, NAME_MAX, "%s%d", chan->base_filename, cpu);
+                /* Let the client create the file in its filesystem */
+                file = chan->cb->create_buf_file(name, chan->parent, S_IRUSR,
+                                                 buf, &chan->is_global);
+                kfree(name);
+        }
+        return file;
+}
 /*
 *      relay_open_buf - create a new relay channel buffer
 *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 {
        struct rchan_buf *buf = NULL;
        struct dentry *dentry;
-        char *tmpname;
        if (chan->is_global)
                return chan->buf[0];
-        tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
-        if (!tmpname)
-                goto end;
-        snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
        buf = relay_create_buf(chan);
        if (!buf)
-                goto free_name;
+                return NULL;
+        if (chan->has_base_filename) {
+                dentry = relay_create_buf_file(chan, buf, cpu);
+                if (!dentry)
+                        goto free_buf;
+                relay_set_buf_dentry(buf, dentry);
+        }
        buf->cpu = cpu;
        __relay_reset(buf, 1);
-        /* Create file in fs */
-        dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
-                                           buf, &chan->is_global);
-        if (!dentry)
-                goto free_buf;
-        buf->dentry = dentry;
        if(chan->is_global) {
                chan->buf[0] = buf;
                buf->cpu = 0;
        }
-        goto free_name;
+        return buf;
 free_buf:
        relay_destroy_buf(buf);
-        buf = NULL;
+        return NULL;
-free_name:
-        kfree(tmpname);
-end:
-        return buf;
 }
 /**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
 /**
 *      relay_open - create a new relay channel
- *      @base_filename: base name of files to create
+ *      @base_filename: base name of files to create, %NULL for buffering only
- *      @parent: dentry of parent directory, %NULL for root directory
+ *      @parent: dentry of parent directory, %NULL for root directory or buffer
 *      @subbuf_size: size of sub-buffers
 *      @n_subbufs: number of sub-buffers
 *      @cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
 {
        unsigned int i;
        struct rchan *chan;
-        if (!base_filename)
-                return NULL;
        if (!(subbuf_size && n_subbufs))
                return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
        chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
        chan->parent = parent;
        chan->private_data = private_data;
-        strlcpy(chan->base_filename, base_filename, NAME_MAX);
+        if (base_filename) {
+                chan->has_base_filename = 1;
+                strlcpy(chan->base_filename, base_filename, NAME_MAX);
+        }
        setup_callbacks(chan, cb);
        kref_init(&chan->kref);
@@ -604,6 +623,94 @@ free_bufs:
 }
 EXPORT_SYMBOL_GPL(relay_open);
+/* Argument bundle handed to __relay_set_buf_dentry() as its void *info. */
+struct rchan_percpu_buf_dispatcher {
+        struct rchan_buf *buf;          /* buffer that receives the dentry */
+        struct dentry *dentry;          /* dentry to attach to @buf */
+};
+/*
+ * Called in atomic context.
+ * @info points to a struct rchan_percpu_buf_dispatcher naming the buffer
+ * and the dentry to attach to it.
+ */
+static void __relay_set_buf_dentry(void *info)
+{
+        struct rchan_percpu_buf_dispatcher *p = info;
+        relay_set_buf_dentry(p->buf, p->dentry);
+}
+/**
+ *      relay_late_setup_files - triggers file creation
+ *      @chan: channel to operate on
+ *      @base_filename: base name of files to create
+ *      @parent: dentry of parent directory, %NULL for root directory
+ *
+ *      Returns 0 if successful, non-zero otherwise: -EINVAL for a %NULL
+ *      @chan or @base_filename or when a buffer file cannot be created,
+ *      -EEXIST when the channel already has files, or the error from
+ *      smp_call_function_single().
+ *
+ *      Use to setup files for a previously buffer-only channel.
+ *      Useful to do early tracing in kernel, before VFS is up, for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+                           const char *base_filename,
+                           struct dentry *parent)
+{
+        int err = 0;
+        unsigned int i, curr_cpu;
+        unsigned long flags;
+        struct dentry *dentry;
+        struct rchan_percpu_buf_dispatcher disp;
+        if (!chan || !base_filename)
+                return -EINVAL;
+        mutex_lock(&relay_channels_mutex);
+        /* Is chan already set up? */
+        if (unlikely(chan->has_base_filename)) {
+                /* Do not return with the mutex held. */
+                mutex_unlock(&relay_channels_mutex);
+                return -EEXIST;
+        }
+        /* Only now is it safe to overwrite the channel's base name: the
+         * mutex is held and no files use the old (empty) name yet.
+         */
+        strlcpy(chan->base_filename, base_filename, NAME_MAX);
+        chan->has_base_filename = 1;
+        chan->parent = parent;
+        curr_cpu = get_cpu();
+        /*
+         * The CPU hotplug notifier ran before us and created buffers with
+         * no files associated. So it's safe to call relay_create_buf_file()
+         * on all currently online CPUs.
+         */
+        for_each_online_cpu(i) {
+                if (unlikely(!chan->buf[i])) {
+                        printk(KERN_ERR "relay_late_setup_files: CPU %u "
+                                        "has no buffer, it must have!\n", i);
+                        BUG();
+                        err = -EINVAL;
+                        break;
+                }
+                dentry = relay_create_buf_file(chan, chan->buf[i], i);
+                if (unlikely(!dentry)) {
+                        err = -EINVAL;
+                        break;
+                }
+                if (curr_cpu == i) {
+                        /* our own buffer: attach directly, irqs off */
+                        local_irq_save(flags);
+                        relay_set_buf_dentry(chan->buf[i], dentry);
+                        local_irq_restore(flags);
+                } else {
+                        /* another CPU's buffer: attach it on that CPU */
+                        disp.buf = chan->buf[i];
+                        disp.dentry = dentry;
+                        smp_mb();
+                        /* relay_channels_mutex must be held, so wait. */
+                        err = smp_call_function_single(i,
+                                                       __relay_set_buf_dentry,
+                                                       &disp, 1);
+                }
+                if (unlikely(err))
+                        break;
+        }
+        put_cpu();
+        mutex_unlock(&relay_channels_mutex);
+        return err;
+}
 /**
 *      relay_switch_subbuf - switch to a new sub-buffer
 *      @buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
                old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
                buf->padding[old_subbuf] = buf->prev_padding;
                buf->subbufs_produced++;
-                buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
+                if (buf->dentry)
-                        buf->padding[old_subbuf];
+                        buf->dentry->d_inode->i_size +=
+                                buf->chan->subbuf_size -
+                                buf->padding[old_subbuf];
+                else
+                        buf->early_bytes += buf->chan->subbuf_size -
+                                            buf->padding[old_subbuf];
                smp_mb();
                if (waitqueue_active(&buf->read_wait))
                        /*
@@ -832,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
        size_t n_subbufs = buf->chan->n_subbufs;
        size_t read_subbuf;
+        if (buf->subbufs_produced == buf->subbufs_consumed &&
+            buf->offset == buf->bytes_consumed)
+                return;
        if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
                relay_subbufs_consumed(buf->chan, buf->cpu, 1);
                buf->bytes_consumed = 0;
@@ -863,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
        relay_file_read_consume(buf, read_pos, 0);
+        consumed = buf->subbufs_consumed;
        if (unlikely(buf->offset > subbuf_size)) {
                if (produced == consumed)
                        return 0;
@@ -881,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
        if (consumed > produced)
                produced += n_subbufs * subbuf_size;
-        if (consumed == produced)
+        if (consumed == produced) {
+                if (buf->offset == subbuf_size &&
+                    buf->subbufs_produced > buf->subbufs_consumed)
+                        return 1;
                return 0;
+        }
        return 1;
 }
@@ -1237,4 +1359,4 @@ static __init int relay_init(void)
        return 0;
 }
-module_init(relay_init);
+early_initcall(relay_init);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 void res_counter_init(struct res_counter *counter)
 {
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
        return *res_counter_member(counter, member);
 }
-ssize_t res_counter_write(struct res_counter *counter, int member,
+int res_counter_memparse_write_strategy(const char *buf,
-                const char __user *userbuf, size_t nbytes, loff_t *pos,
+                                        unsigned long long *res)
-                int (*write_strategy)(char *st_buf, unsigned long long *val))
 {
-        int ret;
+        char *end;
-        char *buf, *end;
+        /* FIXME - make memparse() take const char* args */
-        unsigned long flags;
+        *res = memparse((char *)buf, &end);
-        unsigned long long tmp, *val;
+        if (*end != '\0')
+                return -EINVAL;
-        buf = kmalloc(nbytes + 1, GFP_KERNEL);
-        ret = -ENOMEM;
-        if (buf == NULL)
-                goto out;
-        buf[nbytes] = '\0';
+        *res = PAGE_ALIGN(*res);
-        ret = -EFAULT;
+        return 0;
-        if (copy_from_user(buf, userbuf, nbytes))
+}
-                goto out_free;
-        ret = -EINVAL;
+int res_counter_write(struct res_counter *counter, int member,
+                      const char *buf, write_strategy_fn write_strategy)
+{
+        char *end;
+        unsigned long flags;
+        unsigned long long tmp, *val;
-        strstrip(buf);
        if (write_strategy) {
-                if (write_strategy(buf, &tmp)) {
+                if (write_strategy(buf, &tmp))
-                        goto out_free;
+                        return -EINVAL;
-                }
        } else {
                tmp = simple_strtoull(buf, &end, 10);
                if (*end != '\0')
-                        goto out_free;
+                        return -EINVAL;
        }
        spin_lock_irqsave(&counter->lock, flags);
        val = res_counter_member(counter, member);
        *val = tmp;
        spin_unlock_irqrestore(&counter->lock, flags);
-        ret = nbytes;
+        return 0;
-out_free:
-        kfree(buf);
-out:
-        return ret;
 }
diff --git a/kernel/resource.c b/kernel/resource.c
index 74af2d7cb5a1..f5b518eabefe 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -490,7 +490,7 @@ resource_size_t resource_alignment(struct resource *res)
 {
        switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
        case IORESOURCE_SIZEALIGN:
-                return res->end - res->start + 1;
+                return resource_size(res);
        case IORESOURCE_STARTALIGN:
                return res->start;
        default:
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af9..a56f629b057a 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
 *
 * opcode:data
 */
-static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
-                                  size_t count)
+                                  const char *buf, size_t count)
 {
        struct sched_param schedpar;
        struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
 * @dev:        thread to query
 * @buf:        char buffer to be filled with thread status info
 */
-static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+                                 char *buf)
 {
        struct test_thread_data *td;
        struct task_struct *tsk;
diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850ecab..04160d277e7a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -571,8 +571,10 @@ struct rq {
 #endif
 #ifdef CONFIG_SCHED_HRTICK
-        unsigned long hrtick_flags;
+#ifdef CONFIG_SMP
-        ktime_t hrtick_expire;
+        int hrtick_csd_pending;
+        struct call_single_data hrtick_csd;
+#endif
        struct hrtimer hrtick_timer;
 #endif
@@ -983,13 +985,6 @@ static struct rq *this_rq_lock(void)
        return rq;
 }
-static void __resched_task(struct task_struct *p, int tif_bit);
-static inline void resched_task(struct task_struct *p)
-{
-        __resched_task(p, TIF_NEED_RESCHED);
-}
 #ifdef CONFIG_SCHED_HRTICK
 /*
 * Use HR-timers to deliver accurate preemption points.
@@ -1001,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */
-static inline void resched_hrt(struct task_struct *p)
-{
-        __resched_task(p, TIF_HRTICK_RESCHED);
-}
-static inline void resched_rq(struct rq *rq)
-{
-        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
-        resched_task(rq->curr);
-        spin_unlock_irqrestore(&rq->lock, flags);
-}
-enum {
-        HRTICK_SET,             /* re-programm hrtick_timer */
-        HRTICK_RESET,           /* not a new slice */
-        HRTICK_BLOCK,           /* stop hrtick operations */
-};
 /*
 * Use hrtick when:
@@ -1030,40 +1006,11 @@ static inline int hrtick_enabled(struct rq *rq)
 {
        if (!sched_feat(HRTICK))
                return 0;
-        if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+        if (!cpu_active(cpu_of(rq)))
                return 0;
        return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-        assert_spin_locked(&rq->lock);
-        /*
-         * preempt at: now + delay
-         */
-        rq->hrtick_expire =
-                ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-        /*
-         * indicate we need to program the timer
-         */
-        __set_bit(HRTICK_SET, &rq->hrtick_flags);
-        if (reset)
-                __set_bit(HRTICK_RESET, &rq->hrtick_flags);
-        /*
-         * New slices are called from the schedule path and don't need a
-         * forced reschedule.
-         */
-        if (reset)
-                resched_hrt(rq->curr);
-}
 static void hrtick_clear(struct rq *rq)
 {
        if (hrtimer_active(&rq->hrtick_timer))
@@ -1071,32 +1018,6 @@ static void hrtick_clear(struct rq *rq)
 }
 /*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-        ktime_t time;
-        int set, reset;
-        unsigned long flags;
-        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-        spin_lock_irqsave(&rq->lock, flags);
-        set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-        reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-        time = rq->hrtick_expire;
-        clear_thread_flag(TIF_HRTICK_RESCHED);
-        spin_unlock_irqrestore(&rq->lock, flags);
-        if (set) {
-                hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-                if (reset && !hrtimer_active(&rq->hrtick_timer))
-                        resched_rq(rq);
-        } else
-                hrtick_clear(rq);
-}
-/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
@@ -1115,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 #ifdef CONFIG_SMP
-static void hotplug_hrtick_disable(int cpu)
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
 {
-        struct rq *rq = cpu_rq(cpu);
+        struct rq *rq = arg;
-        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
-        rq->hrtick_flags = 0;
-        __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-        spin_unlock_irqrestore(&rq->lock, flags);
-        hrtick_clear(rq);
+        spin_lock(&rq->lock);
+        hrtimer_restart(&rq->hrtick_timer);
+        rq->hrtick_csd_pending = 0;
+        spin_unlock(&rq->lock);
 }
-static void hotplug_hrtick_enable(int cpu)
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
 {
-        struct rq *rq = cpu_rq(cpu);
+        struct hrtimer *timer = &rq->hrtick_timer;
-        unsigned long flags;
+        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
-        spin_lock_irqsave(&rq->lock, flags);
+        timer->expires = time;
-        __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        if (rq == this_rq()) {
+                hrtimer_restart(timer);
+        } else if (!rq->hrtick_csd_pending) {
+                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+                rq->hrtick_csd_pending = 1;
+        }
 }
 static int
@@ -1150,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
        case CPU_DOWN_PREPARE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                hotplug_hrtick_disable(cpu);
+                hrtick_clear(cpu_rq(cpu));
-                return NOTIFY_OK;
-        case CPU_UP_PREPARE:
-        case CPU_UP_PREPARE_FROZEN:
-        case CPU_DOWN_FAILED:
-        case CPU_DOWN_FAILED_FROZEN:
-        case CPU_ONLINE:
-        case CPU_ONLINE_FROZEN:
-                hotplug_hrtick_enable(cpu);
                return NOTIFY_OK;
        }
@@ -1170,46 +1092,45 @@ static void init_hrtick(void)
 {
        hotcpu_notifier(hotplug_hrtick, 0);
 }
-#endif /* CONFIG_SMP */
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
-static void init_rq_hrtick(struct rq *rq)
+static void init_hrtick(void)
 {
-        rq->hrtick_flags = 0;
-        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        rq->hrtick_timer.function = hrtick;
-        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
+#endif /* CONFIG_SMP */
-void hrtick_resched(void)
+static void init_rq_hrtick(struct rq *rq)
 {
-        struct rq *rq;
+#ifdef CONFIG_SMP
-        unsigned long flags;
+        rq->hrtick_csd_pending = 0;
-        if (!test_thread_flag(TIF_HRTICK_RESCHED))
+        rq->hrtick_csd.flags = 0;
-                return;
+        rq->hrtick_csd.func = __hrtick_start;
+        rq->hrtick_csd.info = rq;
+#endif
-        local_irq_save(flags);
+        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        rq = cpu_rq(smp_processor_id());
+        rq->hrtick_timer.function = hrtick;
-        hrtick_set(rq);
+        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
-        local_irq_restore(flags);
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
 {
 }
-static inline void hrtick_set(struct rq *rq)
-{
-}
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
-void hrtick_resched(void)
-{
-}
 static inline void init_hrtick(void)
 {
 }
@@ -1228,16 +1149,16 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
        int cpu;
        assert_spin_locked(&task_rq(p)->lock);
-        if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+        if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
                return;
-        set_tsk_thread_flag(p, tif_bit);
+        set_tsk_thread_flag(p, TIF_NEED_RESCHED);
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@ -1303,10 +1224,10 @@ void wake_up_idle_cpu(int cpu)
 #endif /* CONFIG_NO_HZ */
 #else /* !CONFIG_SMP */
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
        assert_spin_locked(&task_rq(p)->lock);
-        set_tsk_thread_flag(p, tif_bit);
+        set_tsk_need_resched(p);
 }
 #endif /* CONFIG_SMP */
@@ -1946,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
 * wait_task_inactive - wait for a thread to unschedule.
 *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
        unsigned long flags;
        int running, on_rq;
+        unsigned long ncsw;
        struct rq *rq;
        for (;;) {
@@ -1978,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
                 * return false if the runqueue has changed and p
                 * is actually now running somewhere else!
                 */
-                while (task_running(rq, p))
+                while (task_running(rq, p)) {
+                        if (match_state && unlikely(p->state != match_state))
+                                return 0;
                        cpu_relax();
+                }
                /*
                 * Ok, time to look more closely! We need the rq
@@ -1989,9 +1921,21 @@ void wait_task_inactive(struct task_struct *p)
                rq = task_rq_lock(p, &flags);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
+                ncsw = 0;
+                if (!match_state || p->state == match_state) {
+                        ncsw = p->nivcsw + p->nvcsw;
+                        if (unlikely(!ncsw))
+                                ncsw = 1;
+                }
                task_rq_unlock(rq, &flags);
                /*
+                 * If it changed from the expected state, bail out now.
+                 */
+                if (unlikely(!ncsw))
+                        break;
+                /*
                 * Was it really running after all now that we
                 * checked with the proper locks actually held?
                 *
@@ -2023,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
                 */
                break;
        }
+        return ncsw;
 }
 /***
@@ -2108,7 +2054,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
-                for_each_cpu_mask(i, group->cpumask) {
+                for_each_cpu_mask_nr(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@ -2150,7 +2096,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
        /* Traverse only the allowed CPUs */
        cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-        for_each_cpu_mask(i, *tmp) {
+        for_each_cpu_mask_nr(i, *tmp) {
                load = weighted_cpuload(i);
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2881,7 +2827,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
        rq = task_rq_lock(p, &flags);
        if (!cpu_isset(dest_cpu, p->cpus_allowed)
-            || unlikely(cpu_is_offline(dest_cpu)))
+            || unlikely(!cpu_active(dest_cpu)))
                goto out;
        /* force the process onto the specified CPU */
@@ -3168,7 +3114,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
-                for_each_cpu_mask(i, group->cpumask) {
+                for_each_cpu_mask_nr(i, group->cpumask) {
                        struct rq *rq;
                        if (!cpu_isset(i, *cpus))
@@ -3447,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
        unsigned long max_load = 0;
        int i;
-        for_each_cpu_mask(i, group->cpumask) {
+        for_each_cpu_mask_nr(i, group->cpumask) {
                unsigned long wl;
                if (!cpu_isset(i, *cpus))
@@ -3849,7 +3795,7 @@ int select_nohz_load_balancer(int stop_tick)
                /*
                 * If we are going offline and still the leader, give up!
                 */
-                if (cpu_is_offline(cpu) &&
+                if (!cpu_active(cpu) &&
                    atomic_read(&nohz.load_balancer) == cpu) {
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                BUG();
@@ -3989,7 +3935,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                int balance_cpu;
                cpu_clear(this_cpu, cpus);
-                for_each_cpu_mask(balance_cpu, cpus) {
+                for_each_cpu_mask_nr(balance_cpu, cpus) {
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@ -4125,6 +4071,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
                cpustat->nice = cputime64_add(cpustat->nice, tmp);
        else
                cpustat->user = cputime64_add(cpustat->user, tmp);
+        /* Account for user time used */
+        acct_update_integrals(p);
 }
 /*
@@ -4395,7 +4343,7 @@ asmlinkage void __sched schedule(void)
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
-        int cpu, hrtick = sched_feat(HRTICK);
+        int cpu;
 need_resched:
        preempt_disable();
@@ -4410,7 +4358,7 @@ need_resched_nonpreemptible:
        schedule_debug(prev);
-        if (hrtick)
+        if (sched_feat(HRTICK))
                hrtick_clear(rq);
        /*
@@ -4457,9 +4405,6 @@ need_resched_nonpreemptible:
        } else
                spin_unlock_irq(&rq->lock);
-        if (hrtick)
-                hrtick_set(rq);
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
@@ -5059,19 +5004,21 @@ recheck:
                        return -EPERM;
        }
+        if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
-        /*
+                /*
-         * Do not allow realtime tasks into groups that have no runtime
+                 * Do not allow realtime tasks into groups that have no runtime
-         * assigned.
+                 * assigned.
-         */
+                 */
-        if (user
+                if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
-            && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+                        return -EPERM;
-                return -EPERM;
 #endif
-        retval = security_task_setscheduler(p, policy, param);
+                retval = security_task_setscheduler(p, policy, param);
-        if (retval)
+                if (retval)
-                return retval;
+                        return retval;
+        }
        /*
         * make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
@@ -5876,7 +5823,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        struct rq *rq_dest, *rq_src;
        int ret = 0, on_rq;
-        if (unlikely(cpu_is_offline(dest_cpu)))
+        if (unlikely(!cpu_active(dest_cpu)))
                return ret;
        rq_src = cpu_rq(src_cpu);
@@ -6469,7 +6416,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
        .priority = 10
 };
-void __init migration_init(void)
+static int __init migration_init(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
        int err;
@@ -6479,7 +6426,10 @@ void __init migration_init(void)
        BUG_ON(err == NOTIFY_BAD);
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
+        return err;
 }
+early_initcall(migration_init);
 #endif
 #ifdef CONFIG_SMP
@@ -6768,7 +6718,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-        int ints[NR_CPUS], i;
+        static int __initdata ints[NR_CPUS];
+        int i;
        str = get_options(str, ARRAY_SIZE(ints), ints);
        cpus_clear(cpu_isolated_map);
@@ -6802,7 +6753,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
        cpus_clear(*covered);
-        for_each_cpu_mask(i, *span) {
+        for_each_cpu_mask_nr(i, *span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
@@ -6813,7 +6764,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
                cpus_clear(sg->cpumask);
                sg->__cpu_power = 0;
-                for_each_cpu_mask(j, *span) {
+                for_each_cpu_mask_nr(j, *span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
@@ -7013,7 +6964,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
        if (!sg)
                return;
        do {
-                for_each_cpu_mask(j, sg->cpumask) {
+                for_each_cpu_mask_nr(j, sg->cpumask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(phys_domains, j);
@@ -7038,7 +6989,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
        int cpu, i;
-        for_each_cpu_mask(cpu, *cpu_map) {
+        for_each_cpu_mask_nr(cpu, *cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
@@ -7277,7 +7228,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = NULL, *p;
                SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7344,7 +7295,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7361,7 +7312,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_core_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7428,7 +7379,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                        goto error;
                }
                sched_group_nodes[i] = sg;
-                for_each_cpu_mask(j, *nodemask) {
+                for_each_cpu_mask_nr(j, *nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
@@ -7474,21 +7425,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        /* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(cpu_domains, i);
                init_sched_groups_power(i, sd);
        }
 #endif
 #ifdef CONFIG_SCHED_MC
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(core_domains, i);
                init_sched_groups_power(i, sd);
        }
 #endif
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(phys_domains, i);
                init_sched_groups_power(i, sd);
@@ -7508,7 +7459,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
        /* Attach the domains */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
@@ -7553,18 +7504,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 /*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-        ndoms_cur = 0;
-        if (doms_cur != &fallback_doms)
-                kfree(doms_cur);
-        doms_cur = &fallback_doms;
-}
-/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
@@ -7603,7 +7542,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
        unregister_sched_domain_sysctl();
-        for_each_cpu_mask(i, *cpu_map)
+        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
        arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7642,7 +7581,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 * ownership of it and will kfree it when done with it. If the caller
 * failed the kmalloc call, then it can pass in doms_new == NULL,
 * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * Call with hotplug lock held
 */
@@ -7656,12 +7595,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
-        if (doms_new == NULL) {
+        if (doms_new == NULL)
-                ndoms_new = 1;
+                ndoms_new = 0;
-                doms_new = &fallback_doms;
-                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-                dattr_new = NULL;
-        }
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
@@ -7676,6 +7611,14 @@ match1:
                ;
        }
+        if (doms_new == NULL) {
+                ndoms_cur = 0;
+                ndoms_new = 1;
+                doms_new = &fallback_doms;
+                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+                dattr_new = NULL;
+        }
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur; j++) {
@@ -7706,17 +7649,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-        int err;
        get_online_cpus();
-        mutex_lock(&sched_domains_mutex);
+        rebuild_sched_domains();
-        detach_destroy_domains(&cpu_online_map);
-        free_sched_domains();
-        err = arch_init_sched_domains(&cpu_online_map);
-        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
+        return 0;
-        return err;
 }
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7737,30 +7673,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+                                           char *page)
 {
        return sprintf(page, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
                                            const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-                   sched_mc_power_savings_store);
+                         sched_mc_power_savings_show,
+                         sched_mc_power_savings_store);
 #endif
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+                                            char *page)
 {
        return sprintf(page, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
                                             const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+                   sched_smt_power_savings_show,
                   sched_smt_power_savings_store);
 #endif
@@ -7782,59 +7722,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
+ * Add online and remove offline CPUs from the scheduler domains.
- * and groups cannot be updated in place without racing with the balancing
+ * When cpusets are enabled they take over this function.
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
 */
 static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
 {
+        switch (action) {
+        case CPU_ONLINE:
+        case CPU_ONLINE_FROZEN:
+        case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
+                partition_sched_domains(0, NULL, NULL);
+                return NOTIFY_OK;
+        default:
+                return NOTIFY_DONE;
+        }
+}
+#endif
+static int update_runtime(struct notifier_block *nfb,
+                                unsigned long action, void *hcpu)
+{
        int cpu = (int)(long)hcpu;
        switch (action) {
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                disable_runtime(cpu_rq(cpu));
-                /* fall-through */
-        case CPU_UP_PREPARE:
-        case CPU_UP_PREPARE_FROZEN:
-                detach_destroy_domains(&cpu_online_map);
-                free_sched_domains();
                return NOTIFY_OK;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                enable_runtime(cpu_rq(cpu));
-                /* fall-through */
+                return NOTIFY_OK;
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
-        case CPU_DEAD:
-        case CPU_DEAD_FROZEN:
-                /*
-                 * Fall through and re-initialise the domains.
-                 */
-                break;
        default:
                return NOTIFY_DONE;
        }
-#ifndef CONFIG_CPUSETS
-        /*
-         * Create default domain partitioning if cpusets are disabled.
-         * Otherwise we let cpusets rebuild the domains based on the
-         * current setup.
-         */
-        /* The hotplug lock is already held by cpu_up/cpu_down */
-        arch_init_sched_domains(&cpu_online_map);
-#endif
-        return NOTIFY_OK;
 }
 void __init sched_init_smp(void)
@@ -7854,8 +7784,15 @@ void __init sched_init_smp(void)
                cpu_set(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
+#ifndef CONFIG_CPUSETS
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
+#endif
+        /* RT runtime code needs to handle some hotplug events */
+        hotcpu_notifier(update_runtime, 0);
        init_hrtick();
        /* Move init over to a non-isolated CPU */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d6..cf2cd6ce4cb2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -878,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-        int requeue = rq->curr == p;
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -899,10 +898,10 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                 * Don't schedule slices shorter than 10000ns, that just
                 * doesn't make sense. Rely on vruntime for fairness.
                 */
-                if (!requeue)
+                if (rq->curr != p)
                        delta = max(10000LL, delta);
-                hrtick_start(rq, delta, requeue);
+                hrtick_start(rq, delta);
        }
 }
 #else /* !CONFIG_SCHED_HRTICK */
@@ -1004,6 +1003,8 @@ static void yield_task_fair(struct rq *rq)
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
 *
 * Returns the CPU we should wake onto.
 */
@@ -1031,7 +1032,8 @@ static int wake_idle(int cpu, struct task_struct *p)
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
                        cpus_and(tmp, sd->span, p->cpus_allowed);
-                        for_each_cpu_mask(i, tmp) {
+                        cpus_and(tmp, tmp, cpu_active_map);
+                        for_each_cpu_mask_nr(i, tmp) {
                                if (idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 47ceac9e8552..908c04f9dad0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -240,7 +240,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
        spin_lock(&rt_b->rt_runtime_lock);
        rt_period = ktime_to_ns(rt_b->rt_period);
-        for_each_cpu_mask(i, rd->span) {
+        for_each_cpu_mask_nr(i, rd->span) {
                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                s64 diff;
@@ -253,7 +253,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                diff = iter->rt_runtime - iter->rt_time;
                if (diff > 0) {
-                        do_div(diff, weight);
+                        diff = div_u64((u64)diff, weight);
                        if (rt_rq->rt_runtime + diff > rt_period)
                                diff = rt_period - rt_rq->rt_runtime;
                        iter->rt_runtime -= diff;
@@ -505,7 +505,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+#ifdef CONFIG_SMP
                struct rq *rq = rq_of_rt_rq(rt_rq);
+#endif
                rt_rq->highest_prio = rt_se_prio(rt_se);
 #ifdef CONFIG_SMP
@@ -599,11 +601,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
-        if (rt_se->nr_cpus_allowed == 1)
+        list_add_tail(&rt_se->run_list, queue);
-                list_add(&rt_se->run_list, queue);
-        else
-                list_add_tail(&rt_se->run_list, queue);
        __set_bit(rt_se_prio(rt_se), array->bitmap);
        inc_rt_tasks(rt_se, rt_rq);
@@ -688,32 +686,34 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
-static
+static void
-void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 { /* head selects list_move() (non-zero) or list_move_tail() (zero) on the priority list of rt_se */
-        struct rt_prio_array *array = &rt_rq->active;
        if (on_rt_rq(rt_se)) { /* nothing is done when on_rt_rq() is false */
-                list_del_init(&rt_se->run_list);
+                struct rt_prio_array *array = &rt_rq->active;
-                list_add_tail(&rt_se->run_list,
+                struct list_head *queue = array->queue + rt_se_prio(rt_se); /* list head for the priority of rt_se */
-                              array->queue + rt_se_prio(rt_se));
+                if (head)
+                        list_move(&rt_se->run_list, queue); /* NOTE(review): places the entry right after the list head, i.e. at the front -- confirm */
+                else
+                        list_move_tail(&rt_se->run_list, queue); /* places the entry at the end of the list */
        }
 }
-static void requeue_task_rt(struct rq *rq, struct task_struct *p)
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 { /* requeue p on every rt_rq walked by for_each_sched_rt_entity(); rq is not referenced in this body */
        struct sched_rt_entity *rt_se = &p->rt; /* start from the RT entity embedded in the task */
        struct rt_rq *rt_rq;
        for_each_sched_rt_entity(rt_se) {
                rt_rq = rt_rq_of_se(rt_se);
-                requeue_rt_entity(rt_rq, rt_se);
+                requeue_rt_entity(rt_rq, rt_se, head); /* head is forwarded unchanged */
        }
 }
 static void yield_task_rt(struct rq *rq)
 { /* yield: requeue the task currently running on rq */
-        requeue_task_rt(rq, rq->curr);
+        requeue_task_rt(rq, rq->curr, 0); /* head == 0, so requeue_rt_entity() uses list_move_tail() */
 }
 #ifdef CONFIG_SMP
@@ -753,6 +753,30 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
         */
        return task_cpu(p);
 }
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{ /* called from check_preempt_curr_rt() when p->prio equals rq->curr->prio */
+        cpumask_t mask; /* output argument for cpupri_find(); its contents are not read here */
+        if (rq->curr->rt.nr_cpus_allowed == 1) /* current is limited to one CPU, nothing to push */
+                return;
+        if (p->rt.nr_cpus_allowed != 1
+            && cpupri_find(&rq->rd->cpupri, p, &mask)) /* p is not limited to one CPU and cpupri_find() succeeded for it */
+                return;
+        if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) /* cpupri_find() found no CPU for current */
+                return;
+        /*
+         * There appears to be other cpus that can accept
+         * current and none to run 'p', so reschedule
+         * to try and push current away.
+         *
+         * p is first requeued with head == 1, i.e. moved with
+         * list_move() onto its priority list (presumably so that it
+         * is picked next -- confirm), then current is marked for
+         * rescheduling.
+         */
+        requeue_task_rt(rq, p, 1);
+        resched_task(rq->curr);
+}
 #endif /* CONFIG_SMP */
 /*
@@ -778,18 +802,8 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
         * to move current somewhere else, making room for our non-migratable
         * task.
         */
-        if((p->prio == rq->curr->prio)
+        if (p->prio == rq->curr->prio && !need_resched())
-           && p->rt.nr_cpus_allowed == 1
+                check_preempt_equal_prio(rq, p);
-           && rq->curr->rt.nr_cpus_allowed != 1) {
-                cpumask_t mask;
-                if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-                        /*
-                         * There appears to be other cpus that can accept
-                         * current, so lets reschedule to try and push it away
-                         */
-                        resched_task(rq->curr);
-        }
 #endif
 }
@@ -922,6 +936,13 @@ static int find_lowest_rq(struct task_struct *task)
                return -1; /* No targets found */
        /*
+         * Only consider CPUs that are usable for migration.
+         * I guess we might want to change cpupri_find() to ignore those
+         * in the first place.
+         */
+        cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+        /*
         * At this point we have built a mask of cpus representing the
         * lowest priority tasks in the system.  Now we want to elect
         * the best one based on our affinity and topology.
@@ -1107,7 +1128,7 @@ static int pull_rt_task(struct rq *this_rq)
        next = pick_next_task_rt(this_rq);
-        for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
+        for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
@@ -1415,7 +1436,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
         * on the queue:
         */
        if (p->rt.run_list.prev != p->rt.run_list.next) {
-                requeue_task_rt(rq, p);
+                requeue_task_rt(rq, p, 0);
                set_tsk_need_resched(p);
        }
 }
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index aaaeae8244e7..94a62c0d4ade 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
        waiter.up = 0;
        for (;;) {
-                if (state == TASK_INTERRUPTIBLE && signal_pending(task))
+                if (signal_pending_state(state, task))
-                        goto interrupted;
-                if (state == TASK_KILLABLE && fatal_signal_pending(task))
                        goto interrupted;
                if (timeout <= 0)
                        goto timed_out;
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..954f77d7e3bc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/signalfd.h>
+#include <linux/tracehook.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
 static struct kmem_cache *sigqueue_cachep;
-static int __sig_ignored(struct task_struct *t, int sig)
+static void __user *sig_handler(struct task_struct *t, int sig)
 {
-        void __user *handler;
+        return t->sighand->action[sig - 1].sa.sa_handler; /* handler installed for sig; action[] is indexed by sig - 1 */
+}
+static int sig_handler_ignored(void __user *handler, int sig)
+{
        /*
         * Is it explicitly ignored (SIG_IGN), or implicitly ignored
         * (SIG_DFL for a signal accepted by sig_kernel_ignore())?
         */
-        handler = t->sighand->action[sig - 1].sa.sa_handler;
        return handler == SIG_IGN ||
                (handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 static int sig_ignored(struct task_struct *t, int sig)
 {
-        /*
+        void __user *handler;
-         * Tracers always want to know about signals..
-         */
-        if (t->ptrace & PT_PTRACED)
-                return 0;
        /*
         * Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
        if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
                return 0;
-        return __sig_ignored(t, sig);
+        handler = sig_handler(t, sig);
+        if (!sig_handler_ignored(handler, sig))
+                return 0;
+        /*
+         * Tracers may want to know about even ignored signals.
+         */
+        return !tracehook_consider_ignored_signal(t, sig, handler);
 }
 /*
@@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 void recalc_sigpending(void)
 { /* recompute TIF_SIGPENDING for current */
-        if (!recalc_sigpending_tsk(current) && !freezing(current))
+        if (unlikely(tracehook_force_sigpending())) /* the tracehook can force the flag on */
+                set_thread_flag(TIF_SIGPENDING);
+        else if (!recalc_sigpending_tsk(current) && !freezing(current))
                clear_thread_flag(TIF_SIGPENDING); /* cleared only when recalc_sigpending_tsk() returns 0 and current is not freezing */
 }
@@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 int unhandled_signal(struct task_struct *tsk, int sig)
 { /* returns non-zero when sig counts as unhandled for tsk */
+        void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
        if (is_global_init(tsk))
                return 1; /* the global init task is always reported as unhandled */
-        if (tsk->ptrace & PT_PTRACED)
+        if (handler != SIG_IGN && handler != SIG_DFL)
                return 0; /* some handler other than SIG_IGN/SIG_DFL is installed */
-        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
+        return !tracehook_consider_fatal_signal(tsk, sig, handler); /* SIG_IGN or SIG_DFL: unhandled unless the tracehook call returns non-zero */
-                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
 }
@@ -338,13 +345,9 @@ unblock_all_signals(void)
        spin_unlock_irqrestore(&current->sighand->siglock, flags);
 }
-static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 {
        struct sigqueue *q, *first = NULL;
-        int still_pending = 0;
-        if (unlikely(!sigismember(&list->signal, sig)))
-                return 0;
        /*
         * Collect the siginfo appropriate to this signal.  Check if
@@ -352,33 +355,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
        */
        list_for_each_entry(q, &list->list, list) {
                if (q->info.si_signo == sig) {
-                        if (first) {
+                        if (first)
-                                still_pending = 1;
+                                goto still_pending;
-                                break;
-                        }
                        first = q;
                }
        }
+        sigdelset(&list->signal, sig);
        if (first) {
+still_pending:
                list_del_init(&first->list);
                copy_siginfo(info, &first->info);
                __sigqueue_free(first);
-                if (!still_pending)
-                        sigdelset(&list->signal, sig);
        } else {
                /* Ok, it wasn't in the queue.  This must be
                   a fast-pathed signal or we must have been
                   out of queue space.  So zero out the info.
                 */
-                sigdelset(&list->signal, sig);
                info->si_signo = sig;
                info->si_errno = 0;
                info->si_code = 0;
                info->si_pid = 0;
                info->si_uid = 0;
        }
-        return 1;
 }
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +396,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                        }
                }
-                if (!collect_signal(sig, pending, info))
+                collect_signal(sig, pending, info);
-                        sig = 0;
        }
        return sig;
@@ -462,8 +461,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
                 * is to alert stop-signal processing code when another
                 * processor has come along and cleared the flag.
                 */
-                if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
+                tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
-                        tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
        }
        if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
                /*
@@ -600,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
        return security_task_kill(t, info, sig, 0);
 }
-/* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
 /*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
@@ -765,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
        if (sig_fatal(p, sig) &&
            !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
            !sigismember(&t->real_blocked, sig) &&
-            (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+            (sig == SIGKILL ||
+             !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
                /*
                 * This signal will be fatal to the whole group.
                 */
@@ -1125,7 +1121,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
 * is probably wrong.  Should make it like BSD or SYSV.
 */
-static int kill_something_info(int sig, struct siginfo *info, int pid)
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
 {
        int ret;
@@ -1237,17 +1233,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
 }
 EXPORT_SYMBOL(kill_pid);
-int
-kill_proc(pid_t pid, int sig, int priv)
-{
-        int ret;
-        rcu_read_lock();
-        ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
-        rcu_read_unlock();
-        return ret;
-}
 /*
 * These functions support sending signals using preallocated sigqueue
 * structures.  This is needed "because realtime applications cannot
@@ -1343,9 +1328,11 @@ static inline void __wake_up_parent(struct task_struct *p,
 /*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
 */
+int do_notify_parent(struct task_struct *tsk, int sig)
-void do_notify_parent(struct task_struct *tsk, int sig)
 {
        struct siginfo info;
        unsigned long flags;
@@ -1379,10 +1366,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
        info.si_uid = tsk->uid;
-        /* FIXME: find out whether or not this is supposed to be c*time. */
+        info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
-        info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
                                                       tsk->signal->utime));
-        info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
+        info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
                                                       tsk->signal->stime));
        info.si_status = tsk->exit_code & 0x7f;
@@ -1417,12 +1403,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
                 */
                tsk->exit_signal = -1;
                if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-                        sig = 0;
+                        sig = -1;
        }
        if (valid_signal(sig) && sig > 0)
                __group_send_sig_info(sig, &info, tsk->parent);
        __wake_up_parent(tsk, tsk->parent);
        spin_unlock_irqrestore(&psig->siglock, flags);
+        return sig;
 }
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
@@ -1450,9 +1438,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
        info.si_uid = tsk->uid;
-        /* FIXME: find out whether or not this is supposed to be c*time. */
+        info.si_utime = cputime_to_clock_t(tsk->utime);
-        info.si_utime = cputime_to_jiffies(tsk->utime);
+        info.si_stime = cputime_to_clock_t(tsk->stime);
-        info.si_stime = cputime_to_jiffies(tsk->stime);
        info.si_code = why;
        switch (why) {
@@ -1491,10 +1478,10 @@ static inline int may_ptrace_stop(void)
         * is a deadlock situation, and pointless because our tracer
         * is dead so don't allow us to stop.
         * If SIGKILL was already sent before the caller unlocked
-         * ->siglock we must see ->core_waiters != 0. Otherwise it
+         * ->siglock we must see ->core_state != NULL. Otherwise it
         * is safe to enter schedule().
         */
-        if (unlikely(current->mm->core_waiters) &&
+        if (unlikely(current->mm->core_state) &&
            unlikely(current->mm == current->parent->mm))
                return 0;
@@ -1507,9 +1494,8 @@ static inline int may_ptrace_stop(void)
 */
 static int sigkill_pending(struct task_struct *tsk)
 { /* non-zero when SIGKILL is a member of either pending set of tsk */
-        return ((sigismember(&tsk->pending.signal, SIGKILL) ||
+        return  sigismember(&tsk->pending.signal, SIGKILL) || /* pending set of the task itself */
-                 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
+                sigismember(&tsk->signal->shared_pending.signal, SIGKILL); /* pending set reached through tsk->signal */
-                !unlikely(sigismember(&tsk->blocked, SIGKILL)));
 }
 /*
@@ -1525,8 +1511,6 @@ static int sigkill_pending(struct task_struct *tsk)
 */
 static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 {
-        int killed = 0;
        if (arch_ptrace_stop_needed(exit_code, info)) {
                /*
                 * The arch code has something special to do before a
@@ -1542,7 +1526,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop(exit_code, info);
                spin_lock_irq(&current->sighand->siglock);
-                killed = sigkill_pending(current);
+                if (sigkill_pending(current))
+                        return;
        }
        /*
@@ -1559,7 +1544,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
        __set_current_state(TASK_TRACED);
        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
-        if (!unlikely(killed) && may_ptrace_stop()) {
+        if (may_ptrace_stop()) {
                do_notify_parent_cldstop(current, CLD_TRAPPED);
                read_unlock(&tasklist_lock);
                schedule();
@@ -1623,7 +1608,7 @@ finish_stop(int stop_count)
         * a group stop in progress and we are the last to stop,
         * report to the parent.  When ptraced, every thread reports itself.
         */
-        if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+        if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current, CLD_STOPPED);
                read_unlock(&tasklist_lock);
@@ -1658,8 +1643,7 @@ static int do_signal_stop(int signr)
        } else {
                struct task_struct *t;
-                if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
+                if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
-                                         != SIGNAL_STOP_DEQUEUED) ||
                    unlikely(signal_group_exit(sig)))
                        return 0;
                /*
@@ -1760,6 +1744,9 @@ relock:
                signal->flags &= ~SIGNAL_CLD_MASK;
                spin_unlock_irq(&sighand->siglock);
+                if (unlikely(!tracehook_notify_jctl(1, why)))
+                        goto relock;
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current->group_leader, why);
                read_unlock(&tasklist_lock);
@@ -1773,17 +1760,33 @@ relock:
                    do_signal_stop(0))
                        goto relock;
-                signr = dequeue_signal(current, &current->blocked, info);
+                /*
-                if (!signr)
+                 * Tracing can induce an artificial signal and choose sigaction.
-                        break; /* will return 0 */
+                 * The return value in @signr determines the default action,
+                 * but @info->si_signo is the signal number we will report.
+                 */
+                signr = tracehook_get_signal(current, regs, info, return_ka);
+                if (unlikely(signr < 0))
+                        goto relock;
+                if (unlikely(signr != 0))
+                        ka = return_ka;
+                else {
+                        signr = dequeue_signal(current, &current->blocked,
+                                               info);
-                if (signr != SIGKILL) {
-                        signr = ptrace_signal(signr, info, regs, cookie);
                        if (!signr)
-                                continue;
+                                break; /* will return 0 */
+                        if (signr != SIGKILL) {
+                                signr = ptrace_signal(signr, info,
+                                                      regs, cookie);
+                                if (!signr)
+                                        continue;
+                        }
+                        ka = &sighand->action[signr-1];
                }
-                ka = &sighand->action[signr-1];
                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
@@ -1831,7 +1834,7 @@ relock:
                                spin_lock_irq(&sighand->siglock);
                        }
-                        if (likely(do_signal_stop(signr))) {
+                        if (likely(do_signal_stop(info->si_signo))) {
                                /* It released the siglock.  */
                                goto relock;
                        }
@@ -1852,7 +1855,7 @@ relock:
                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
-                                print_fatal_signal(regs, signr);
+                                print_fatal_signal(regs, info->si_signo);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
@@ -1861,13 +1864,13 @@ relock:
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
-                        do_coredump((long)signr, signr, regs);
+                        do_coredump(info->si_signo, info->si_signo, regs);
                }
                /*
                 * Death signals, no core dump.
                 */
-                do_group_exit(signr);
+                do_group_exit(info->si_signo);
                /* NOTREACHED */
        }
        spin_unlock_irq(&sighand->siglock);
@@ -1909,7 +1912,7 @@ void exit_signals(struct task_struct *tsk)
 out:
        spin_unlock_irq(&tsk->sighand->siglock);
-        if (unlikely(group_stop)) {
+        if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(tsk, CLD_STOPPED);
                read_unlock(&tasklist_lock);
@@ -1920,8 +1923,6 @@ EXPORT_SYMBOL(recalc_sigpending);
 EXPORT_SYMBOL_GPL(dequeue_signal);
 EXPORT_SYMBOL(flush_signals);
 EXPORT_SYMBOL(force_sig);
-EXPORT_SYMBOL(kill_proc);
-EXPORT_SYMBOL(ptrace_notify);
 EXPORT_SYMBOL(send_sig);
 EXPORT_SYMBOL(send_sig_info);
 EXPORT_SYMBOL(sigprocmask);
@@ -2196,7 +2197,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 }
 asmlinkage long
-sys_kill(int pid, int sig)
+sys_kill(pid_t pid, int sig)
 {
        struct siginfo info;
@@ -2209,7 +2210,7 @@ sys_kill(int pid, int sig)
        return kill_something_info(sig, &info, pid);
 }
-static int do_tkill(int tgid, int pid, int sig)
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
 {
        int error;
        struct siginfo info;
@@ -2255,7 +2256,7 @@ static int do_tkill(int tgid, int pid, int sig)
 *  exists but it's not belonging to the target process anymore. This
 *  method solves the problem of threads exiting and PIDs getting reused.
 */
-asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
 {
        /* This is only valid for single tasks */
        if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2269,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
 *  Send a signal to only one task, even if it's a CLONE_THREAD task.
 */
 asmlinkage long
-sys_tkill(int pid, int sig)
+sys_tkill(pid_t pid, int sig)
 {
        /* This is only valid for single tasks */
        if (pid <= 0)
@@ -2278,7 +2279,7 @@ sys_tkill(int pid, int sig)
 }
 asmlinkage long
-sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
+sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
 {
        siginfo_t info;
@@ -2325,7 +2326,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
                 *   (for example, SIGCHLD), shall cause the pending signal to
                 *   be discarded, whether or not it is blocked"
                 */
-                if (__sig_ignored(t, sig)) {
+                if (sig_handler_ignored(sig_handler(t, sig), sig)) {
                        sigemptyset(&mask);
                        sigaddset(&mask, sig);
                        rm_from_queue_full(&mask, &t->signal->shared_pending);
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..96fc7c0edc59 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
        spinlock_t lock;
 };
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
 {
        int i;
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
                spin_lock_init(&q->lock);
                INIT_LIST_HEAD(&q->list);
        }
+        return 0;
 }
+early_initcall(init_call_single_data);
 static void csd_flag_wait(struct call_single_data *data)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81e2fe0f983a..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -286,7 +286,7 @@ void irq_exit(void)
 #ifdef CONFIG_NO_HZ
        /* Make sure that timer wheel updates are propagated */
        if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-                tick_nohz_stop_sched_tick();
+                tick_nohz_stop_sched_tick(0);
        rcu_irq_exit();
 #endif
        preempt_enable_no_resched();
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
        int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
        register_cpu_notifier(&cpu_nfb);
        return 0;
 }
+early_initcall(spawn_ksoftirqd);
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index a272d78185eb..b75b492fbfcf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
@@ -25,7 +26,22 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 static int __read_mostly did_panic;
-unsigned long __read_mostly softlockup_thresh = 60;
+int __read_mostly softlockup_thresh = 60;
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * soft-lockup occurs:
+ */
+unsigned int __read_mostly softlockup_panic =
+                                CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+static int __init softlockup_panic_setup(char *str)
+{ /* parser registered below for the softlockup_panic= option */
+        softlockup_panic = simple_strtoul(str, NULL, 0); /* NOTE(review): base 0 presumably lets the prefix of str choose the radix -- confirm */
+        return 1; /* NOTE(review): presumably tells the caller the option was consumed -- confirm */
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
 static int
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -84,6 +100,14 @@ void softlockup_tick(void)
        struct pt_regs *regs = get_irq_regs();
        unsigned long now;
+        /* Is detection switched off? */
+        if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+                /* Be sure we don't false trigger if switched back on */
+                if (touch_timestamp)
+                        per_cpu(touch_timestamp, this_cpu) = 0;
+                return;
+        }
        if (touch_timestamp == 0) {
                __touch_softlockup_watchdog();
                return;
@@ -92,11 +116,8 @@ void softlockup_tick(void)
        print_timestamp = per_cpu(print_timestamp, this_cpu);
        /* report at most once a second */
-        if ((print_timestamp >= touch_timestamp &&
+        if (print_timestamp == touch_timestamp || did_panic)
-                        print_timestamp < (touch_timestamp + 1)) ||
-                        did_panic || !per_cpu(watchdog_task, this_cpu)) {
                return;
-        }
        /* do not print during early bootup: */
        if (unlikely(system_state != SYSTEM_RUNNING)) {
@@ -106,8 +127,11 @@ void softlockup_tick(void)
        now = get_timestamp(this_cpu);
-        /* Wake up the high-prio watchdog task every second: */
+        /*
-        if (now > (touch_timestamp + 1))
+         * Wake up the high-prio watchdog task twice per
+         * threshold timespan.
+         */
+        if (now > touch_timestamp + softlockup_thresh/2)
                wake_up_process(per_cpu(watchdog_task, this_cpu));
        /* Warn about unreasonable delays: */
@@ -121,11 +145,15 @@ void softlockup_tick(void)
                        this_cpu, now - touch_timestamp,
                        current->comm, task_pid_nr(current));
        print_modules();
+        print_irqtrace_events(current);
        if (regs)
                show_regs(regs);
        else
                dump_stack();
        spin_unlock(&print_lock);
+        if (softlockup_panic)
+                panic("softlockup: hung tasks");
 }
 /*
@@ -178,6 +206,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
        t->last_switch_timestamp = now;
        touch_nmi_watchdog();
+        if (softlockup_panic)
+                panic("softlockup: blocked tasks");
 }
 /*
@@ -307,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+static int __init nosoftlockup_setup(char *str)
+{
+        nosoftlockup = 1;
+        return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+static int __init spawn_softlockup_task(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
-        int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+        int err;
+        if (nosoftlockup)
+                return 0;
-        BUG_ON(err == NOTIFY_BAD);
+        err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+        if (err == NOTIFY_BAD) {
+                BUG();
+                return 1;
+        }
        cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
        register_cpu_notifier(&cpu_nfb);
        atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+        return 0;
 }
+early_initcall(spawn_softlockup_task);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba9b2054ecbd..e446c7c7d6a9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,4 +1,4 @@
-/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
 * GPL v2 and any later version.
 */
 #include <linux/cpu.h>
@@ -13,203 +13,178 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
-/* Since we effect priority and affinity (both of which are visible
+/* This controls the threads on each CPU. */
- * to, and settable by outside processes) we do indirection via a
- * kthread. */
-/* Thread to stop each CPU in user context. */
 enum stopmachine_state {
-        STOPMACHINE_WAIT,
+        /* Dummy starting state for thread. */
+        STOPMACHINE_NONE,
+        /* Awaiting everyone to be scheduled. */
        STOPMACHINE_PREPARE,
+        /* Disable interrupts. */
        STOPMACHINE_DISABLE_IRQ,
+        /* Run the function */
+        STOPMACHINE_RUN,
+        /* Exit */
        STOPMACHINE_EXIT,
 };
+static enum stopmachine_state state;
-static enum stopmachine_state stopmachine_state;
+struct stop_machine_data {
-static unsigned int stopmachine_num_threads;
+        int (*fn)(void *);
-static atomic_t stopmachine_thread_ack;
+        void *data;
+        int fnret;
-static int stopmachine(void *cpu)
+};
-{
-        int irqs_disabled = 0;
-        int prepared = 0;
-        set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
-        /* Ack: we are alive */
-        smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
-        atomic_inc(&stopmachine_thread_ack);
-        /* Simple state machine */
-        while (stopmachine_state != STOPMACHINE_EXIT) {
-                if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 
-                    && !irqs_disabled) {
-                        local_irq_disable();
-                        hard_irq_disable();
-                        irqs_disabled = 1;
-                        /* Ack: irqs disabled. */
-                        smp_mb(); /* Must read state first. */
-                        atomic_inc(&stopmachine_thread_ack);
-                } else if (stopmachine_state == STOPMACHINE_PREPARE
-                           && !prepared) {
-                        /* Everyone is in place, hold CPU. */
-                        preempt_disable();
-                        prepared = 1;
-                        smp_mb(); /* Must read state first. */
-                        atomic_inc(&stopmachine_thread_ack);
-                }
-                /* Yield in first stage: migration threads need to
-                 * help our sisters onto their CPUs. */
-                if (!prepared && !irqs_disabled)
-                        yield();
-                cpu_relax();
-        }
-        /* Ack: we are exiting. */
-        smp_mb(); /* Must read state first. */
-        atomic_inc(&stopmachine_thread_ack);
-        if (irqs_disabled)
-                local_irq_enable();
-        if (prepared)
-                preempt_enable();
-        return 0;
+/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-}
+static unsigned int num_threads;
+static atomic_t thread_ack;
+static struct completion finished;
+static DEFINE_MUTEX(lock);
-/* Change the thread state */
+static void set_state(enum stopmachine_state newstate)
-static void stopmachine_set_state(enum stopmachine_state state)
 {
-        atomic_set(&stopmachine_thread_ack, 0);
+        /* Reset ack counter. */
+        atomic_set(&thread_ack, num_threads);
        smp_wmb();
-        stopmachine_state = state;
+        state = newstate;
-        while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-                cpu_relax();
 }
-static int stop_machine(void)
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
 {
-        int i, ret = 0;
+        if (atomic_dec_and_test(&thread_ack)) {
+                /* If we're the last one to ack the EXIT, we're finished. */
-        atomic_set(&stopmachine_thread_ack, 0);
+                if (state == STOPMACHINE_EXIT)
-        stopmachine_num_threads = 0;
+                        complete(&finished);
-        stopmachine_state = STOPMACHINE_WAIT;
+                else
+                        set_state(state + 1);
-        for_each_online_cpu(i) {
-                if (i == raw_smp_processor_id())
-                        continue;
-                ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
-                if (ret < 0)
-                        break;
-                stopmachine_num_threads++;
-        }
-        /* Wait for them all to come to life. */
-        while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
-                yield();
-                cpu_relax();
        }
+}
-        /* If some failed, kill them all. */
+/* This is the actual thread which stops the CPU.  It exits by itself rather
-        if (ret < 0) {
+ * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
-                stopmachine_set_state(STOPMACHINE_EXIT);
+static int stop_cpu(struct stop_machine_data *smdata)
-                return ret;
+{
-        }
+        enum stopmachine_state curstate = STOPMACHINE_NONE;
+        int uninitialized_var(ret);
-        /* Now they are all started, make them hold the CPUs, ready. */
+        /* Simple state machine */
-        preempt_disable();
+        do {
-        stopmachine_set_state(STOPMACHINE_PREPARE);
+                /* Chill out and ensure we re-read state. */
+                cpu_relax();
+                if (state != curstate) {
+                        curstate = state;
+                        switch (curstate) {
+                        case STOPMACHINE_DISABLE_IRQ:
+                                local_irq_disable();
+                                hard_irq_disable();
+                                break;
+                        case STOPMACHINE_RUN:
+                                /* |= allows error detection if the function
+                                 * runs on multiple CPUs. */
+                                smdata->fnret |= smdata->fn(smdata->data);
+                                break;
+                        default:
+                                break;
+                        }
+                        ack_state();
+                }
+        } while (curstate != STOPMACHINE_EXIT);
-        /* Make them disable irqs. */
+        local_irq_enable();
-        local_irq_disable();
+        do_exit(0);
-        hard_irq_disable();
+}
-        stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
+{
        return 0;
 }
-static void restart_machine(void)
+int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 {
-        stopmachine_set_state(STOPMACHINE_EXIT);
+        int i, err;
-        local_irq_enable();
+        struct stop_machine_data active, idle;
-        preempt_enable_no_resched();
+        struct task_struct **threads;
-}
+        active.fn = fn;
+        active.data = data;
+        active.fnret = 0;
+        idle.fn = chill;
+        idle.data = NULL;
+        /* This could be too big for stack on large machines. */
+        threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
+        if (!threads)
+                return -ENOMEM;
+        /* Set up initial state. */
+        mutex_lock(&lock);
+        init_completion(&finished);
+        num_threads = num_online_cpus();
+        set_state(STOPMACHINE_PREPARE);
-struct stop_machine_data {
+        for_each_online_cpu(i) {
-        int (*fn)(void *);
+                struct stop_machine_data *smdata = &idle;
-        void *data;
+                struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-        struct completion done;
-};
-static int do_stop(void *_smdata)
+                if (!cpus) {
-{
+                        if (i == first_cpu(cpu_online_map))
-        struct stop_machine_data *smdata = _smdata;
+                                smdata = &active;
-        int ret;
+                } else {
+                        if (cpu_isset(i, *cpus))
+                                smdata = &active;
+                }
-        ret = stop_machine();
+                threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
-        if (ret == 0) {
+                                            i);
-                ret = smdata->fn(smdata->data);
+                if (IS_ERR(threads[i])) {
-                restart_machine();
+                        err = PTR_ERR(threads[i]);
-        }
+                        threads[i] = NULL;
+                        goto kill_threads;
+                }
-        /* We're done: you can kthread_stop us now */
+                /* Place it onto correct cpu. */
-        complete(&smdata->done);
+                kthread_bind(threads[i], i);
-        /* Wait for kthread_stop */
+                /* Make it highest prio. */
-        set_current_state(TASK_INTERRUPTIBLE);
+                if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
-        while (!kthread_should_stop()) {
+                        BUG();
-                schedule();
-                set_current_state(TASK_INTERRUPTIBLE);
        }
-        __set_current_state(TASK_RUNNING);
-        return ret;
-}
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
+        /* We've created all the threads.  Wake them all: hold this CPU so one
-                                       unsigned int cpu)
+         * doesn't hit this CPU until we're ready. */
-{
+        get_cpu();
-        static DEFINE_MUTEX(stopmachine_mutex);
+        for_each_online_cpu(i)
-        struct stop_machine_data smdata;
+                wake_up_process(threads[i]);
-        struct task_struct *p;
-        smdata.fn = fn;
+        /* This will release the thread on our CPU. */
-        smdata.data = data;
+        put_cpu();
-        init_completion(&smdata.done);
+        wait_for_completion(&finished);
+        mutex_unlock(&lock);
-        mutex_lock(&stopmachine_mutex);
+        kfree(threads);
-        /* If they don't care which CPU fn runs on, bind to any online one. */
+        return active.fnret;
-        if (cpu == NR_CPUS)
-                cpu = raw_smp_processor_id();
-        p = kthread_create(do_stop, &smdata, "kstopmachine");
+kill_threads:
-        if (!IS_ERR(p)) {
+        for_each_online_cpu(i)
-                struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+                if (threads[i])
+                        kthread_stop(threads[i]);
+        mutex_unlock(&lock);
-                /* One high-prio thread per cpu.  We'll do this one. */
+        kfree(threads);
-                sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+        return err;
-                kthread_bind(p, cpu);
-                wake_up_process(p);
-                wait_for_completion(&smdata.done);
-        }
-        mutex_unlock(&stopmachine_mutex);
-        return p;
 }
-int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 {
-        struct task_struct *p;
        int ret;
        /* No CPUs can come up or down during this. */
        get_online_cpus();
-        p = __stop_machine_run(fn, data, cpu);
+        ret = __stop_machine(fn, data, cpus);
-        if (!IS_ERR(p))
-                ret = kthread_stop(p);
-        else
-                ret = PTR_ERR(p);
        put_online_cpus();
        return ret;
 }
-EXPORT_SYMBOL_GPL(stop_machine_run);
+EXPORT_SYMBOL_GPL(stop_machine);
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..c01858090a98 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
-/**
- *      kernel_kexec - reboot the system
- *
- *      Move into place and start executing a preloaded standalone
- *      executable.  If nothing was preloaded return an error.
- */
-static void kernel_kexec(void)
-{
-#ifdef CONFIG_KEXEC
-        struct kimage *image;
-        image = xchg(&kexec_image, NULL);
-        if (!image)
-                return;
-        kernel_restart_prepare(NULL);
-        printk(KERN_EMERG "Starting new kernel\n");
-        machine_shutdown();
-        machine_kexec(image);
-#endif
-}
 static void kernel_shutdown_prepare(enum system_states state)
 {
        blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
                kernel_restart(buffer);
                break;
+#ifdef CONFIG_KEXEC
        case LINUX_REBOOT_CMD_KEXEC:
-                kernel_kexec();
+                {
-                unlock_kernel();
+                        int ret;
-                return -EINVAL;
+                        ret = kernel_kexec();
+                        unlock_kernel();
+                        return ret;
+                }
+#endif
 #ifdef CONFIG_HIBERNATION
        case LINUX_REBOOT_CMD_SW_SUSPEND:
@@ -1343,8 +1328,6 @@ EXPORT_SYMBOL(in_egroup_p);
 DECLARE_RWSEM(uts_sem);
-EXPORT_SYMBOL(uts_sem);
 asmlinkage long sys_newuname(struct new_utsname __user * name)
 {
        int errno = 0;
@@ -1795,7 +1778,7 @@ int orderly_poweroff(bool force)
                goto out;
        }
-        info = call_usermodehelper_setup(argv[0], argv, envp);
+        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
        if (info == NULL) {
                argv_free(argv);
                goto out;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5b9b467de070..08d6e1bb99ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,6 +31,7 @@ cond_syscall(sys_socketpair);
 cond_syscall(sys_bind);
 cond_syscall(sys_listen);
 cond_syscall(sys_accept);
+cond_syscall(sys_paccept);
 cond_syscall(sys_connect);
 cond_syscall(sys_getsockname);
 cond_syscall(sys_getpeername);
@@ -56,9 +57,11 @@ cond_syscall(compat_sys_set_robust_list);
 cond_syscall(sys_get_robust_list);
 cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_create1);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
 cond_syscall(sys_epoll_pwait);
+cond_syscall(compat_sys_epoll_pwait);
 cond_syscall(sys_semget);
 cond_syscall(sys_semop);
 cond_syscall(sys_semtimedop);
@@ -94,6 +97,7 @@ cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_init1);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
@@ -154,10 +158,13 @@ cond_syscall(sys_ioprio_get);
 /* New file descriptors */
 cond_syscall(sys_signalfd);
+cond_syscall(sys_signalfd4);
 cond_syscall(compat_sys_signalfd);
+cond_syscall(compat_sys_signalfd4);
 cond_syscall(sys_timerfd_create);
 cond_syscall(sys_timerfd_settime);
 cond_syscall(sys_timerfd_gettime);
 cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
+cond_syscall(sys_eventfd2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6b16e16428d8..fe4713347275 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -43,6 +43,7 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/vmstat.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
@@ -80,7 +81,6 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
-extern int sysctl_stat_interval;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifdef CONFIG_RCU_TORTURE_TEST
@@ -88,12 +88,13 @@ extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 /* Constants used for minimum and  maximum */
-#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
 static int one = 1;
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
+static int neg_one = -1;
 #endif
 #ifdef CONFIG_MMU
@@ -110,7 +111,7 @@ static int min_percpu_pagelist_fract = 8;
 static int ngroups_max = NGROUPS_MAX;
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 extern char modprobe_path[];
 #endif
 #ifdef CONFIG_CHR_DEV_SG
@@ -159,12 +160,13 @@ static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
        .ctl_table = root_table,
-        .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+        .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
        .root = &sysctl_table_root,
+        .set = &sysctl_table_root.default_set,
 };
 static struct ctl_table_root sysctl_table_root = {
        .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
-        .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+        .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
 };
 static struct ctl_table kern_table[];
@@ -475,7 +477,7 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &ftrace_enable_sysctl,
        },
 #endif
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
        {
                .ctl_name       = KERN_MODPROBE,
                .procname       = "modprobe",
@@ -623,7 +625,7 @@ static struct ctl_table kern_table[] = {
        {
                .ctl_name       = KERN_PRINTK_RATELIMIT,
                .procname       = "printk_ratelimit",
-                .data           = &printk_ratelimit_jiffies,
+                .data           = &printk_ratelimit_state.interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
@@ -632,7 +634,7 @@ static struct ctl_table kern_table[] = {
        {
                .ctl_name       = KERN_PRINTK_RATELIMIT_BURST,
                .procname       = "printk_ratelimit_burst",
-                .data           = &printk_ratelimit_burst,
+                .data           = &printk_ratelimit_state.burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
@@ -739,13 +741,24 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_DETECT_SOFTLOCKUP
        {
                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "softlockup_panic",
+                .data           = &softlockup_panic,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "softlockup_thresh",
                .data           = &softlockup_thresh,
-                .maxlen         = sizeof(unsigned long),
+                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = &proc_dointvec_minmax,
                .strategy       = &sysctl_intvec,
-                .extra1         = &one,
+                .extra1         = &neg_one,
                .extra2         = &sixty,
        },
        {
@@ -947,7 +960,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_HUGETLB_PAGE
         {
                .procname       = "nr_hugepages",
-                .data           = &max_huge_pages,
+                .data           = NULL,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_sysctl_handler,
@@ -973,10 +986,12 @@ static struct ctl_table vm_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nr_overcommit_hugepages",
-                .data           = &sysctl_overcommit_huge_pages,
+                .data           = NULL,
-                .maxlen         = sizeof(sysctl_overcommit_huge_pages),
+                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_overcommit_handler,
+                .extra1         = (void *)&hugetlb_zero,
+                .extra2         = (void *)&hugetlb_infinity,
        },
 #endif
        {
@@ -1372,6 +1387,9 @@ static void start_unregistering(struct ctl_table_header *p)
                spin_unlock(&sysctl_lock);
                wait_for_completion(&wait);
                spin_lock(&sysctl_lock);
+        } else {
+                /* anything non-NULL; we'll never dereference it */
+                p->unregistering = ERR_PTR(-EINVAL);
        }
        /*
         * do not remove from the list until nobody holds it; walking the
@@ -1380,6 +1398,32 @@ static void start_unregistering(struct ctl_table_header *p)
        list_del_init(&p->ctl_entry);
 }
+void sysctl_head_get(struct ctl_table_header *head)
+{
+        spin_lock(&sysctl_lock);
+        head->count++;
+        spin_unlock(&sysctl_lock);
+}
+void sysctl_head_put(struct ctl_table_header *head)
+{
+        spin_lock(&sysctl_lock);
+        if (!--head->count)
+                kfree(head);
+        spin_unlock(&sysctl_lock);
+}
+struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+        if (!head)
+                BUG();
+        spin_lock(&sysctl_lock);
+        if (!use_table(head))
+                head = ERR_PTR(-ENOENT);
+        spin_unlock(&sysctl_lock);
+        return head;
+}
 void sysctl_head_finish(struct ctl_table_header *head)
 {
        if (!head)
@@ -1389,14 +1433,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
        spin_unlock(&sysctl_lock);
 }
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+        struct ctl_table_set *set = &root->default_set;
+        if (root->lookup)
+                set = root->lookup(root, namespaces);
+        return set;
+}
 static struct list_head *
 lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
 {
-        struct list_head *header_list;
+        struct ctl_table_set *set = lookup_header_set(root, namespaces);
-        header_list = &root->header_list;
+        return &set->list;
-        if (root->lookup)
-                header_list = root->lookup(root, namespaces);
-        return header_list;
 }
 struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1466,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
        int op = 0, rc;
        if (oldval)
-                op |= 004;
+                op |= MAY_READ;
        if (newval)
-                op |= 002;
+                op |= MAY_WRITE;
        if (sysctl_perm(root, table, op))
                return -EPERM;
@@ -1510,7 +1560,7 @@ repeat:
                if (n == table->ctl_name) {
                        int error;
                        if (table->child) {
-                                if (sysctl_perm(root, table, 001))
+                                if (sysctl_perm(root, table, MAY_EXEC))
                                        return -EPERM;
                                name++;
                                nlen--;
@@ -1585,7 +1635,7 @@ static int test_perm(int mode, int op)
                mode >>= 6;
        else if (in_egroup_p(0))
                mode >>= 3;
-        if ((mode & op & 0007) == op)
+        if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
                return 0;
        return -EACCES;
 }
@@ -1595,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
        int error;
        int mode;
-        error = security_sysctl(table, op);
+        error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
        if (error)
                return error;
@@ -1630,6 +1680,54 @@ static __init int sysctl_init(void)
 core_initcall(sysctl_init);
+static struct ctl_table *is_branch_in(struct ctl_table *branch,
+                                      struct ctl_table *table)
+{
+        struct ctl_table *p;
+        const char *s = branch->procname;
+        /* branch should have named subdirectory as its first element */
+        if (!s || !branch->child)
+                return NULL;
+        /* ... and nothing else */
+        if (branch[1].procname || branch[1].ctl_name)
+                return NULL;
+        /* table should contain subdirectory with the same name */
+        for (p = table; p->procname || p->ctl_name; p++) {
+                if (!p->child)
+                        continue;
+                if (p->procname && strcmp(p->procname, s) == 0)
+                        return p;
+        }
+        return NULL;
+}
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+        struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+        struct ctl_table *next;
+        int is_better = 0;
+        int not_in_parent = !p->attached_by;
+        while ((next = is_branch_in(by, to)) != NULL) {
+                if (by == q->attached_by)
+                        is_better = 1;
+                if (to == p->attached_by)
+                        not_in_parent = 1;
+                by = by->child;
+                to = next->child;
+        }
+        if (is_better && not_in_parent) {
+                q->attached_by = by;
+                q->attached_to = to;
+                q->parent = p;
+        }
+}
 /**
 * __register_sysctl_paths - register a sysctl hierarchy
 * @root: List of sysctl headers to register on
@@ -1706,10 +1804,10 @@ struct ctl_table_header *__register_sysctl_paths(
        struct nsproxy *namespaces,
        const struct ctl_path *path, struct ctl_table *table)
 {
-        struct list_head *header_list;
        struct ctl_table_header *header;
        struct ctl_table *new, **prevp;
        unsigned int n, npath;
+        struct ctl_table_set *set;
        /* Count the path components */
        for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1751,6 +1849,7 @@ struct ctl_table_header *__register_sysctl_paths(
        header->unregistering = NULL;
        header->root = root;
        sysctl_set_parent(NULL, header->ctl_table);
+        header->count = 1;
 #ifdef CONFIG_SYSCTL_SYSCALL_CHECK
        if (sysctl_check_table(namespaces, header->ctl_table)) {
                kfree(header);
@@ -1758,8 +1857,20 @@ struct ctl_table_header *__register_sysctl_paths(
        }
 #endif
        spin_lock(&sysctl_lock);
-        header_list = lookup_header_list(root, namespaces);
+        header->set = lookup_header_set(root, namespaces);
-        list_add_tail(&header->ctl_entry, header_list);
+        header->attached_by = header->ctl_table;
+        header->attached_to = root_table;
+        header->parent = &root_table_header;
+        for (set = header->set; set; set = set->parent) {
+                struct ctl_table_header *p;
+                list_for_each_entry(p, &set->list, ctl_entry) {
+                        if (p->unregistering)
+                                continue;
+                        try_attach(p, header);
+                }
+        }
+        header->parent->count++;
+        list_add_tail(&header->ctl_entry, &header->set->list);
        spin_unlock(&sysctl_lock);
        return header;
@@ -1814,8 +1925,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
        spin_lock(&sysctl_lock);
        start_unregistering(header);
+        if (!--header->parent->count) {
+                WARN_ON(1);
+                kfree(header->parent);
+        }
+        if (!--header->count)
+                kfree(header);
+        spin_unlock(&sysctl_lock);
+}
+int sysctl_is_seen(struct ctl_table_header *p)
+{
+        struct ctl_table_set *set = p->set;
+        int res;
+        spin_lock(&sysctl_lock);
+        if (p->unregistering)
+                res = 0;
+        else if (!set->is_seen)
+                res = 1;
+        else
+                res = set->is_seen(set);
        spin_unlock(&sysctl_lock);
-        kfree(header);
+        return res;
+}
+void setup_sysctl_set(struct ctl_table_set *p,
+        struct ctl_table_set *parent,
+        int (*is_seen)(struct ctl_table_set *))
+{
+        INIT_LIST_HEAD(&p->list);
+        p->parent = parent ? parent : &sysctl_table_root.default_set;
+        p->is_seen = is_seen;
 }
 #else /* !CONFIG_SYSCTL */
@@ -1834,6 +1974,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
 {
 }
+void setup_sysctl_set(struct ctl_table_set *p,
+        struct ctl_table_set *parent,
+        int (*is_seen)(struct ctl_table_set *))
+{
+}
+void sysctl_head_put(struct ctl_table_header *head)
+{
+}
 #endif /* CONFIG_SYSCTL */
 /*
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                        sysctl_check_leaf(namespaces, table, &fail);
                }
                sysctl_check_bin_path(table, &fail);
+                if (table->mode > 0777)
+                        set_fail(&fail, table, "bogus .mode");
                if (fail) {
                        set_fail(&fail, table, NULL);
                        error = -EINVAL;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a6..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
 */
 #define TASKSTATS_CPUMASK_MAXLEN        (100+6*NR_CPUS)
-static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static DEFINE_PER_CPU(__u32, taskstats_seqnum);
 static int family_registered;
 struct kmem_cache *taskstats_cache;
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
                return -EINVAL;
        if (isadd == REGISTER) {
-                for_each_cpu_mask(cpu, mask) {
+                for_each_cpu_mask_nr(cpu, mask) {
                        s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
                                         cpu_to_node(cpu));
                        if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
        /* Deregister or cleanup */
 cleanup:
-        for_each_cpu_mask(cpu, mask) {
+        for_each_cpu_mask_nr(cpu, mask) {
                listeners = &per_cpu(listener_array, cpu);
                down_write(&listeners->sem);
                list_for_each_entry_safe(s, tmp, &listeners->list, list) {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..093d4acf993b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data)
                 * Cycle through CPUs to check if the CPUs stay
                 * synchronized to each other.
                 */
-                int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+                int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
-                if (next_cpu >= NR_CPUS)
+                if (next_cpu >= nr_cpu_ids)
                        next_cpu = first_cpu(cpu_online_map);
                watchdog_timer.expires += WATCHDOG_INTERVAL;
                add_timer_on(&watchdog_timer, next_cpu);
@@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs)
 * Provides sysfs interface for listing current clocksource.
 */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_current_clocksources(struct sys_device *dev,
+                                struct sysdev_attribute *attr, char *buf)
 {
        ssize_t count = 0;
@@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
 * clocksource selction.
 */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+                                          struct sysdev_attribute *attr,
                                          const char *buf, size_t count)
 {
        struct clocksource *ovr = NULL;
@@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 * Provides sysfs interface for listing registered clocksources
 */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_available_clocksources(struct sys_device *dev,
+                                  struct sysdev_attribute *attr,
+                                  char *buf)
 {
        struct clocksource *src;
        ssize_t count = 0;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f48d0f09d32f..31463d370b94 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -399,8 +399,7 @@ again:
        mask = CPU_MASK_NONE;
        now = ktime_get();
        /* Find all expired events */
-        for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
+        for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
-             cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
                td = &per_cpu(tick_cpu_device, cpu);
                if (td->evtdev->next_event.tv64 <= now.tv64)
                        cpu_set(cpu, mask);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4f3886562b8c..80c4336f4188 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -135,7 +135,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 */
 static void tick_setup_device(struct tick_device *td,
                              struct clock_event_device *newdev, int cpu,
-                              cpumask_t cpumask)
+                              const cpumask_t *cpumask)
 {
        ktime_t next_event;
        void (*handler)(struct clock_event_device *) = NULL;
@@ -169,8 +169,8 @@ static void tick_setup_device(struct tick_device *td,
         * When the device is not per cpu, pin the interrupt to the
         * current cpu:
         */
-        if (!cpus_equal(newdev->cpumask, cpumask))
+        if (!cpus_equal(newdev->cpumask, *cpumask))
-                irq_set_affinity(newdev->irq, cpumask);
+                irq_set_affinity(newdev->irq, *cpumask);
        /*
         * When global broadcasting is active, check if the current
@@ -196,7 +196,6 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        struct tick_device *td;
        int cpu, ret = NOTIFY_OK;
        unsigned long flags;
-        cpumask_t cpumask;
        spin_lock_irqsave(&tick_device_lock, flags);
@@ -206,10 +205,9 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        td = &per_cpu(tick_cpu_device, cpu);
        curdev = td->evtdev;
-        cpumask = cpumask_of_cpu(cpu);
        /* cpu local device ? */
-        if (!cpus_equal(newdev->cpumask, cpumask)) {
+        if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
                /*
                 * If the cpu affinity of the device interrupt can not
@@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
                 * If we have a cpu local device already, do not replace it
                 * by a non cpu local device
                 */
-                if (curdev && cpus_equal(curdev->cpumask, cpumask))
+                if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
                        goto out_bc;
        }
@@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
                curdev = NULL;
        }
        clockevents_exchange_device(curdev, newdev);
-        tick_setup_device(td, newdev, cpu, cpumask);
+        tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
        if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
                tick_oneshot_notify();
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index beef7ccdf842..825b4c00fe44 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -140,8 +140,6 @@ void tick_nohz_update_jiffies(void)
        if (!ts->tick_stopped)
                return;
-        touch_softlockup_watchdog();
        cpu_clear(cpu, nohz_cpu_mask);
        now = ktime_get();
        ts->idle_waketime = now;
@@ -149,6 +147,8 @@ void tick_nohz_update_jiffies(void)
        local_irq_save(flags);
        tick_do_update_jiffies64(now);
        local_irq_restore(flags);
+        touch_softlockup_watchdog();
 }
 void tick_nohz_stop_idle(int cpu)
@@ -195,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 * Called either from the idle loop or from irq_exit() when an idle period was
 * just interrupted by an interrupt which did not cause a reschedule.
 */
-void tick_nohz_stop_sched_tick(void)
+void tick_nohz_stop_sched_tick(int inidle)
 {
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
        struct tick_sched *ts;
@@ -224,6 +224,11 @@ void tick_nohz_stop_sched_tick(void)
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
                goto end;
+        if (!inidle && !ts->inidle)
+                goto end;
+        ts->inidle = 1;
        if (need_resched())
                goto end;
@@ -373,11 +378,14 @@ void tick_nohz_restart_sched_tick(void)
        local_irq_disable();
        tick_nohz_stop_idle(cpu);
-        if (!ts->tick_stopped) {
+        if (!ts->inidle || !ts->tick_stopped) {
+                ts->inidle = 0;
                local_irq_enable();
                return;
        }
+        ts->inidle = 0;
        rcu_exit_nohz();
        /* Update jiffies first */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4231a3dc224a..f6e3af31b403 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data)
 static void ftrace_run_update_code(int command)
 {
-        stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+        stop_machine(__ftrace_modify_code, &command, NULL);
 }
 void ftrace_disable_daemon(void)
@@ -787,7 +787,7 @@ static int ftrace_update_code(void)
            !ftrace_enabled || !ftraced_trigger)
                return 0;
-        stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+        stop_machine(__ftrace_update_code, NULL, NULL);
        return 1;
 }
@@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void)
        addr = (unsigned long)ftrace_record_ip;
-        stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+        stop_machine(ftrace_dyn_arch_init, &addr, NULL);
        /* ftrace_dyn_arch_init places the return code in addr */
        if (addr) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 868e121c8e38..8f3fb3db61c3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1183,7 +1183,6 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 {
        struct trace_iterator *iter = m->private;
-        void *last_ent = iter->ent;
        int i = (int)*pos;
        void *ent;
@@ -1203,9 +1202,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
        iter->pos = *pos;
-        if (last_ent && !ent)
-                seq_puts(m, "\n\nvim:ft=help\n");
        return ent;
 }
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 421d6fe3650e..ece6cfb649fa 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -253,12 +253,14 @@ void start_critical_timings(void)
        if (preempt_trace() || irq_trace())
                start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
+EXPORT_SYMBOL_GPL(start_critical_timings);
 void stop_critical_timings(void)
 {
        if (preempt_trace() || irq_trace())
                stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
+EXPORT_SYMBOL_GPL(stop_critical_timings);
 #ifdef CONFIG_IRQSOFF_TRACER
 #ifdef CONFIG_PROVE_LOCKING
@@ -337,12 +339,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
 #ifdef CONFIG_PREEMPT_TRACER
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-        stop_critical_timing(a0, a1);
+        if (preempt_trace())
+                stop_critical_timing(a0, a1);
 }
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-        start_critical_timing(a0, a1);
+        if (preempt_trace())
+                start_critical_timing(a0, a1);
 }
 #endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c8d61df4474..e303ccb62cdf 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -26,7 +26,8 @@ static struct task_struct	*wakeup_task;
 static int                      wakeup_cpu;
 static unsigned                 wakeup_prio = -1;
-static DEFINE_SPINLOCK(wakeup_lock);
+static raw_spinlock_t wakeup_lock =
+        (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 static void __wakeup_reset(struct trace_array *tr);
@@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
        if (unlikely(disabled != 1))
                goto out;
-        spin_lock_irqsave(&wakeup_lock, flags);
+        local_irq_save(flags);
+        __raw_spin_lock(&wakeup_lock);
        if (unlikely(!wakeup_task))
                goto unlock;
@@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
        trace_function(tr, data, ip, parent_ip, flags);
 unlock:
-        spin_unlock_irqrestore(&wakeup_lock, flags);
+        __raw_spin_unlock(&wakeup_lock);
+        local_irq_restore(flags);
 out:
        atomic_dec(&data->disabled);
@@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
        if (likely(disabled != 1))
                goto out;
-        spin_lock_irqsave(&wakeup_lock, flags);
+        local_irq_save(flags);
+        __raw_spin_lock(&wakeup_lock);
        /* We could race with grabbing wakeup_lock */
        if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
 out_unlock:
        __wakeup_reset(tr);
-        spin_unlock_irqrestore(&wakeup_lock, flags);
+        __raw_spin_unlock(&wakeup_lock);
+        local_irq_restore(flags);
 out:
        atomic_dec(&tr->data[cpu]->disabled);
 }
@@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr)
        struct trace_array_cpu *data;
        int cpu;
-        assert_spin_locked(&wakeup_lock);
        for_each_possible_cpu(cpu) {
                data = tr->data[cpu];
                tracing_reset(data);
@@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr)
 {
        unsigned long flags;
-        spin_lock_irqsave(&wakeup_lock, flags);
+        local_irq_save(flags);
+        __raw_spin_lock(&wakeup_lock);
        __wakeup_reset(tr);
-        spin_unlock_irqrestore(&wakeup_lock, flags);
+        __raw_spin_unlock(&wakeup_lock);
+        local_irq_restore(flags);
 }
 static void
@@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
                goto out;
        /* interrupts should be off from try_to_wake_up */
-        spin_lock(&wakeup_lock);
+        __raw_spin_lock(&wakeup_lock);
        /* check for races. */
        if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
                       CALLER_ADDR1, CALLER_ADDR2, flags);
 out_locked:
-        spin_unlock(&wakeup_lock);
+        __raw_spin_unlock(&wakeup_lock);
 out:
        atomic_dec(&tr->data[cpu]->disabled);
 }
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 2301e1e7c606..bb948e52ce20 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
                __trace_special(tr, data, 2, regs->ip, 0);
                while (i < sample_max_depth) {
-                        frame.next_fp = 0;
+                        frame.next_fp = NULL;
                        frame.return_address = 0;
                        if (!copy_stack_frame(fp, &frame))
                                break;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..8ebcd8532dfb 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
 void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 {
        struct timespec uptime, ts;
-        s64 ac_etime;
+        u64 ac_etime;
        BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
        /* calculate task elapsed time in timespec */
        do_posix_clock_monotonic_gettime(&uptime);
        ts = timespec_sub(uptime, tsk->start_time);
-        /* rebase elapsed time to usec */
+        /* rebase elapsed time to usec (should never be negative) */
        ac_etime = timespec_to_ns(&ts);
        do_div(ac_etime, NSEC_PER_USEC);
        stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
        struct mm_struct *mm;
-        /* convert pages-jiffies to Mbyte-usec */
+        /* convert pages-usec to Mbyte-usec */
-        stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
+        stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-        stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+        stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
        mm = get_task_mm(p);
        if (mm) {
                /* adjust to KB unit */
@@ -94,10 +94,10 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
                stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
                mmput(mm);
        }
-        stats->read_char        = p->rchar;
+        stats->read_char        = p->ioac.rchar;
-        stats->write_char       = p->wchar;
+        stats->write_char       = p->ioac.wchar;
-        stats->read_syscalls    = p->syscr;
+        stats->read_syscalls    = p->ioac.syscr;
-        stats->write_syscalls   = p->syscw;
+        stats->write_syscalls   = p->ioac.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
        stats->read_bytes       = p->ioac.read_bytes;
        stats->write_bytes      = p->ioac.write_bytes;
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
        if (likely(tsk->mm)) {
-                long delta = cputime_to_jiffies(
+                cputime_t time, dtime;
-                        cputime_sub(tsk->stime, tsk->acct_stimexpd));
+                struct timeval value;
+                u64 delta;
+                time = tsk->stime + tsk->utime;
+                dtime = cputime_sub(time, tsk->acct_timexpd);
+                jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+                delta = value.tv_sec;
+                delta = delta * USEC_PER_SEC + value.tv_usec;
                if (delta == 0)
                        return;
-                tsk->acct_stimexpd = tsk->stime;
+                tsk->acct_timexpd = time;
                tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
                tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
        }
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
 */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-        tsk->acct_stimexpd = 0;
+        tsk->acct_timexpd = 0;
        tsk->acct_rss_mem1 = 0;
        tsk->acct_vm_mem1 = 0;
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ce7799540c91..4a26a1382df0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 }
 static void insert_work(struct cpu_workqueue_struct *cwq,
-                                struct work_struct *work, int tail)
+                        struct work_struct *work, struct list_head *head)
 {
        set_wq_data(work, cwq);
        /*
@@ -133,21 +133,17 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
         * result of list_add() below, see try_to_grab_pending().
         */
        smp_wmb();
-        if (tail)
+        list_add_tail(&work->entry, head);
-                list_add_tail(&work->entry, &cwq->worklist);
-        else
-                list_add(&work->entry, &cwq->worklist);
        wake_up(&cwq->more_work);
 }
-/* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
                         struct work_struct *work)
 {
        unsigned long flags;
        spin_lock_irqsave(&cwq->lock, flags);
-        insert_work(cwq, work, 1);
+        insert_work(cwq, work, &cwq->worklist);
        spin_unlock_irqrestore(&cwq->lock, flags);
 }
@@ -163,17 +159,39 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
+        int ret;
+        ret = queue_work_on(get_cpu(), wq, work);
+        put_cpu();
+        return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work);
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * "Already on a queue" is decided by the WORK_STRUCT_PENDING bit: it is
+ * claimed with test_and_set_bit(), and only the caller that finds it
+ * clear hands @work to the per-cpu queue selected by @cpu.
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ */
+int
+queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+{
         int ret = 0;
         if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
                 BUG_ON(!list_empty(&work->entry));
-                __queue_work(wq_per_cpu(wq, get_cpu()), work);
+                __queue_work(wq_per_cpu(wq, cpu), work);
-                put_cpu();
                 ret = 1;
         }
         return ret;
 }
-EXPORT_SYMBOL_GPL(queue_work);
+EXPORT_SYMBOL_GPL(queue_work_on);
 static void delayed_work_timer_fn(unsigned long __data)
 {
@@ -337,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work)
 }
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
-                                        struct wq_barrier *barr, int tail)
+                        struct wq_barrier *barr, struct list_head *head)
 {
        INIT_WORK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
        init_completion(&barr->done);
-        insert_work(cwq, &barr->work, tail);
+        insert_work(cwq, &barr->work, head);
 }
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -364,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
                active = 0;
                spin_lock_irq(&cwq->lock);
                if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-                        insert_wq_barrier(cwq, &barr, 1);
+                        insert_wq_barrier(cwq, &barr, &cwq->worklist);
                        active = 1;
                }
                spin_unlock_irq(&cwq->lock);
@@ -397,11 +415,62 @@ void flush_workqueue(struct workqueue_struct *wq)
        might_sleep();
        lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
        lock_release(&wq->lockdep_map, 1, _THIS_IP_);
-        for_each_cpu_mask(cpu, *cpu_map)
+        for_each_cpu_mask_nr(cpu, *cpu_map)
                flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * Returns 0 without waiting if @work has no cpu_workqueue recorded, if it
+ * is queued but was re-queued on a different cpu_workqueue under us, or
+ * if it is neither queued nor the current_work of its cpu_workqueue
+ * (i.e. it has already terminated).  Otherwise a barrier is inserted
+ * behind @work, the caller sleeps until that barrier completes, and 1 is
+ * returned.
+ *
+ * It is expected that, prior to calling flush_work(), the caller has
+ * arranged for the work to not be requeued, otherwise it doesn't make
+ * sense to use this function.
+ */
+int flush_work(struct work_struct *work)
+{
+        struct cpu_workqueue_struct *cwq;
+        struct list_head *prev;
+        struct wq_barrier barr;
+        might_sleep();
+        cwq = get_wq_data(work);
+        if (!cwq)
+                return 0;
+        lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+        lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+        /* prev stays NULL on every path that decides not to wait */
+        prev = NULL;
+        spin_lock_irq(&cwq->lock);
+        if (!list_empty(&work->entry)) {
+                /*
+                 * See the comment near try_to_grab_pending()->smp_rmb().
+                 * If it was re-queued under us we are not going to wait.
+                 */
+                smp_rmb();
+                if (unlikely(cwq != get_wq_data(work)))
+                        goto out;
+                /* still queued: the barrier goes directly after @work */
+                prev = &work->entry;
+        } else {
+                if (cwq->current_work != work)
+                        goto out;
+                /* running right now: the barrier goes to the worklist head */
+                prev = &cwq->worklist;
+        }
+        /*
+         * insert_work() does list_add_tail() on the position it is given,
+         * so passing prev->next places the barrier immediately after prev.
+         */
+        insert_wq_barrier(cwq, &barr, prev->next);
+out:
+        spin_unlock_irq(&cwq->lock);
+        if (!prev)
+                return 0;
+        wait_for_completion(&barr.done);
+        return 1;
+}
+EXPORT_SYMBOL_GPL(flush_work);
 /*
 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
 * so this work can't be re-armed in any way.
@@ -449,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
        spin_lock_irq(&cwq->lock);
        if (unlikely(cwq->current_work == work)) {
-                insert_wq_barrier(cwq, &barr, 0);
+                insert_wq_barrier(cwq, &barr, cwq->worklist.next);
                running = 1;
        }
        spin_unlock_irq(&cwq->lock);
@@ -477,7 +546,7 @@ static void wait_on_work(struct work_struct *work)
        wq = cwq->wq;
        cpu_map = wq_cpu_map(wq);
-        for_each_cpu_mask(cpu, *cpu_map)
+        for_each_cpu_mask_nr(cpu, *cpu_map)
                wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
@@ -553,6 +622,19 @@ int schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(schedule_work);
+/**
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu by handing it to queue_work_on()
+ * with the keventd_wq workqueue.
+ *
+ * Returns the value returned by queue_work_on().
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+        return queue_work_on(cpu, keventd_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
 /**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
@@ -607,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func)
                struct work_struct *work = per_cpu_ptr(works, cpu);
                INIT_WORK(work, func);
-                set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
+                schedule_work_on(cpu, work);
-                __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
        }
-        flush_workqueue(keventd_wq);
+        for_each_online_cpu(cpu)
+                flush_work(per_cpu_ptr(works, cpu));
        put_online_cpus();
        free_percpu(works);
        return 0;
@@ -747,11 +829,22 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
                err = create_workqueue_thread(cwq, singlethread_cpu);
                start_workqueue_thread(cwq, -1);
        } else {
-                get_online_cpus();
+                cpu_maps_update_begin();
+                /*
+                 * We must place this wq on list even if the code below fails.
+                 * cpu_down(cpu) can remove cpu from cpu_populated_map before
+                 * destroy_workqueue() takes the lock, in that case we leak
+                 * cwq[cpu]->thread.
+                 */
                spin_lock(&workqueue_lock);
                list_add(&wq->list, &workqueues);
                spin_unlock(&workqueue_lock);
+                /*
+                 * We must initialize cwqs for each possible cpu even if we
+                 * are going to call destroy_workqueue() finally. Otherwise
+                 * cpu_up() can hit the uninitialized cwq once we drop the
+                 * lock.
+                 */
                for_each_possible_cpu(cpu) {
                        cwq = init_cpu_workqueue(wq, cpu);
                        if (err || !cpu_online(cpu))
@@ -759,7 +852,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
                        err = create_workqueue_thread(cwq, cpu);
                        start_workqueue_thread(cwq, cpu);
                }
-                put_online_cpus();
+                cpu_maps_update_done();
        }
        if (err) {
@@ -773,8 +866,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
        /*
-         * Our caller is either destroy_workqueue() or CPU_DEAD,
+         * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
-         * get_online_cpus() protects cwq->thread.
+         * cpu_add_remove_lock protects cwq->thread.
         */
        if (cwq->thread == NULL)
                return;
@@ -784,7 +877,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
        flush_cpu_workqueue(cwq);
        /*
-         * If the caller is CPU_DEAD and cwq->worklist was not empty,
+         * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
         * a concurrent flush_workqueue() can insert a barrier after us.
         * However, in that case run_workqueue() won't return and check
         * kthread_should_stop() until it flushes all work_struct's.
@@ -808,14 +901,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
        const cpumask_t *cpu_map = wq_cpu_map(wq);
        int cpu;
-        get_online_cpus();
+        cpu_maps_update_begin();
        spin_lock(&workqueue_lock);
        list_del(&wq->list);
        spin_unlock(&workqueue_lock);
-        for_each_cpu_mask(cpu, *cpu_map)
+        for_each_cpu_mask_nr(cpu, *cpu_map)
                cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
-        put_online_cpus();
+        cpu_maps_update_done();
        free_percpu(wq->cpu_wq);
        kfree(wq);
@@ -829,6 +922,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        unsigned int cpu = (unsigned long)hcpu;
        struct cpu_workqueue_struct *cwq;
        struct workqueue_struct *wq;
+        int ret = NOTIFY_OK;
        action &= ~CPU_TASKS_FROZEN;
@@ -836,7 +930,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        case CPU_UP_PREPARE:
                cpu_set(cpu, cpu_populated_map);
        }
+undo:
        list_for_each_entry(wq, &workqueues, list) {
                cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -846,7 +940,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                                break;
                        printk(KERN_ERR "workqueue [%s] for %i failed\n",
                                wq->name, cpu);
-                        return NOTIFY_BAD;
+                        action = CPU_UP_CANCELED;
+                        ret = NOTIFY_BAD;
+                        goto undo;
                case CPU_ONLINE:
                        start_workqueue_thread(cwq, cpu);
@@ -854,7 +950,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                case CPU_UP_CANCELED:
                        start_workqueue_thread(cwq, -1);
-                case CPU_DEAD:
+                case CPU_POST_DEAD:
                        cleanup_workqueue_thread(cwq);
                        break;
                }
@@ -862,11 +958,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        switch (action) {
        case CPU_UP_CANCELED:
-        case CPU_DEAD:
+        case CPU_POST_DEAD:
                cpu_clear(cpu, cpu_populated_map);
        }
-        return NOTIFY_OK;
+        return ret;
 }
 void __init init_workqueues(void)