9 files changed, 90 insertions, 58 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d74044deb..00e8f2575512 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
 * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
 * (usually) grab cpuset_sem.  These are the two most performance
 * critical pieces of code here.  The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
+ * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
- * notify_on_release.  In that case, the cpuset_sem is taken, the
+ * is taken, and if the cpuset count is zero, a usermode call made
- * path to the released cpuset calculated, and a usermode call made
 * to /sbin/cpuset_release_agent with the name of the cpuset (path
 * relative to the root of cpuset file system) as the argument.
 *
@@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk)
 *
 * Description: Detach cpuset from @tsk and release it.
 *
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about dereferencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
 **/
 void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk)
        tsk->cpuset = NULL;
        task_unlock(tsk);
-        if (atomic_dec_and_test(&cs->count)) {
+        if (notify_on_release(cs)) {
                down(&cpuset_sem);
-                check_for_release(cs);
+                if (atomic_dec_and_test(&cs->count))
+                        check_for_release(cs);
                up(&cpuset_sem);
+        } else {
+                atomic_dec(&cs->count);
        }
 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46e11f3..436c7d93c00a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
 */
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
        [0 ... NR_IRQS-1] = {
+                .status = IRQ_DISABLED,
                .handler = &no_irq_type,
                .lock = SPIN_LOCK_UNLOCKED
        }
@@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
                 */
                desc->handler->ack(irq);
                action_ret = handle_IRQ_event(irq, regs, desc->action);
-                if (!noirqdebug)
-                        note_interrupt(irq, desc, action_ret);
                desc->handler->end(irq);
                return 1;
        }
diff --git a/kernel/module.c b/kernel/module.c
index 5734ab09d3f9..83b3d376708c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod,
                const char __user *uargs)
 {
        struct module *mod;
+        mm_segment_t old_fs = get_fs();
        int ret = 0;
        /* Must have permission */
@@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod,
                return PTR_ERR(mod);
        }
+        /* flush the icache in correct context */
+        set_fs(KERNEL_DS);
        /* Flush the instruction cache, since we've played with text */
        if (mod->module_init)
                flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod,
        flush_icache_range((unsigned long)mod->module_core,
                           (unsigned long)mod->module_core + mod->core_size);
+        set_fs(old_fs);
        /* Now sew it into the lists.  They won't access us, since
           strong_try_module_get() will fail. */
        stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf04a57..4cdebc972ff2 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state)
                goto Unlock;
        }
-        pr_debug("PM: Preparing system for suspend\n");
+        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
        if ((error = suspend_prepare(state)))
                goto Unlock;
-        pr_debug("PM: Entering state.\n");
+        pr_debug("PM: Entering %s sleep\n", pm_states[state]);
        error = suspend_enter(state);
-        pr_debug("PM: Finishing up.\n");
+        pr_debug("PM: Finishing wakeup.\n");
        suspend_finish(state);
 Unlock:
        up(&pm_sem);
diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07ce2c8a..01b58d7d17ff 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
 __setup("console=", console_setup);
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init.  Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
-        struct console_cmdline *c;
-        int i;
-        /*
-         *      See if this tty is not yet registered, and
-         *      if we have a slot free.
-         */
-        for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
-                if (strcmp(console_cmdline[i].name, name) == 0 &&
-                          console_cmdline[i].index == idx) {
-                                selected_console = i;
-                                return 0;
-                }
-        if (i == MAX_CMDLINECONSOLES)
-                return -E2BIG;
-        selected_console = i;
-        c = &console_cmdline[i];
-        memcpy(c->name, name, sizeof(c->name));
-        c->name[sizeof(c->name) - 1] = 0;
-        c->options = options;
-        c->index = idx;
-        return 0;
-}
 static int __init log_buf_len_setup(char *str)
 {
        unsigned long size = memparse(str, &str);
@@ -671,6 +635,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
 #endif
 /**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init.  Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+        struct console_cmdline *c;
+        int i;
+        /*
+         *      See if this tty is not yet registered, and
+         *      if we have a slot free.
+         */
+        for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+                if (strcmp(console_cmdline[i].name, name) == 0 &&
+                          console_cmdline[i].index == idx) {
+                                selected_console = i;
+                                return 0;
+                }
+        if (i == MAX_CMDLINECONSOLES)
+                return -E2BIG;
+        selected_console = i;
+        c = &console_cmdline[i];
+        memcpy(c->name, name, sizeof(c->name));
+        c->name[sizeof(c->name) - 1] = 0;
+        c->options = options;
+        c->index = idx;
+        return 0;
+}
+/**
 * acquire_console_sem - lock the console system for exclusive use.
 *
 * Acquires a semaphore which guarantees that the caller has
diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50ca867..ad8cbb75ffa2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
 static int __init profile_setup(char * str)
 {
+        static char __initdata schedstr[] = "schedule";
        int par;
-        if (!strncmp(str, "schedule", 8)) {
+        if (!strncmp(str, schedstr, strlen(schedstr))) {
                prof_on = SCHED_PROFILING;
-                printk(KERN_INFO "kernel schedule profiling enabled\n");
+                if (str[strlen(schedstr)] == ',')
-                if (str[7] == ',')
+                        str += strlen(schedstr) + 1;
-                        str += 8;
+                if (get_option(&str, &par))
-        }
+                        prof_shift = par;
-        if (get_option(&str,&par)) {
+                printk(KERN_INFO
+                        "kernel schedule profiling enabled (shift: %ld)\n",
+                        prof_shift);
+        } else if (get_option(&str, &par)) {
                prof_shift = par;
                prof_on = CPU_PROFILING;
                printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158667a2..66b2ed784822 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4243,7 +4243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
        /* No more Mr. Nice Guy. */
        if (dest_cpu == NR_CPUS) {
-                tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+                cpus_setall(tsk->cpus_allowed);
                dest_cpu = any_online_cpu(tsk->cpus_allowed);
                /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c5b..b3c24c732c5a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
        int sig = 0;
-        sig = next_signal(pending, mask);
+        /* SIGKILL must have priority, otherwise it is quite easy
+         * to create an unkillable process, sending sig < SIGKILL
+         * to self */
+        if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+                if (!sigismember(mask, SIGKILL))
+                        sig = SIGKILL;
+        }
+        if (likely(!sig))
+                sig = next_signal(pending, mask);
        if (sig) {
                if (current->notifier) {
                        if (sigismember(current->notifier_mask, sig)) {
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17863f1..0c3f9d8bbe17 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
        _raw_spin_unlock(lock);
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
        _raw_read_unlock(lock);
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
 }
 EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
        _raw_write_unlock(lock);
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
 }
 EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
        if (_raw_spin_trylock(lock))
                return 1;
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
        return 0;
 }