Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            |    2
-rw-r--r--  kernel/acct.c              |   45
-rw-r--r--  kernel/audit.c             |  128
-rw-r--r--  kernel/auditsc.c           |  327
-rw-r--r--  kernel/compat.c            |    9
-rw-r--r--  kernel/cpuset.c            |  229
-rw-r--r--  kernel/exit.c              |   35
-rw-r--r--  kernel/fork.c              |  106
-rw-r--r--  kernel/futex.c             |  137
-rw-r--r--  kernel/intermodule.c       |    3
-rw-r--r--  kernel/irq/handle.c        |    2
-rw-r--r--  kernel/irq/manage.c        |    4
-rw-r--r--  kernel/irq/proc.c          |   14
-rw-r--r--  kernel/kfifo.c             |    4
-rw-r--r--  kernel/kprobes.c           |   94
-rw-r--r--  kernel/module.c            |   44
-rw-r--r--  kernel/params.c            |   14
-rw-r--r--  kernel/posix-cpu-timers.c  |   91
-rw-r--r--  kernel/posix-timers.c      |   30
-rw-r--r--  kernel/power/Kconfig       |    3
-rw-r--r--  kernel/power/disk.c        |    6
-rw-r--r--  kernel/power/pm.c          |    3
-rw-r--r--  kernel/power/power.h       |    7
-rw-r--r--  kernel/power/swsusp.c      |   37
-rw-r--r--  kernel/printk.c            |   20
-rw-r--r--  kernel/ptrace.c            |   41
-rw-r--r--  kernel/rcupdate.c          |   27
-rw-r--r--  kernel/resource.c          |    3
-rw-r--r--  kernel/sched.c             |  618
-rw-r--r--  kernel/signal.c            |  173
-rw-r--r--  kernel/softirq.c           |    2
-rw-r--r--  kernel/softlockup.c        |  151
-rw-r--r--  kernel/spinlock.c          |   15
-rw-r--r--  kernel/sys.c               |   61
-rw-r--r--  kernel/time.c              |    1
-rw-r--r--  kernel/timer.c             |   55
-rw-r--r--  kernel/workqueue.c         |    5
37 files changed, 1866 insertions(+), 680 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
 }
 
 /*
- * Close the old accouting file (if currently open) and then replace
+ * Close the old accounting file (if currently open) and then replace
  * it with file (if non-NULL).
  *
  * NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
 }
 }
 
-/*
- * sys_acct() is the only system call needed to implement process
- * accounting. It takes the name of the file where accounting records
- * should be written. If the filename is NULL, accounting will be
- * shutdown.
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
  */
 asmlinkage long sys_acct(const char __user *name)
 {
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
 return (PTR_ERR(tmp));
 }
 /* Difference from BSD - they don't do O_APPEND */
- file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+ file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 putname(tmp);
 if (IS_ERR(file)) {
 return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
 return (0);
 }
 
-/*
- * If the accouting is turned on for a file in the filesystem pointed
- * to by sb, turn accouting off.
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
  */
 void acct_auto_close(struct super_block *sb)
 {
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
 set_fs(fs);
 }
 
-/*
+/**
  * acct_process - now just a wrapper around do_acct_process
+ * @exitcode: task exit code
+ *
+ * handles process accounting for an exiting task
  */
 void acct_process(long exitcode)
 {
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
 }
 
 
-/*
- * acct_update_integrals
- * - update mm integral fields in task_struct
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
  */
 void acct_update_integrals(struct task_struct *tsk)
 {
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
 }
 }
 
-/*
- * acct_clear_integrals
- * - clear the mm integral fields in task_struct
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
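The acct.c hunks above mostly convert plain block comments into kernel-doc. For reference, a minimal sketch of the kernel-doc layout those hunks follow; the function here is hypothetical and only illustrates the format, it is not part of the patch:

/**
 * example_enable - switch the example feature on or off
 * @name: file to write records to, or NULL to disable the feature
 *
 * Returns 0 for success or a negative errno value for failure.
 *
 * The short one-line summary comes first, then one line per parameter,
 * then the longer free-form description, exactly as in the sys_acct()
 * and acct_auto_close() comments above.
 */
static int example_enable(const char *name);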
diff --git a/kernel/audit.c b/kernel/audit.c
index 7f0699790d46..aefa73a8a586 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -79,6 +79,8 @@ static int audit_rate_limit;
 
 /* Number of outstanding audit_buffers allowed. */
 static int audit_backlog_limit = 64;
+static int audit_backlog_wait_time = 60 * HZ;
+static int audit_backlog_wait_overflow = 0;
 
 /* The identity of the user shutting down the audit system. */
 uid_t audit_sig_uid = -1;
@@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist);
 static struct sk_buff_head audit_skb_queue;
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
-
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
 /* The netlink socket is only to be read by 1 CPU, which lets us assume
  * that list additions and deletions never happen simultaneously in
  * auditsc.c */
-static DECLARE_MUTEX(audit_netlink_sem);
+DECLARE_MUTEX(audit_netlink_sem);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -137,6 +133,7 @@ struct audit_buffer {
 struct list_head list;
 struct sk_buff *skb; /* formatted skb ready to send */
 struct audit_context *ctx; /* NULL or associated context */
+ int gfp_mask;
 };
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 nlh->nlmsg_pid = pid;
 }
 
-struct audit_entry {
- struct list_head list;
- struct audit_rule rule;
-};
-
 static void audit_panic(const char *message)
 {
 switch (audit_failure)
@@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
 {
 int old = audit_rate_limit;
 audit_rate_limit = limit;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_rate_limit=%d old=%d by auid=%u",
 audit_rate_limit, old, loginuid);
 return old;
@@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
 {
 int old = audit_backlog_limit;
 audit_backlog_limit = limit;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_backlog_limit=%d old=%d by auid=%u",
 audit_backlog_limit, old, loginuid);
 return old;
@@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid)
 if (state != 0 && state != 1)
 return -EINVAL;
 audit_enabled = state;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_enabled=%d old=%d by auid=%u",
 audit_enabled, old, loginuid);
 return old;
@@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid)
 && state != AUDIT_FAIL_PANIC)
 return -EINVAL;
 audit_failure = state;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_failure=%d old=%d by auid=%u",
 audit_failure, old, loginuid);
 return old;
@@ -281,6 +273,7 @@ int kauditd_thread(void *dummy)
 
 while (1) {
 skb = skb_dequeue(&audit_skb_queue);
+ wake_up(&audit_backlog_wait);
 if (skb) {
 if (audit_pid) {
 int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
@@ -290,7 +283,7 @@ int kauditd_thread(void *dummy)
 audit_pid = 0;
 }
 } else {
- printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+ printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
 kfree_skb(skb);
 }
 } else {
@@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 if (status_get->mask & AUDIT_STATUS_PID) {
 int old = audit_pid;
 audit_pid = status_get->pid;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 "audit_pid=%d old=%d by auid=%u",
 audit_pid, old, loginuid);
 }
@@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 break;
 case AUDIT_USER:
 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- ab = audit_log_start(NULL, msg_type);
- if (!ab)
- break; /* audit_panic has been called */
- audit_log_format(ab,
- "user pid=%d uid=%u auid=%u"
- " msg='%.1024s'",
- pid, uid, loginuid, (char *)data);
- audit_set_pid(ab, pid);
- audit_log_end(ab);
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+ return 0;
+
+ err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+ if (err == 1) {
+ err = 0;
+ ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ if (ab) {
+ audit_log_format(ab,
+ "user pid=%d uid=%u auid=%u msg='%.1024s'",
+ pid, uid, loginuid, (char *)data);
+ audit_set_pid(ab, pid);
+ audit_log_end(ab);
+ }
+ }
 break;
 case AUDIT_ADD:
 case AUDIT_DEL:
@@ -523,7 +522,7 @@ static int __init audit_init(void)
 skb_queue_head_init(&audit_skb_queue);
 audit_initialized = 1;
 audit_enabled = audit_default;
- audit_log(NULL, AUDIT_KERNEL, "initialized");
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
 return 0;
 }
 __initcall(audit_init);
@@ -561,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
 }
 
 static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
- int gfp_mask, int type)
+ gfp_t gfp_mask, int type)
 {
 unsigned long flags;
 struct audit_buffer *ab = NULL;
@@ -587,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 goto err;
 
 ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
 nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
 nlh->nlmsg_type = type;
 nlh->nlmsg_flags = 0;
@@ -606,26 +606,27 @@ err:
  * (timestamp,serial) tuple is unique for each syscall and is live from
  * syscall entry to syscall exit.
  *
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
  * NOTE: Another possibility is to store the formatted records off the
  * audit context (for those records that have a context), and emit them
  * all at syscall exit. However, this could delay the reporting of
  * significant errors until syscall exit (or never, if the system
  * halts). */
+
 unsigned int audit_serial(void)
 {
- static atomic_t serial = ATOMIC_INIT(0xffffff);
- unsigned int a, b;
+ static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+ static unsigned int serial = 0;
+
+ unsigned long flags;
+ unsigned int ret;
 
+ spin_lock_irqsave(&serial_lock, flags);
 do {
- a = atomic_read(&serial);
- if (atomic_dec_and_test(&serial))
- atomic_set(&serial, 0xffffff);
- b = atomic_read(&serial);
- } while (b != a - 1);
+ ret = ++serial;
+ } while (unlikely(!ret));
+ spin_unlock_irqrestore(&serial_lock, flags);
 
- return 0xffffff - b;
+ return ret;
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx,
@@ -645,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx,
  * syscall, then the syscall is marked as auditable and an audit record
  * will be written at syscall exit. If there is no associated task, tsk
  * should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
+
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask,
+ int type)
 {
 struct audit_buffer *ab = NULL;
 struct timespec t;
 unsigned int serial;
+ int reserve;
+ unsigned long timeout_start = jiffies;
 
 if (!audit_initialized)
 return NULL;
 
- if (audit_backlog_limit
- && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+ if (gfp_mask & __GFP_WAIT)
+ reserve = 0;
+ else
+ reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
+
+ while (audit_backlog_limit
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+ && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+ /* Wait for auditd to drain the queue a little */
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ continue;
+ }
 if (audit_rate_check())
 printk(KERN_WARNING
 "audit: audit_backlog=%d > "
@@ -663,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
 skb_queue_len(&audit_skb_queue),
 audit_backlog_limit);
 audit_log_lost("backlog limit exceeded");
+ audit_backlog_wait_time = audit_backlog_wait_overflow;
+ wake_up(&audit_backlog_wait);
 return NULL;
 }
 
- ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
+ ab = audit_buffer_alloc(ctx, gfp_mask, type);
 if (!ab) {
 audit_log_lost("out of memory in audit_log_start");
 return NULL;
@@ -690,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
 {
 struct sk_buff *skb = ab->skb;
 int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
- GFP_ATOMIC);
+ ab->gfp_mask);
 if (ret < 0) {
 audit_log_lost("out of memory in audit_expand");
 return 0;
@@ -809,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 audit_log_format(ab, " %s", prefix);
 
 /* We will allow 11 spaces for ' (deleted)' to be appended */
- path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+ path = kmalloc(PATH_MAX+11, ab->gfp_mask);
 if (!path) {
 audit_log_format(ab, "<no memory>");
 return;
@@ -841,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab)
 ab->skb = NULL;
 wake_up_interruptible(&kauditd_wait);
 } else {
- printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+ printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0));
 }
 }
 audit_buffer_free(ab);
@@ -850,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record. This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end. It may be
  * called in any context. */
-void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
+void audit_log(struct audit_context *ctx, int gfp_mask, int type,
+ const char *fmt, ...)
 {
 struct audit_buffer *ab;
 va_list args;
 
- ab = audit_log_start(ctx, type);
+ ab = audit_log_start(ctx, gfp_mask, type);
 if (ab) {
 va_start(args, fmt);
 audit_log_vformat(ab, fmt, args);
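The audit.c hunks above replace the 24-bit atomic count-down in audit_serial() with a spinlock-protected counter that wraps but never hands out zero (zero now means "no serial assigned yet"; see the context->serial change in auditsc.c below). A condensed, self-contained sketch of that pattern, using illustrative names rather than the patch's statics:

#include <linux/spinlock.h>
#include <linux/compiler.h>

static spinlock_t example_serial_lock = SPIN_LOCK_UNLOCKED;
static unsigned int example_serial;

unsigned int example_next_serial(void)
{
	unsigned long flags;
	unsigned int ret;

	spin_lock_irqsave(&example_serial_lock, flags);
	do {
		ret = ++example_serial;		/* wraps to 0 after 0xffffffff... */
	} while (unlikely(!ret));		/* ...so skip 0, which means "unset" */
	spin_unlock_irqrestore(&example_serial_lock, flags);

	return ret;
}

A spinlock is used instead of an atomic_t because, as the removed comment noted, atomic values were only guaranteed to be 24 bits wide on some architectures; the lock gives the counter the full 32-bit range.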
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e75f84e1a1a0..88696f639aab 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -39,6 +39,9 @@
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include <linux/compiler.h>
 #include <asm/unistd.h>
 
 /* 0 = no checking
@@ -95,6 +98,7 @@ struct audit_names {
 uid_t uid;
 gid_t gid;
 dev_t rdev;
+ unsigned flags;
 };
 
 struct audit_aux_data {
@@ -167,9 +171,16 @@ struct audit_context {
 /* There are three lists of rules -- one to search at task creation
  * time, one to search at syscall entry time, and another to search at
  * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_filter_list[0]),
+ LIST_HEAD_INIT(audit_filter_list[1]),
+ LIST_HEAD_INIT(audit_filter_list[2]),
+ LIST_HEAD_INIT(audit_filter_list[3]),
+ LIST_HEAD_INIT(audit_filter_list[4]),
+#if AUDIT_NR_FILTERS != 5
+#error Fix audit_filter_list initialiser
+#endif
+};
 
 struct audit_entry {
 struct list_head list;
@@ -179,9 +190,36 @@ struct audit_entry {
 
 extern int audit_pid;
 
+/* Copy rule from user-space to kernel-space. Called from
+ * audit_add_rule during AUDIT_ADD. */
+static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+ int i;
+
+ if (s->action != AUDIT_NEVER
+ && s->action != AUDIT_POSSIBLE
+ && s->action != AUDIT_ALWAYS)
+ return -1;
+ if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+ return -1;
+ if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
+ return -1;
+
+ d->flags = s->flags;
+ d->action = s->action;
+ d->field_count = s->field_count;
+ for (i = 0; i < d->field_count; i++) {
+ d->fields[i] = s->fields[i];
+ d->values[i] = s->values[i];
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ return 0;
+}
+
 /* Check to see if two rules are identical. It is called from
+ * audit_add_rule during AUDIT_ADD and
  * audit_del_rule during AUDIT_DEL. */
-static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 {
 int i;
 
@@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 /* Note that audit_add_rule and audit_del_rule are called via
  * audit_receive() in audit.c, and are protected by
  * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_entry *entry,
+static inline int audit_add_rule(struct audit_rule *rule,
 struct list_head *list)
 {
- if (entry->rule.flags & AUDIT_PREPEND) {
- entry->rule.flags &= ~AUDIT_PREPEND;
+ struct audit_entry *entry;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * addition routine. */
+ list_for_each_entry(entry, list, list) {
+ if (!audit_compare_rule(rule, &entry->rule)) {
+ return -EEXIST;
+ }
+ }
+
+ if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+ return -ENOMEM;
+ if (audit_copy_rule(&entry->rule, rule)) {
+ kfree(entry);
+ return -EINVAL;
+ }
+
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+ entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 list_add_rcu(&entry->list, list);
 } else {
 list_add_tail_rcu(&entry->list, list);
 }
+
 return 0;
 }
 
-static void audit_free_rule(struct rcu_head *head)
+static inline void audit_free_rule(struct rcu_head *head)
 {
 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
 kfree(e);
@@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule,
 return 0;
 }
 }
- return -EFAULT; /* No matching rule */
+ return -ENOENT; /* No matching rule */
 }
 
-/* Copy rule from user-space to kernel-space. Called during
- * AUDIT_ADD. */
-static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+static int audit_list_rules(void *_dest)
 {
+ int pid, seq;
+ int *dest = _dest;
+ struct audit_entry *entry;
 int i;
 
- if (s->action != AUDIT_NEVER
- && s->action != AUDIT_POSSIBLE
- && s->action != AUDIT_ALWAYS)
- return -1;
- if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
- return -1;
+ pid = dest[0];
+ seq = dest[1];
+ kfree(dest);
 
- d->flags = s->flags;
- d->action = s->action;
- d->field_count = s->field_count;
- for (i = 0; i < d->field_count; i++) {
- d->fields[i] = s->fields[i];
- d->values[i] = s->values[i];
+ down(&audit_netlink_sem);
+
+ /* The *_rcu iterators not needed here because we are
+ always called with audit_netlink_sem held. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(entry, &audit_filter_list[i], list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
 }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+
+ up(&audit_netlink_sem);
 return 0;
 }
 
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 uid_t loginuid)
 {
- u32 flags;
- struct audit_entry *entry;
+ struct task_struct *tsk;
+ int *dest;
 int err = 0;
+ unsigned listnr;
 
 switch (type) {
 case AUDIT_LIST:
- /* The *_rcu iterators not needed here because we are
- always called with audit_netlink_sem held. */
- list_for_each_entry(entry, &audit_tsklist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- list_for_each_entry(entry, &audit_entlist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- list_for_each_entry(entry, &audit_extlist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest[0] = pid;
+ dest[1] = seq;
+
+ tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+ if (IS_ERR(tsk)) {
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
 break;
 case AUDIT_ADD:
- if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
- return -ENOMEM;
- if (audit_copy_rule(&entry->rule, data)) {
- kfree(entry);
+ listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+ if (listnr >= AUDIT_NR_FILTERS)
 return -EINVAL;
- }
- flags = entry->rule.flags;
- if (!err && (flags & AUDIT_PER_TASK))
- err = audit_add_rule(entry, &audit_tsklist);
- if (!err && (flags & AUDIT_AT_ENTRY))
- err = audit_add_rule(entry, &audit_entlist);
- if (!err && (flags & AUDIT_AT_EXIT))
- err = audit_add_rule(entry, &audit_extlist);
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
- "auid=%u added an audit rule\n", loginuid);
+
+ err = audit_add_rule(data, &audit_filter_list[listnr]);
+ if (!err)
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u added an audit rule\n", loginuid);
 break;
 case AUDIT_DEL:
- flags =((struct audit_rule *)data)->flags;
- if (!err && (flags & AUDIT_PER_TASK))
- err = audit_del_rule(data, &audit_tsklist);
- if (!err && (flags & AUDIT_AT_ENTRY))
- err = audit_del_rule(data, &audit_entlist);
- if (!err && (flags & AUDIT_AT_EXIT))
- err = audit_del_rule(data, &audit_extlist);
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
- "auid=%u removed an audit rule\n", loginuid);
+ listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+ if (listnr >= AUDIT_NR_FILTERS)
+ return -EINVAL;
+
+ err = audit_del_rule(data, &audit_filter_list[listnr]);
+ if (!err)
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u removed an audit rule\n", loginuid);
 break;
 default:
 return -EINVAL;
@@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 result = (ctx->return_code == value);
 break;
 case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid)
- result = (ctx->return_valid == AUDITSC_SUCCESS);
+ if (ctx && ctx->return_valid) {
+ if (value)
+ result = (ctx->return_valid == AUDITSC_SUCCESS);
+ else
+ result = (ctx->return_valid == AUDITSC_FAILURE);
+ }
 break;
 case AUDIT_DEVMAJOR:
 if (ctx) {
@@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 enum audit_state state;
 
 rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
 rcu_read_unlock();
 return state;
@@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 struct list_head *list)
 {
 struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit
+ && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+ struct audit_rule *rule,
+ enum audit_state *state)
+{
+ int i;
+
+ for (i = 0; i < rule->field_count; i++) {
+ u32 field = rule->fields[i] & ~AUDIT_NEGATE;
+ u32 value = rule->values[i];
+ int result = 0;
+
+ switch (field) {
+ case AUDIT_PID:
+ result = (cb->creds.pid == value);
+ break;
+ case AUDIT_UID:
+ result = (cb->creds.uid == value);
+ break;
+ case AUDIT_GID:
+ result = (cb->creds.gid == value);
+ break;
+ case AUDIT_LOGINUID:
+ result = (cb->loginuid == value);
+ break;
+ }
+
+ if (rule->fields[i] & AUDIT_NEGATE)
+ result = !result;
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+ struct audit_entry *e;
 enum audit_state state;
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
+ int ret = 1;
 
 rcu_read_lock();
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit
- && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
- rcu_read_unlock();
- return state;
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+ if (audit_filter_user_rules(cb, &e->rule, &state)) {
+ if (state == AUDIT_DISABLED)
+ ret = 0;
+ break;
 }
 }
 rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+
+ return ret; /* Audit by default */
 }
 
 /* This should be called with task_lock() held. */
@@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 if (context->in_syscall && !context->auditable) {
 enum audit_state state;
- state = audit_filter_syscall(tsk, context, &audit_extlist);
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
 if (state == AUDIT_RECORD_CONTEXT)
 context->auditable = 1;
 }
@@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab)
 up_read(&mm->mmap_sem);
 }
 
-static void audit_log_exit(struct audit_context *context)
+static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask)
 {
 int i;
 struct audit_buffer *ab;
 struct audit_aux_data *aux;
 
- ab = audit_log_start(context, AUDIT_SYSCALL);
+ ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
 if (!ab)
 return; /* audit_panic has been called */
 audit_log_format(ab, "arch=%x syscall=%d",
@@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context)
 
 for (aux = context->aux; aux; aux = aux->next) {
 
- ab = audit_log_start(context, aux->type);
+ ab = audit_log_start(context, GFP_KERNEL, aux->type);
 if (!ab)
 continue; /* audit_panic has been called */
 
@@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context)
 }
 
 if (context->pwd && context->pwdmnt) {
- ab = audit_log_start(context, AUDIT_CWD);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 if (ab) {
 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
 audit_log_end(ab);
 }
 }
 for (i = 0; i < context->name_count; i++) {
- ab = audit_log_start(context, AUDIT_PATH);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
 if (!ab)
 continue; /* audit_panic has been called */
 
@@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context)
 audit_log_format(ab, " name=");
 audit_log_untrustedstring(ab, context->names[i].name);
 }
+ audit_log_format(ab, " flags=%x\n", context->names[i].flags);
+
 if (context->names[i].ino != (unsigned long)-1)
 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
 " ouid=%u ogid=%u rdev=%02x:%02x",
@@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk)
 return;
 
 /* Check for system calls that do not go through the exit
- * function (e.g., exit_group), then free context block. */
- if (context->in_syscall && context->auditable && context->pid != audit_pid)
- audit_log_exit(context);
+ * function (e.g., exit_group), then free context block.
+ * We use GFP_ATOMIC here because we might be doing this
+ * in the context of the idle thread */
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, GFP_ATOMIC);
 
 audit_free_context(context);
 }
@@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 
 state = context->state;
 if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
- state = audit_filter_syscall(tsk, context, &audit_entlist);
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
 if (likely(state == AUDIT_DISABLED))
 return;
 
- context->serial = audit_serial();
+ context->serial = 0;
 context->ctime = CURRENT_TIME;
 context->in_syscall = 1;
 context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
@@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 /* Not having a context here is ok, since the parent may have
  * called __put_task_struct. */
 if (likely(!context))
- return;
+ goto out;
 
- if (context->in_syscall && context->auditable && context->pid != audit_pid)
- audit_log_exit(context);
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, GFP_KERNEL);
 
 context->in_syscall = 0;
 context->auditable = 0;
@@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 } else {
 audit_free_names(context);
 audit_free_aux(context);
- audit_zero_context(context, context->state);
 tsk->audit_context = context;
 }
+ out:
 put_task_struct(tsk);
 }
 
@@ -996,7 +1124,7 @@ void audit_putname(const char *name)
 
 /* Store the inode and device from a lookup. Called from
  * fs/namei.c:path_lookup(). */
-void audit_inode(const char *name, const struct inode *inode)
+void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 {
 int idx;
 struct audit_context *context = current->audit_context;
@@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode)
 ++context->ino_count;
 #endif
 }
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
+ context->names[idx].flags = flags;
+ context->names[idx].ino = inode->i_ino;
+ context->names[idx].dev = inode->i_sb->s_dev;
+ context->names[idx].mode = inode->i_mode;
+ context->names[idx].uid = inode->i_uid;
+ context->names[idx].gid = inode->i_gid;
+ context->names[idx].rdev = inode->i_rdev;
 }
 
 void auditsc_get_stamp(struct audit_context *ctx,
 struct timespec *t, unsigned int *serial)
 {
+ if (!ctx->serial)
+ ctx->serial = audit_serial();
 t->tv_sec = ctx->ctime.tv_sec;
 t->tv_nsec = ctx->ctime.tv_nsec;
 *serial = ctx->serial;
@@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 if (task->audit_context) {
 struct audit_buffer *ab;
 
- ab = audit_log_start(NULL, AUDIT_LOGIN);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
 if (ab) {
 audit_log_format(ab, "login pid=%d uid=%u "
 "old auid=%u new auid=%u",
@@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t)
 extern pid_t audit_sig_pid;
 extern uid_t audit_sig_uid;
 
- if (unlikely(audit_pid && t->pid == audit_pid)) {
+ if (unlikely(audit_pid && t->tgid == audit_pid)) {
 if (sig == SIGTERM || sig == SIGHUP) {
 struct audit_context *ctx = current->audit_context;
 audit_sig_pid = current->pid;
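The auditsc.c hunks fold the old audit_tsklist/audit_entlist/audit_extlist into a single audit_filter_list[] array indexed by filter class (the diff references AUDIT_FILTER_TASK, AUDIT_FILTER_ENTRY, AUDIT_FILTER_EXIT and AUDIT_FILTER_USER; AUDIT_NR_FILTERS is 5), still read under RCU and written only under audit_netlink_sem. A minimal sketch of that read-side pattern; the struct and function names here are illustrative, only the list/iterator usage mirrors the patch:

#include <linux/list.h>
#include <linux/rcupdate.h>

struct example_rule {
	struct list_head list;
	int		 value;
};

#define EXAMPLE_NR_CLASSES 5
static struct list_head example_filter_list[EXAMPLE_NR_CLASSES];	/* INIT_LIST_HEAD() at init time */

/* Read side: walk one class under rcu_read_lock(), the way
 * audit_filter_task() and audit_filter_syscall() do after this patch. */
static int example_match(int class, int value)
{
	struct example_rule *r;
	int hit = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(r, &example_filter_list[class], list) {
		if (r->value == value) {
			hit = 1;
			break;
		}
	}
	rcu_read_unlock();
	return hit;
}

The write side stays as the audit_add_rule()/audit_free_rule() hunks above show: entries are added with list_add_rcu()/list_add_tail_rcu() and freed through an RCU callback, so readers never see a half-updated list.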
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 if (!time_after(expire, now))
 return 0;
 
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
 if (expire == 0)
 return 0;
 
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 return -EINVAL;
 
 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
 if (expire == 0)
 return 0;
 
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 recalc_sigpending();
 spin_unlock_irq(&current->sighand->siglock);
 
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
 
 spin_lock_irq(&current->sighand->siglock);
 sig = dequeue_signal(current, &s, &info);
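Each compat.c hunk collapses the open-coded "set TASK_INTERRUPTIBLE, then schedule_timeout()" pair into schedule_timeout_interruptible(). The diffstat shows kernel/timer.c changing in the same series, where that helper lives; assuming it is the obvious wrapper, the before/after looks like this (function names here are illustrative):

#include <linux/sched.h>

/* Before: the caller has to remember to set the task state first. */
static signed long example_sleep_old(signed long timeout)
{
	current->state = TASK_INTERRUPTIBLE;
	return schedule_timeout(timeout);
}

/* After: one call that sets the state and sleeps. */
static signed long example_sleep_new(signed long timeout)
{
	return schedule_timeout_interruptible(timeout);
}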
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..28176d083f7b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
  */
 
 static DECLARE_MUTEX(cpuset_sem);
+static struct task_struct *cpuset_sem_owner;
+static int cpuset_sem_depth;
+
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a tasks mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+ if (cpuset_sem_owner != current) {
+ down(psem);
+ cpuset_sem_owner = current;
+ }
+ cpuset_sem_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+ if (--cpuset_sem_depth == 0) {
+ cpuset_sem_owner = NULL;
+ up(psem);
+ }
+}
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Refresh current tasks mems_allowed and mems_generation from
  * current tasks cpuset. Call with cpuset_sem held.
  *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation. Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem. Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the tasks context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
@@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
-/*
- * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
- * Disable letting 'cpu_exclusive' cpusets define dynamic sched
- * domains, until the sched domain can handle partial nodes.
- * Remove this #if hackery when sched domains fixed.
- */
-#if 0
 static void update_cpu_domains(struct cpuset *cur)
 {
 struct cpuset *c, *par = cur->parent;
@@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur)
 partition_sched_domains(&pspan, &cspan);
 unlock_cpu_hotplug();
 }
-#else
-static void update_cpu_domains(struct cpuset *cur)
-{
-}
-#endif
 
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
@@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 }
 buffer[nbytes] = 0; /* nul-terminate */
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 
 if (is_removed(cs)) {
 retval = -ENODEV;
@@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 if (retval == 0)
 retval = nbytes;
 out2:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 cpuset_release_agent(pathbuf);
 out1:
 kfree(buffer);
@@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 cpumask_t mask;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 mask = cs->cpus_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 nodemask_t mask;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 mask = cs->mems_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -953,8 +968,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 char *page;
 ssize_t retval = 0;
 char *s;
- char *start;
- size_t n;
 
 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
 return -ENOMEM;
@@ -984,10 +997,7 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 *s++ = '\n';
 *s = '\0';
 
- start = page + *ppos;
- n = s - start;
- retval = n - copy_to_user(buf, start, min(n, nbytes));
- *ppos += retval;
+ retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
 out:
 free_page((unsigned long)page);
 return retval;
@@ -1342,8 +1352,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 if (!cs)
 return -ENOMEM;
 
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
 cs->flags = 0;
 if (notify_on_release(parent))
 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1368,14 +1377,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
  * will down() this new directory's i_sem and if we race with
  * another mkdir, we might deadlock.
  */
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 err = cpuset_populate_dir(cs->dentry);
 /* If err < 0, we have a half-filled directory - oh well ;) */
 return 0;
 err:
 list_del(&cs->sibling);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 kfree(cs);
 return err;
 }
@@ -1397,14 +1406,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 /* the vfs holds both inode->i_sem already */
 
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
 if (atomic_read(&cs->count) > 0) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 return -EBUSY;
 }
 if (!list_empty(&cs->children)) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 return -EBUSY;
 }
 parent = cs->parent;
@@ -1420,7 +1428,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 spin_unlock(&d->d_lock);
 cpuset_d_remove_dir(d);
 dput(d);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 cpuset_release_agent(pathbuf);
 return 0;
 }
@@ -1523,10 +1531,10 @@ void cpuset_exit(struct task_struct *tsk)
 if (notify_on_release(cs)) {
 char *pathbuf = NULL;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 if (atomic_dec_and_test(&cs->count))
 check_for_release(cs, &pathbuf);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 cpuset_release_agent(pathbuf);
 } else {
 atomic_dec(&cs->count);
@@ -1547,11 +1555,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 cpumask_t mask;
 
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 task_lock((struct task_struct *)tsk);
 guarantee_online_cpus(tsk->cpuset, &mask);
 task_unlock((struct task_struct *)tsk);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 
 return mask;
 }
@@ -1576,9 +1584,9 @@ void cpuset_update_current_mems_allowed(void)
 if (!cs)
 return; /* task is exiting */
 if (current->cpuset_mems_generation != cs->mems_generation) {
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
 refresh_mems();
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
 }
 }
 
@@ -1611,17 +1619,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1611 return 0; 1619 return 0;
1612} 1620}
1613 1621
1622/*
1623 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1624 * ancestor to the specified cpuset. Call while holding cpuset_sem.
1625 * If no ancestor is mem_exclusive (an unusual configuration), then
1626 * returns the root cpuset.
1627 */
1628static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1629{
1630 while (!is_mem_exclusive(cs) && cs->parent)
1631 cs = cs->parent;
1632 return cs;
1633}
1634
1614/** 1635/**
1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed 1636 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
1616 * @z: zone in question 1637 * @z: is this zone on an allowed node?
1638 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
1617 * 1639 *
1618 * Is zone z allowed in current->mems_allowed, or is 1640 * If we're in interrupt, yes, we can always allocate. If zone
1619 * the CPU in interrupt context? (zone is always allowed in this case) 1641 * z's node is in our task's mems_allowed, yes. If it's not a
1620 */ 1642 * __GFP_HARDWALL request and this zone's node is in the nearest
1621int cpuset_zone_allowed(struct zone *z) 1643 * mem_exclusive cpuset ancestor to this task's cpuset, yes.
1644 * Otherwise, no.
1645 *
1646 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1647 * and do not allow allocations outside the current task's cpuset.
1648 * GFP_KERNEL allocations are not so marked, so can escape to the
1649 * nearest mem_exclusive ancestor cpuset.
1650 *
1651 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
1652 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1653 * it's a GFP_KERNEL allocation, and all nodes in the current task's
1654 * mems_allowed came up empty on the first pass over the zonelist.
1655 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1656 * short of memory, might require taking the cpuset_sem semaphore.
1657 *
1658 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1659 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
1660 * hardwall cpusets - no allocation on a node outside the cpuset is
1661 * allowed (unless in interrupt, of course).
1662 *
1663 * The second loop doesn't even call here for GFP_ATOMIC requests
1664 * (if the __alloc_pages() local variable 'wait' is set). That check
1665 * and the checks below have the combined effect in the second loop of
1666 * the __alloc_pages() routine that:
1667 * in_interrupt - any node ok (current task context irrelevant)
1668 * GFP_ATOMIC - any node ok
1669 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
1670 * GFP_USER - only nodes in the current task's mems_allowed ok.
1671 **/
1672
1673int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1674{
1675 int node; /* node that zone z is on */
1676 const struct cpuset *cs; /* current cpuset ancestors */
1677 int allowed = 1; /* is allocation in zone z allowed? */
1678
1679 if (in_interrupt())
1680 return 1;
1681 node = z->zone_pgdat->node_id;
1682 if (node_isset(node, current->mems_allowed))
1683 return 1;
1684 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1685 return 0;
1686
1687 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1688 cpuset_down(&cpuset_sem);
1689 cs = current->cpuset;
1690 if (!cs)
1691 goto done; /* current task exiting */
1692 cs = nearest_exclusive_ancestor(cs);
1693 allowed = node_isset(node, cs->mems_allowed);
1694done:
1695 cpuset_up(&cpuset_sem);
1696 return allowed;
1697}
1698
1699/**
1700 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1701 * @p: pointer to task_struct of some other task.
1702 *
1703 * Description: Return true if the nearest mem_exclusive ancestor
1704 * cpusets of tasks @p and current overlap. Used by oom killer to
1705 * determine if task @p's memory usage might impact the memory
1706 * available to the current task.
1707 *
1708 * Acquires cpuset_sem - not suitable for calling from a fast path.
1709 **/
1710
1711int cpuset_excl_nodes_overlap(const struct task_struct *p)
1622{ 1712{
1623 return in_interrupt() || 1713 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1624 node_isset(z->zone_pgdat->node_id, current->mems_allowed); 1714 int overlap = 0; /* do cpusets overlap? */
1715
1716 cpuset_down(&cpuset_sem);
1717 cs1 = current->cpuset;
1718 if (!cs1)
1719 goto done; /* current task exiting */
1720 cs2 = p->cpuset;
1721 if (!cs2)
1722 goto done; /* task p is exiting */
1723 cs1 = nearest_exclusive_ancestor(cs1);
1724 cs2 = nearest_exclusive_ancestor(cs2);
1725 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1726done:
1727 cpuset_up(&cpuset_sem);
1728
1729 return overlap;
1625} 1730}
1626 1731
1627/* 1732/*
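The kernel-doc above boils down to a two-pass policy: the first scan of the zonelist is hardwalled to current->mems_allowed, and only a second, GFP_KERNEL-only pass may spill into the nearest mem_exclusive ancestor. A minimal sketch of that call pattern follows; try_alloc_from() is a hypothetical stand-in for the real per-zone allocation path in mm/page_alloc.c, so this only illustrates the order of checks, not the allocator itself.

#include <linux/mmzone.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *sketch_alloc(gfp_t gfp_mask, struct zonelist *zl)
{
        struct zone **z;
        struct page *page;

        /* Pass 1: hardwall - only nodes in current->mems_allowed. */
        for (z = zl->zones; *z; z++) {
                if (!cpuset_zone_allowed(*z, gfp_mask | __GFP_HARDWALL))
                        continue;
                page = try_alloc_from(*z, gfp_mask);    /* hypothetical helper */
                if (page)
                        return page;
        }

        /*
         * Pass 2: GFP_KERNEL may escape to the nearest mem_exclusive
         * ancestor.  For GFP_USER, which already carries __GFP_HARDWALL,
         * this pass is no more permissive than the first.
         */
        for (z = zl->zones; *z; z++) {
                if (!cpuset_zone_allowed(*z, gfp_mask))
                        continue;
                page = try_alloc_from(*z, gfp_mask);    /* hypothetical helper */
                if (page)
                        return page;
        }
        return NULL;
}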
@@ -1642,7 +1747,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1642 return -ENOMEM; 1747 return -ENOMEM;
1643 1748
1644 tsk = m->private; 1749 tsk = m->private;
1645 down(&cpuset_sem); 1750 cpuset_down(&cpuset_sem);
1646 task_lock(tsk); 1751 task_lock(tsk);
1647 cs = tsk->cpuset; 1752 cs = tsk->cpuset;
1648 task_unlock(tsk); 1753 task_unlock(tsk);
@@ -1657,7 +1762,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1657 seq_puts(m, buf); 1762 seq_puts(m, buf);
1658 seq_putc(m, '\n'); 1763 seq_putc(m, '\n');
1659out: 1764out:
1660 up(&cpuset_sem); 1765 cpuset_up(&cpuset_sem);
1661 kfree(buf); 1766 kfree(buf);
1662 return retval; 1767 return retval;
1663} 1768}
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f21..3b25b182d2be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,25 @@ EXPORT_SYMBOL(daemonize);
368static inline void close_files(struct files_struct * files) 368static inline void close_files(struct files_struct * files)
369{ 369{
370 int i, j; 370 int i, j;
371 struct fdtable *fdt;
371 372
372 j = 0; 373 j = 0;
374
375 /*
376 * It is safe to dereference the fd table without RCU or
377 * ->file_lock because this is the last reference to the
378 * files structure.
379 */
380 fdt = files_fdtable(files);
373 for (;;) { 381 for (;;) {
374 unsigned long set; 382 unsigned long set;
375 i = j * __NFDBITS; 383 i = j * __NFDBITS;
376 if (i >= files->max_fdset || i >= files->max_fds) 384 if (i >= fdt->max_fdset || i >= fdt->max_fds)
377 break; 385 break;
378 set = files->open_fds->fds_bits[j++]; 386 set = fdt->open_fds->fds_bits[j++];
379 while (set) { 387 while (set) {
380 if (set & 1) { 388 if (set & 1) {
381 struct file * file = xchg(&files->fd[i], NULL); 389 struct file * file = xchg(&fdt->fd[i], NULL);
382 if (file) 390 if (file)
383 filp_close(file, files); 391 filp_close(file, files);
384 } 392 }
@@ -403,18 +411,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
403 411
404void fastcall put_files_struct(struct files_struct *files) 412void fastcall put_files_struct(struct files_struct *files)
405{ 413{
414 struct fdtable *fdt;
415
406 if (atomic_dec_and_test(&files->count)) { 416 if (atomic_dec_and_test(&files->count)) {
407 close_files(files); 417 close_files(files);
408 /* 418 /*
409 * Free the fd and fdset arrays if we expanded them. 419 * Free the fd and fdset arrays if we expanded them.
420 * If the fdtable was embedded, pass files for freeing
421 * at the end of the RCU grace period. Otherwise,
422 * you can free files immediately.
410 */ 423 */
411 if (files->fd != &files->fd_array[0]) 424 fdt = files_fdtable(files);
412 free_fd_array(files->fd, files->max_fds); 425 if (fdt == &files->fdtab)
413 if (files->max_fdset > __FD_SETSIZE) { 426 fdt->free_files = files;
414 free_fdset(files->open_fds, files->max_fdset); 427 else
415 free_fdset(files->close_on_exec, files->max_fdset); 428 kmem_cache_free(files_cachep, files);
416 } 429 free_fdtable(fdt);
417 kmem_cache_free(files_cachep, files);
418 } 430 }
419} 431}
420 432
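Now that the descriptor table hangs off an RCU-published pointer, lockless readers are expected to bracket their access with rcu_read_lock() and re-fetch the pointer through files_fdtable() every time, instead of caching the old open-coded fields. A minimal read-side sketch, assuming files_fdtable() hides the rcu_dereference() (which the rcu_assign_pointer() calls added to kernel/fork.c below suggest):

#include <linux/file.h>
#include <linux/rcupdate.h>

/* Count a task's open descriptors without taking ->file_lock. */
static int count_open_fds(struct files_struct *files)
{
        struct fdtable *fdt;
        int i, count = 0;

        rcu_read_lock();
        fdt = files_fdtable(files);     /* re-fetch under RCU, never cache */
        for (i = 0; i < fdt->max_fdset; i++)
                if (FD_ISSET(i, fdt->open_fds))
                        count++;
        rcu_read_unlock();
        return count;
}

put_files_struct() itself needs none of this because, as the new comment in close_files() notes, the final reference holder cannot race with anybody.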
@@ -831,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code)
831 group_dead = atomic_dec_and_test(&tsk->signal->live); 843 group_dead = atomic_dec_and_test(&tsk->signal->live);
832 if (group_dead) { 844 if (group_dead) {
833 del_timer_sync(&tsk->signal->real_timer); 845 del_timer_sync(&tsk->signal->real_timer);
846 exit_itimers(tsk->signal);
834 acct_process(code); 847 acct_process(code);
835 } 848 }
836 exit_mm(tsk); 849 exit_mm(tsk);
@@ -1191,7 +1204,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
1191 1204
1192 exit_code = p->exit_code; 1205 exit_code = p->exit_code;
1193 if (unlikely(!exit_code) || 1206 if (unlikely(!exit_code) ||
1194 unlikely(p->state > TASK_STOPPED)) 1207 unlikely(p->state & TASK_TRACED))
1195 goto bail_ref; 1208 goto bail_ref;
1196 return wait_noreap_copyout(p, pid, uid, 1209 return wait_noreap_copyout(p, pid, uid,
1197 why, (exit_code << 8) | 0x7f, 1210 why, (exit_code << 8) | 0x7f,
diff --git a/kernel/fork.c b/kernel/fork.c
index 7e1ead9a6ba4..280bd44ac441 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/jiffies.h> 36#include <linux/jiffies.h>
37#include <linux/futex.h> 37#include <linux/futex.h>
38#include <linux/rcupdate.h>
38#include <linux/ptrace.h> 39#include <linux/ptrace.h>
39#include <linux/mount.h> 40#include <linux/mount.h>
40#include <linux/audit.h> 41#include <linux/audit.h>
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
176 177
177 /* One for us, one for whoever does the "release_task()" (usually parent) */ 178 /* One for us, one for whoever does the "release_task()" (usually parent) */
178 atomic_set(&tsk->usage,2); 179 atomic_set(&tsk->usage,2);
180 atomic_set(&tsk->fs_excl, 0);
179 return tsk; 181 return tsk;
180} 182}
181 183
@@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
564 return 0; 566 return 0;
565} 567}
566 568
567static int count_open_files(struct files_struct *files, int size) 569static int count_open_files(struct fdtable *fdt)
568{ 570{
571 int size = fdt->max_fdset;
569 int i; 572 int i;
570 573
571 /* Find the last open fd */ 574 /* Find the last open fd */
572 for (i = size/(8*sizeof(long)); i > 0; ) { 575 for (i = size/(8*sizeof(long)); i > 0; ) {
573 if (files->open_fds->fds_bits[--i]) 576 if (fdt->open_fds->fds_bits[--i])
574 break; 577 break;
575 } 578 }
576 i = (i+1) * 8 * sizeof(long); 579 i = (i+1) * 8 * sizeof(long);
577 return i; 580 return i;
578} 581}
579 582
583static struct files_struct *alloc_files(void)
584{
585 struct files_struct *newf;
586 struct fdtable *fdt;
587
588 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
589 if (!newf)
590 goto out;
591
592 atomic_set(&newf->count, 1);
593
594 spin_lock_init(&newf->file_lock);
595 fdt = &newf->fdtab;
596 fdt->next_fd = 0;
597 fdt->max_fds = NR_OPEN_DEFAULT;
598 fdt->max_fdset = __FD_SETSIZE;
599 fdt->close_on_exec = &newf->close_on_exec_init;
600 fdt->open_fds = &newf->open_fds_init;
601 fdt->fd = &newf->fd_array[0];
602 INIT_RCU_HEAD(&fdt->rcu);
603 fdt->free_files = NULL;
604 fdt->next = NULL;
605 rcu_assign_pointer(newf->fdt, fdt);
606out:
607 return newf;
608}
609
580static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 610static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
581{ 611{
582 struct files_struct *oldf, *newf; 612 struct files_struct *oldf, *newf;
583 struct file **old_fds, **new_fds; 613 struct file **old_fds, **new_fds;
584 int open_files, size, i, error = 0, expand; 614 int open_files, size, i, error = 0, expand;
615 struct fdtable *old_fdt, *new_fdt;
585 616
586 /* 617 /*
587 * A background process may not have any files ... 618 * A background process may not have any files ...
@@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
602 */ 633 */
603 tsk->files = NULL; 634 tsk->files = NULL;
604 error = -ENOMEM; 635 error = -ENOMEM;
605 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 636 newf = alloc_files();
606 if (!newf) 637 if (!newf)
607 goto out; 638 goto out;
608 639
609 atomic_set(&newf->count, 1);
610
611 spin_lock_init(&newf->file_lock);
612 newf->next_fd = 0;
613 newf->max_fds = NR_OPEN_DEFAULT;
614 newf->max_fdset = __FD_SETSIZE;
615 newf->close_on_exec = &newf->close_on_exec_init;
616 newf->open_fds = &newf->open_fds_init;
617 newf->fd = &newf->fd_array[0];
618
619 spin_lock(&oldf->file_lock); 640 spin_lock(&oldf->file_lock);
620 641 old_fdt = files_fdtable(oldf);
621 open_files = count_open_files(oldf, oldf->max_fdset); 642 new_fdt = files_fdtable(newf);
643 size = old_fdt->max_fdset;
644 open_files = count_open_files(old_fdt);
622 expand = 0; 645 expand = 0;
623 646
624 /* 647 /*
625 * Check whether we need to allocate a larger fd array or fd set. 648 * Check whether we need to allocate a larger fd array or fd set.
626 * Note: we're not a clone task, so the open count won't change. 649 * Note: we're not a clone task, so the open count won't change.
627 */ 650 */
628 if (open_files > newf->max_fdset) { 651 if (open_files > new_fdt->max_fdset) {
629 newf->max_fdset = 0; 652 new_fdt->max_fdset = 0;
630 expand = 1; 653 expand = 1;
631 } 654 }
632 if (open_files > newf->max_fds) { 655 if (open_files > new_fdt->max_fds) {
633 newf->max_fds = 0; 656 new_fdt->max_fds = 0;
634 expand = 1; 657 expand = 1;
635 } 658 }
636 659
@@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
642 spin_unlock(&newf->file_lock); 665 spin_unlock(&newf->file_lock);
643 if (error < 0) 666 if (error < 0)
644 goto out_release; 667 goto out_release;
668 new_fdt = files_fdtable(newf);
669 /*
670 * Reacquire the oldf lock and a pointer to its fd table;
671 * it may have grown a new, bigger fd table in the meantime,
672 * so we need the latest pointer.
673 */
645 spin_lock(&oldf->file_lock); 674 spin_lock(&oldf->file_lock);
675 old_fdt = files_fdtable(oldf);
646 } 676 }
647 677
648 old_fds = oldf->fd; 678 old_fds = old_fdt->fd;
649 new_fds = newf->fd; 679 new_fds = new_fdt->fd;
650 680
651 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); 681 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
652 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); 682 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
653 683
654 for (i = open_files; i != 0; i--) { 684 for (i = open_files; i != 0; i--) {
655 struct file *f = *old_fds++; 685 struct file *f = *old_fds++;
@@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
662 * is partway through open(). So make sure that this 692 * is partway through open(). So make sure that this
663 * fd is available to the new process. 693 * fd is available to the new process.
664 */ 694 */
665 FD_CLR(open_files - i, newf->open_fds); 695 FD_CLR(open_files - i, new_fdt->open_fds);
666 } 696 }
667 *new_fds++ = f; 697 rcu_assign_pointer(*new_fds++, f);
668 } 698 }
669 spin_unlock(&oldf->file_lock); 699 spin_unlock(&oldf->file_lock);
670 700
671 /* compute the remainder to be cleared */ 701 /* compute the remainder to be cleared */
672 size = (newf->max_fds - open_files) * sizeof(struct file *); 702 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
673 703
674 /* This is long word aligned thus could use an optimized version */ 704 /* This is long word aligned thus could use an optimized version */
675 memset(new_fds, 0, size); 705 memset(new_fds, 0, size);
676 706
677 if (newf->max_fdset > open_files) { 707 if (new_fdt->max_fdset > open_files) {
678 int left = (newf->max_fdset-open_files)/8; 708 int left = (new_fdt->max_fdset-open_files)/8;
679 int start = open_files / (8 * sizeof(unsigned long)); 709 int start = open_files / (8 * sizeof(unsigned long));
680 710
681 memset(&newf->open_fds->fds_bits[start], 0, left); 711 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
682 memset(&newf->close_on_exec->fds_bits[start], 0, left); 712 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
683 } 713 }
684 714
685 tsk->files = newf; 715 tsk->files = newf;
@@ -688,9 +718,9 @@ out:
688 return error; 718 return error;
689 719
690out_release: 720out_release:
691 free_fdset (newf->close_on_exec, newf->max_fdset); 721 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
692 free_fdset (newf->open_fds, newf->max_fdset); 722 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
693 free_fd_array(newf->fd, newf->max_fds); 723 free_fd_array(new_fdt->fd, new_fdt->max_fds);
694 kmem_cache_free(files_cachep, newf); 724 kmem_cache_free(files_cachep, newf);
695 goto out; 725 goto out;
696} 726}
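Pieced together from alloc_files() and the field accesses in this hunk, the split looks roughly like the declarations below. Field order, exact types and anything not visible in this diff are guesses rather than the real include/linux/file.h definitions; the point is that everything that may be reallocated on expansion now sits behind one RCU-published struct fdtable pointer, while files_struct keeps an embedded table for the common NR_OPEN_DEFAULT case.

struct fdtable {
        unsigned int max_fds;
        int max_fdset;
        int next_fd;
        struct file **fd;               /* current fd array */
        fd_set *close_on_exec;
        fd_set *open_fds;
        struct rcu_head rcu;            /* deferred freeing of old tables */
        struct files_struct *free_files;
        struct fdtable *next;
};

struct files_struct {
        atomic_t count;
        spinlock_t file_lock;
        struct fdtable *fdt;            /* RCU-published, may point at fdtab */
        struct fdtable fdtab;           /* embedded default table */
        fd_set close_on_exec_init;
        fd_set open_fds_init;
        struct file *fd_array[NR_OPEN_DEFAULT];
};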
@@ -818,7 +848,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
818{ 848{
819 unsigned long new_flags = p->flags; 849 unsigned long new_flags = p->flags;
820 850
821 new_flags &= ~PF_SUPERPRIV; 851 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
822 new_flags |= PF_FORKNOEXEC; 852 new_flags |= PF_FORKNOEXEC;
823 if (!(clone_flags & CLONE_PTRACE)) 853 if (!(clone_flags & CLONE_PTRACE))
824 p->ptrace = 0; 854 p->ptrace = 0;
@@ -1032,7 +1062,8 @@ static task_t *copy_process(unsigned long clone_flags,
1032 * parent's CPU). This avoids a lot of nasty races. 1062 * parent's CPU). This avoids a lot of nasty races.
1033 */ 1063 */
1034 p->cpus_allowed = current->cpus_allowed; 1064 p->cpus_allowed = current->cpus_allowed;
1035 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) 1065 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1066 !cpu_online(task_cpu(p))))
1036 set_task_cpu(p, smp_processor_id()); 1067 set_task_cpu(p, smp_processor_id());
1037 1068
1038 /* 1069 /*
@@ -1115,6 +1146,9 @@ static task_t *copy_process(unsigned long clone_flags,
1115 __get_cpu_var(process_counts)++; 1146 __get_cpu_var(process_counts)++;
1116 } 1147 }
1117 1148
1149 if (!current->signal->tty && p->signal->tty)
1150 p->signal->tty = NULL;
1151
1118 nr_threads++; 1152 nr_threads++;
1119 total_forks++; 1153 total_forks++;
1120 write_unlock_irq(&tasklist_lock); 1154 write_unlock_irq(&tasklist_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/syscalls.h> 41#include <linux/syscalls.h>
42#include <linux/signal.h> 42#include <linux/signal.h>
43#include <asm/futex.h>
43 44
44#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 45#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
45 46
@@ -327,6 +328,118 @@ out:
327} 328}
328 329
329/* 330/*
331 * Wake up all waiters hashed on the physical page that is mapped
332 * to this virtual address:
333 */
334static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
335{
336 union futex_key key1, key2;
337 struct futex_hash_bucket *bh1, *bh2;
338 struct list_head *head;
339 struct futex_q *this, *next;
340 int ret, op_ret, attempt = 0;
341
342retryfull:
343 down_read(&current->mm->mmap_sem);
344
345 ret = get_futex_key(uaddr1, &key1);
346 if (unlikely(ret != 0))
347 goto out;
348 ret = get_futex_key(uaddr2, &key2);
349 if (unlikely(ret != 0))
350 goto out;
351
352 bh1 = hash_futex(&key1);
353 bh2 = hash_futex(&key2);
354
355retry:
356 if (bh1 < bh2)
357 spin_lock(&bh1->lock);
358 spin_lock(&bh2->lock);
359 if (bh1 > bh2)
360 spin_lock(&bh1->lock);
361
362 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
363 if (unlikely(op_ret < 0)) {
364 int dummy;
365
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not
373 * enough, we need to handle the fault ourselves, while
374 * still holding the mmap_sem. */
375 if (attempt++) {
376 struct vm_area_struct * vma;
377 struct mm_struct *mm = current->mm;
378
379 ret = -EFAULT;
380 if (attempt >= 2 ||
381 !(vma = find_vma(mm, uaddr2)) ||
382 vma->vm_start > uaddr2 ||
383 !(vma->vm_flags & VM_WRITE))
384 goto out;
385
386 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
387 case VM_FAULT_MINOR:
388 current->min_flt++;
389 break;
390 case VM_FAULT_MAJOR:
391 current->maj_flt++;
392 break;
393 default:
394 goto out;
395 }
396 goto retry;
397 }
398
399 /* If we would have faulted, release mmap_sem,
400 * fault it in and start all over again. */
401 up_read(&current->mm->mmap_sem);
402
403 ret = get_user(dummy, (int __user *)uaddr2);
404 if (ret)
405 return ret;
406
407 goto retryfull;
408 }
409
410 head = &bh1->chain;
411
412 list_for_each_entry_safe(this, next, head, list) {
413 if (match_futex (&this->key, &key1)) {
414 wake_futex(this);
415 if (++ret >= nr_wake)
416 break;
417 }
418 }
419
420 if (op_ret > 0) {
421 head = &bh2->chain;
422
423 op_ret = 0;
424 list_for_each_entry_safe(this, next, head, list) {
425 if (match_futex (&this->key, &key2)) {
426 wake_futex(this);
427 if (++op_ret >= nr_wake2)
428 break;
429 }
430 }
431 ret += op_ret;
432 }
433
434 spin_unlock(&bh1->lock);
435 if (bh1 != bh2)
436 spin_unlock(&bh2->lock);
437out:
438 up_read(&current->mm->mmap_sem);
439 return ret;
440}
441
442/*
330 * Requeue all waiters hashed on one physical page to another 443 * Requeue all waiters hashed on one physical page to another
331 * physical page. 444 * physical page.
332 */ 445 */
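futex_wake_op() above must hold two hash-bucket locks at once, and it dodges an ABBA deadlock by always taking the lower-addressed lock first, taking only one lock when both keys hash to the same bucket. The same idiom in isolation, as a small sketch on plain spinlocks:

#include <linux/spinlock.h>

static void double_lock(spinlock_t *a, spinlock_t *b)
{
        if (a < b) {
                spin_lock(a);
                spin_lock(b);
        } else if (a > b) {
                spin_lock(b);
                spin_lock(a);
        } else {
                spin_lock(a);           /* same bucket: lock it once */
        }
}

The unlock side mirrors this, dropping the second lock only when the buckets differ, exactly as the spin_unlock() pairs in the function above do.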
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
673 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 786 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
674 787
675 if (signal) { 788 if (signal) {
676 int err;
677 err = f_setown(filp, current->pid, 1); 789 err = f_setown(filp, current->pid, 1);
678 if (err < 0) { 790 if (err < 0) {
679 put_unused_fd(ret); 791 goto error;
680 put_filp(filp);
681 ret = err;
682 goto out;
683 } 792 }
684 filp->f_owner.signum = signal; 793 filp->f_owner.signum = signal;
685 } 794 }
686 795
687 q = kmalloc(sizeof(*q), GFP_KERNEL); 796 q = kmalloc(sizeof(*q), GFP_KERNEL);
688 if (!q) { 797 if (!q) {
689 put_unused_fd(ret); 798 err = -ENOMEM;
690 put_filp(filp); 799 goto error;
691 ret = -ENOMEM;
692 goto out;
693 } 800 }
694 801
695 down_read(&current->mm->mmap_sem); 802 down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
697 804
698 if (unlikely(err != 0)) { 805 if (unlikely(err != 0)) {
699 up_read(&current->mm->mmap_sem); 806 up_read(&current->mm->mmap_sem);
700 put_unused_fd(ret);
701 put_filp(filp);
702 kfree(q); 807 kfree(q);
703 return err; 808 goto error;
704 } 809 }
705 810
706 /* 811 /*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
716 fd_install(ret, filp); 821 fd_install(ret, filp);
717out: 822out:
718 return ret; 823 return ret;
824error:
825 put_unused_fd(ret);
826 put_filp(filp);
827 ret = err;
828 goto out;
719} 829}
720 830
721long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 831long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
740 case FUTEX_CMP_REQUEUE: 850 case FUTEX_CMP_REQUEUE:
741 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 851 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
742 break; 852 break;
853 case FUTEX_WAKE_OP:
854 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
855 break;
743 default: 856 default:
744 ret = -ENOSYS; 857 ret = -ENOSYS;
745 } 858 }
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
39 struct list_head *tmp; 39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new; 40 struct inter_module_entry *ime, *ime_new;
41 41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { 42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */ 43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR 44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", 45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
47 kmalloc_failed = 1; 47 kmalloc_failed = 1;
48 return; 48 return;
49 } 49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name; 50 ime_new->im_name = im_name;
52 ime_new->owner = owner; 51 ime_new->owner = owner;
53 ime_new->userdata = userdata; 52 ime_new->userdata = userdata;
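This conversion, like the matching one in kernel/params.c further down, leans on kzalloc() behaving as a kmalloc() followed by a memset() of the whole object. A sketch of that equivalence, for readers who have not met the helper:

#include <linux/slab.h>
#include <linux/string.h>

static inline void *kzalloc_sketch(size_t size, gfp_t flags)
{
        void *p = kmalloc(size, flags);

        if (p)
                memset(p, 0, size);     /* kzalloc() zeroes on the caller's behalf */
        return p;
}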
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
111 unsigned int status; 111 unsigned int status;
112 112
113 kstat_this_cpu.irqs[irq]++; 113 kstat_this_cpu.irqs[irq]++;
114 if (desc->status & IRQ_PER_CPU) { 114 if (CHECK_IRQ_PER_CPU(desc->status)) {
115 irqreturn_t action_ret; 115 irqreturn_t action_ret;
116 116
117 /* 117 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; 19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20 20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
21/** 25/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * 27 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
19 */ 19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 21
22void __attribute__((weak)) 22#ifdef CONFIG_GENERIC_PENDING_IRQ
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 23void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 /*
26 * Save these away for later use. Re-program when the
27 * interrupt is pending
28 */
29 set_pending_irq(irq, mask_val);
30}
31#else
32void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{ 33{
25 irq_affinity[irq] = mask_val; 34 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].handler->set_affinity(irq, mask_val);
27} 36}
37#endif
28 38
29static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data) 40 int count, int *eof, void *data)
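With CONFIG_GENERIC_PENDING_IRQ the /proc write path above no longer pokes the interrupt controller directly; it only records the request through set_pending_irq(). The other half of the scheme, re-programming the controller from the interrupt path once it is safe, is not part of this hunk, so the sketch below uses a hypothetical apply_pending_affinity() only to show the intended shape.

#include <linux/irq.h>

static void apply_pending_affinity(unsigned int irq)    /* hypothetical */
{
        cpumask_t mask = pending_irq_cpumask[irq];

        if (cpus_empty(mask))
                return;                                 /* nothing was requested */

        irq_affinity[irq] = mask;
        irq_desc[irq].handler->set_affinity(irq, mask);
        cpus_clear(pending_irq_cpumask[irq]);
}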
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 179baafcdd96..64ab045c3d9d 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -36,7 +36,7 @@
36 * struct kfifo with kfree(). 36 * struct kfifo with kfree().
37 */ 37 */
38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
39 unsigned int __nocast gfp_mask, spinlock_t *lock) 39 gfp_t gfp_mask, spinlock_t *lock)
40{ 40{
41 struct kfifo *fifo; 41 struct kfifo *fifo;
42 42
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init);
64 * 64 *
65 * The size will be rounded-up to a power of 2. 65 * The size will be rounded-up to a power of 2.
66 */ 66 */
67struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) 67struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
68{ 68{
69 unsigned char *buffer; 69 unsigned char *buffer;
70 struct kfifo *ret; 70 struct kfifo *ret;
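Only the type of the allocation-flags parameter changes here, so callers keep the same shape. A usage sketch, assuming the byte-oriented kfifo_put()/kfifo_get() helpers of this kernel generation (they take the spinlock passed at creation time) and the ERR_PTR-style failure return of kfifo_alloc():

#include <linux/kfifo.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/err.h>

static spinlock_t demo_lock = SPIN_LOCK_UNLOCKED;

static int kfifo_demo(void)
{
        static unsigned char msg[] = "ping";
        unsigned char out[4];
        struct kfifo *fifo;

        fifo = kfifo_alloc(64, GFP_KERNEL, &demo_lock); /* rounded up to a power of 2 */
        if (IS_ERR(fifo))
                return PTR_ERR(fifo);

        kfifo_put(fifo, msg, 4);        /* copy bytes in, under demo_lock */
        kfifo_get(fifo, out, 4);        /* copy them back out */
        kfifo_free(fifo);
        return 0;
}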
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
41#include <asm/errno.h> 42#include <asm/errno.h>
42#include <asm/kdebug.h> 43#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
72 * get_insn_slot() - Find a slot on an executable page for an instruction. 73 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones. 74 * We allocate an executable page if there's no room on existing ones.
74 */ 75 */
75kprobe_opcode_t *get_insn_slot(void) 76kprobe_opcode_t __kprobes *get_insn_slot(void)
76{ 77{
77 struct kprobe_insn_page *kip; 78 struct kprobe_insn_page *kip;
78 struct hlist_node *pos; 79 struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
117 return kip->insns; 118 return kip->insns;
118} 119}
119 120
120void free_insn_slot(kprobe_opcode_t *slot) 121void __kprobes free_insn_slot(kprobe_opcode_t *slot)
121{ 122{
122 struct kprobe_insn_page *kip; 123 struct kprobe_insn_page *kip;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
152} 153}
153 154
154/* Locks kprobe: irqs must be disabled */ 155/* Locks kprobe: irqs must be disabled */
155void lock_kprobes(void) 156void __kprobes lock_kprobes(void)
156{ 157{
158 unsigned long flags = 0;
159
160 /* Keep local interrupts from firing right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu; this prevents a
162 * deadlock when we have a kprobe on an ISR routine and a kprobe on a
163 * task routine
164 */
165 local_irq_save(flags);
166
157 spin_lock(&kprobe_lock); 167 spin_lock(&kprobe_lock);
158 kprobe_cpu = smp_processor_id(); 168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
159} 171}
160 172
161void unlock_kprobes(void) 173void __kprobes unlock_kprobes(void)
162{ 174{
175 unsigned long flags = 0;
176
177 /* Keep local interrupts from firing right after we update
178 * kprobe_cpu and before we get a chance to release kprobe_lock;
179 * this prevents a deadlock when we have a kprobe on an ISR routine
180 * and a kprobe on a task routine
181 */
182 local_irq_save(flags);
183
163 kprobe_cpu = NR_CPUS; 184 kprobe_cpu = NR_CPUS;
164 spin_unlock(&kprobe_lock); 185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
165} 188}
166 189
167/* You have to be holding the kprobe_lock */ 190/* You have to be holding the kprobe_lock */
168struct kprobe *get_kprobe(void *addr) 191struct kprobe __kprobes *get_kprobe(void *addr)
169{ 192{
170 struct hlist_head *head; 193 struct hlist_head *head;
171 struct hlist_node *node; 194 struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
183 * Aggregate handlers for multiple kprobes support - these handlers 206 * Aggregate handlers for multiple kprobes support - these handlers
184 * take care of invoking the individual kprobe handlers on p->list 207 * take care of invoking the individual kprobe handlers on p->list
185 */ 208 */
186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 209static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
187{ 210{
188 struct kprobe *kp; 211 struct kprobe *kp;
189 212
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
198 return 0; 221 return 0;
199} 222}
200 223
201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 224static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
202 unsigned long flags) 225 unsigned long flags)
203{ 226{
204 struct kprobe *kp; 227 struct kprobe *kp;
205 228
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
213 return; 236 return;
214} 237}
215 238
216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr) 240 int trapnr)
218{ 241{
219 /* 242 /*
220 * if we faulted "during" the execution of a user specified 243 * if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
227 return 0; 250 return 0;
228} 251}
229 252
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{ 254{
232 struct kprobe *kp = curr_kprobe; 255 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) { 256 if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
240 return 0; 263 return 0;
241} 264}
242 265
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) 266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
244{ 267{
245 struct hlist_node *node; 268 struct hlist_node *node;
246 struct kretprobe_instance *ri; 269 struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
249 return NULL; 272 return NULL;
250} 273}
251 274
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) 275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp)
253{ 277{
254 struct hlist_node *node; 278 struct hlist_node *node;
255 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
258 return NULL; 282 return NULL;
259} 283}
260 284
261void add_rp_inst(struct kretprobe_instance *ri) 285void __kprobes add_rp_inst(struct kretprobe_instance *ri)
262{ 286{
263 /* 287 /*
264 * Remove rp inst off the free list - 288 * Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 300 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277} 301}
278 302
279void recycle_rp_inst(struct kretprobe_instance *ri) 303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
280{ 304{
281 /* remove rp inst off the rprobe_inst_table */ 305 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist); 306 hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
291 kfree(ri); 315 kfree(ri);
292} 316}
293 317
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) 318struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
295{ 319{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 320 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297} 321}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
302 * instances associated with this task. These left over instances represent 326 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return. 327 * probed functions that have been called but will never return.
304 */ 328 */
305void kprobe_flush_task(struct task_struct *tk) 329void __kprobes kprobe_flush_task(struct task_struct *tk)
306{ 330{
307 struct kretprobe_instance *ri; 331 struct kretprobe_instance *ri;
308 struct hlist_head *head; 332 struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
322 * This kprobe pre_handler is registered with every kretprobe. When probe 346 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe. 347 * hits it will set up the return probe.
324 */ 348 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 349static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs)
326{ 351{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 352 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328 353
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
353* Add the new probe to old_p->list. Fail if this is the 378* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist 379* second jprobe at the address - two jprobes can't coexist
355*/ 380*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 381static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{ 382{
358 struct kprobe *kp; 383 struct kprobe *kp;
359 384
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
395 * the intricacies 420 * the intricacies
396 * TODO: Move kcalloc outside the spinlock 421 * TODO: Move kcalloc outside the spinlock
397 */ 422 */
398static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) 423static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p)
399{ 425{
400 int ret = 0; 426 int ret = 0;
401 struct kprobe *ap; 427 struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
434 spin_unlock_irqrestore(&kprobe_lock, flags); 460 spin_unlock_irqrestore(&kprobe_lock, flags);
435} 461}
436 462
437int register_kprobe(struct kprobe *p) 463static int __kprobes in_kprobes_functions(unsigned long addr)
464{
465 if (addr >= (unsigned long)__kprobes_text_start
466 && addr < (unsigned long)__kprobes_text_end)
467 return -EINVAL;
468 return 0;
469}
470
471int __kprobes register_kprobe(struct kprobe *p)
438{ 472{
439 int ret = 0; 473 int ret = 0;
440 unsigned long flags = 0; 474 unsigned long flags = 0;
441 struct kprobe *old_p; 475 struct kprobe *old_p;
442 476
443 if ((ret = arch_prepare_kprobe(p)) != 0) { 477 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
478 return ret;
479 if ((ret = arch_prepare_kprobe(p)) != 0)
444 goto rm_kprobe; 480 goto rm_kprobe;
445 } 481
446 spin_lock_irqsave(&kprobe_lock, flags); 482 spin_lock_irqsave(&kprobe_lock, flags);
447 old_p = get_kprobe(p->addr); 483 old_p = get_kprobe(p->addr);
448 p->nmissed = 0; 484 p->nmissed = 0;
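register_kprobe() now refuses probes placed inside the __kprobes text section; handlers and the kprobes machinery itself are annotated with __kprobes precisely so a probe cannot recurse into the code that services it. A minimal, hedged usage sketch follows; the probed function and the module boilerplate are illustrative, not part of this patch.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static noinline int demo_target(int x)          /* illustrative probe target */
{
        return x + 1;
}

static int __kprobes demo_pre(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
        return 0;
}

static struct kprobe demo_kp;

static int __init demo_init(void)
{
        int ret;

        demo_kp.addr = (kprobe_opcode_t *)demo_target;
        demo_kp.pre_handler = demo_pre;
        ret = register_kprobe(&demo_kp);        /* would be -EINVAL inside __kprobes text */
        if (ret < 0)
                return ret;
        demo_target(1);                         /* trigger the probe once */
        return 0;
}

static void __exit demo_exit(void)
{
        unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");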
@@ -466,7 +502,7 @@ rm_kprobe:
466 return ret; 502 return ret;
467} 503}
468 504
469void unregister_kprobe(struct kprobe *p) 505void __kprobes unregister_kprobe(struct kprobe *p)
470{ 506{
471 unsigned long flags; 507 unsigned long flags;
472 struct kprobe *old_p; 508 struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
487 .priority = 0x7fffffff /* we need to notified first */ 523 .priority = 0x7fffffff /* we need to notified first */
488}; 524};
489 525
490int register_jprobe(struct jprobe *jp) 526int __kprobes register_jprobe(struct jprobe *jp)
491{ 527{
492 /* Todo: Verify probepoint is a function entry point */ 528 /* Todo: Verify probepoint is a function entry point */
493 jp->kp.pre_handler = setjmp_pre_handler; 529 jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
496 return register_kprobe(&jp->kp); 532 return register_kprobe(&jp->kp);
497} 533}
498 534
499void unregister_jprobe(struct jprobe *jp) 535void __kprobes unregister_jprobe(struct jprobe *jp)
500{ 536{
501 unregister_kprobe(&jp->kp); 537 unregister_kprobe(&jp->kp);
502} 538}
503 539
504#ifdef ARCH_SUPPORTS_KRETPROBES 540#ifdef ARCH_SUPPORTS_KRETPROBES
505 541
506int register_kretprobe(struct kretprobe *rp) 542int __kprobes register_kretprobe(struct kretprobe *rp)
507{ 543{
508 int ret = 0; 544 int ret = 0;
509 struct kretprobe_instance *inst; 545 struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
540 576
541#else /* ARCH_SUPPORTS_KRETPROBES */ 577#else /* ARCH_SUPPORTS_KRETPROBES */
542 578
543int register_kretprobe(struct kretprobe *rp) 579int __kprobes register_kretprobe(struct kretprobe *rp)
544{ 580{
545 return -ENOSYS; 581 return -ENOSYS;
546} 582}
547 583
548#endif /* ARCH_SUPPORTS_KRETPROBES */ 584#endif /* ARCH_SUPPORTS_KRETPROBES */
549 585
550void unregister_kretprobe(struct kretprobe *rp) 586void __kprobes unregister_kretprobe(struct kretprobe *rp)
551{ 587{
552 unsigned long flags; 588 unsigned long flags;
553 struct kretprobe_instance *ri; 589 struct kretprobe_instance *ri;
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..ff5c500ab625 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kernel.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
25#include <linux/elf.h> 26#include <linux/elf.h>
@@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags)
498{ 499{
499 int ret = (flags & O_TRUNC); 500 int ret = (flags & O_TRUNC);
500 if (ret) 501 if (ret)
501 tainted |= TAINT_FORCED_MODULE; 502 add_taint(TAINT_FORCED_MODULE);
502 return ret; 503 return ret;
503} 504}
504#else 505#else
@@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs,
897 if (!(tainted & TAINT_FORCED_MODULE)) { 898 if (!(tainted & TAINT_FORCED_MODULE)) {
898 printk("%s: no version for \"%s\" found: kernel tainted.\n", 899 printk("%s: no version for \"%s\" found: kernel tainted.\n",
899 mod->name, symname); 900 mod->name, symname);
900 tainted |= TAINT_FORCED_MODULE; 901 add_taint(TAINT_FORCED_MODULE);
901 } 902 }
902 return 1; 903 return 1;
903} 904}
@@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license)
1352 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1353 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1353 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1354 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1354 mod->name, license); 1355 mod->name, license);
1355 tainted |= TAINT_PROPRIETARY_MODULE; 1356 add_taint(TAINT_PROPRIETARY_MODULE);
1356 } 1357 }
1357} 1358}
1358 1359
@@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod,
1509 long err = 0; 1510 long err = 0;
1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1511 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1511 struct exception_table_entry *extable; 1512 struct exception_table_entry *extable;
1513 mm_segment_t old_fs;
1512 1514
1513 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1515 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1514 umod, len, uargs); 1516 umod, len, uargs);
@@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod,
1609 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1611 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1610 /* This is allowed: modprobe --force will invalidate it. */ 1612 /* This is allowed: modprobe --force will invalidate it. */
1611 if (!modmagic) { 1613 if (!modmagic) {
1612 tainted |= TAINT_FORCED_MODULE; 1614 add_taint(TAINT_FORCED_MODULE);
1613 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1615 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1614 mod->name); 1616 mod->name);
1615 } else if (!same_magic(modmagic, vermagic)) { 1617 } else if (!same_magic(modmagic, vermagic)) {
@@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod,
1738 (mod->num_gpl_syms && !gplcrcindex)) { 1740 (mod->num_gpl_syms && !gplcrcindex)) {
1739 printk(KERN_WARNING "%s: No versions for exported symbols." 1741 printk(KERN_WARNING "%s: No versions for exported symbols."
1740 " Tainting kernel.\n", mod->name); 1742 " Tainting kernel.\n", mod->name);
1741 tainted |= TAINT_FORCED_MODULE; 1743 add_taint(TAINT_FORCED_MODULE);
1742 } 1744 }
1743#endif 1745#endif
1744 1746
@@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod,
1779 if (err < 0) 1781 if (err < 0)
1780 goto cleanup; 1782 goto cleanup;
1781 1783
1784 /* flush the icache in correct context */
1785 old_fs = get_fs();
1786 set_fs(KERNEL_DS);
1787
1788 /*
1789 * Flush the instruction cache, since we've played with text.
1790 * Do it before processing the module parameters, so the module
1791 * can provide parameter accessor functions of its own.
1792 */
1793 if (mod->module_init)
1794 flush_icache_range((unsigned long)mod->module_init,
1795 (unsigned long)mod->module_init
1796 + mod->init_size);
1797 flush_icache_range((unsigned long)mod->module_core,
1798 (unsigned long)mod->module_core + mod->core_size);
1799
1800 set_fs(old_fs);
1801
1782 mod->args = args; 1802 mod->args = args;
1783 if (obsparmindex) { 1803 if (obsparmindex) {
1784 err = obsolete_params(mod->name, mod->args, 1804 err = obsolete_params(mod->name, mod->args,
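Moving the flush into load_module() keeps the existing get_fs()/set_fs(KERNEL_DS) bracket, which is there because flush_icache_range() on some architectures can go through helpers that check addresses against the current segment limit. The pattern in isolation, as a hedged sketch:

#include <asm/uaccess.h>

static void call_with_kernel_ds(void (*fn)(unsigned long, unsigned long),
                                unsigned long start, unsigned long end)
{
        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);              /* let 'fn' accept kernel addresses */
        fn(start, end);
        set_fs(old_fs);                 /* always restore the previous limit */
}

Doing the flush before parameter parsing matters because, as the comment above says, a module may supply parameter accessor functions that live in the freshly written text.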
@@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod,
1860 const char __user *uargs) 1880 const char __user *uargs)
1861{ 1881{
1862 struct module *mod; 1882 struct module *mod;
1863 mm_segment_t old_fs = get_fs();
1864 int ret = 0; 1883 int ret = 0;
1865 1884
1866 /* Must have permission */ 1885 /* Must have permission */
@@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod,
1878 return PTR_ERR(mod); 1897 return PTR_ERR(mod);
1879 } 1898 }
1880 1899
1881 /* flush the icache in correct context */
1882 set_fs(KERNEL_DS);
1883
1884 /* Flush the instruction cache, since we've played with text */
1885 if (mod->module_init)
1886 flush_icache_range((unsigned long)mod->module_init,
1887 (unsigned long)mod->module_init
1888 + mod->init_size);
1889 flush_icache_range((unsigned long)mod->module_core,
1890 (unsigned long)mod->module_core + mod->core_size);
1891
1892 set_fs(old_fs);
1893
1894 /* Now sew it into the lists. They won't access us, since 1900 /* Now sew it into the lists. They won't access us, since
1895 strong_try_module_get() will fail. */ 1901 strong_try_module_get() will fail. */
1896 stop_machine_run(__link_module, mod, NR_CPUS); 1902 stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..1a8614bac5d5 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -80,8 +80,6 @@ static char *next_arg(char *args, char **param, char **val)
80 int in_quote = 0, quoted = 0; 80 int in_quote = 0, quoted = 0;
81 char *next; 81 char *next;
82 82
83 /* Chew any extra spaces */
84 while (*args == ' ') args++;
85 if (*args == '"') { 83 if (*args == '"') {
86 args++; 84 args++;
87 in_quote = 1; 85 in_quote = 1;
@@ -121,6 +119,10 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i + 1; 119 next = args + i + 1;
122 } else 120 } else
123 next = args + i; 121 next = args + i;
122
123 /* Chew up trailing spaces. */
124 while (*next == ' ')
125 next++;
124 return next; 126 return next;
125} 127}
126 128
@@ -135,6 +137,10 @@ int parse_args(const char *name,
135 137
136 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
137 139
140 /* Chew leading spaces */
141 while (*args == ' ')
142 args++;
143
138 while (*args) { 144 while (*args) {
139 int ret; 145 int ret;
140 146
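The net effect of the two hunks above: leading spaces are chewed once, up front in parse_args(), and next_arg() now swallows the run of spaces after each argument it consumes, so the while (*args) loop always resumes either at the next parameter or at the terminating NUL. A short worked example (the command line is made up, and the quote handling is summarised rather than traced step by step):

/*
 * args = "  foo=1   bar=\"a b\"  "
 *
 * parse_args():  chews the two leading spaces once, before the loop.
 * 1st next_arg(): param = "foo", val = "1"; trailing spaces chewed,
 *                 returns a pointer at the 'b' of "bar=...".
 * 2nd next_arg(): param = "bar", val = "a b" (quotes stripped);
 *                 trailing spaces chewed, returns a pointer at the
 *                 final '\0', which ends the loop.
 */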
@@ -542,8 +548,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
542{ 548{
543 struct module_kobject *mk; 549 struct module_kobject *mk;
544 550
545 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); 551 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
546 memset(mk, 0, sizeof(struct module_kobject)); 552 BUG_ON(!mk);
547 553
548 mk->mod = THIS_MODULE; 554 mk->mod = THIS_MODULE;
549 kobj_set_kset_s(mk, module_subsys); 555 kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ad85d3f0dcc4..bf374fceb39c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
91 * Update expiry time from increment, and increase overrun count, 91 * Update expiry time from increment, and increase overrun count,
92 * given the current clock sample. 92 * given the current clock sample.
93 */ 93 */
94static inline void bump_cpu_timer(struct k_itimer *timer, 94static void bump_cpu_timer(struct k_itimer *timer,
95 union cpu_time_count now) 95 union cpu_time_count now)
96{ 96{
97 int i; 97 int i;
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
110 for (i = 0; incr < delta - incr; i++) 110 for (i = 0; incr < delta - incr; i++)
111 incr = incr << 1; 111 incr = incr << 1;
112 for (; i >= 0; incr >>= 1, i--) { 112 for (; i >= 0; incr >>= 1, i--) {
113 if (delta <= incr) 113 if (delta < incr)
114 continue; 114 continue;
115 timer->it.cpu.expires.sched += incr; 115 timer->it.cpu.expires.sched += incr;
116 timer->it_overrun += 1 << i; 116 timer->it_overrun += 1 << i;
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
129 incr = cputime_add(incr, incr); 129 incr = cputime_add(incr, incr);
130 for (; i >= 0; incr = cputime_halve(incr), i--) { 130 for (; i >= 0; incr = cputime_halve(incr), i--) {
131 if (cputime_le(delta, incr)) 131 if (cputime_lt(delta, incr))
132 continue; 132 continue;
133 timer->it.cpu.expires.cpu = 133 timer->it.cpu.expires.cpu =
134 cputime_add(timer->it.cpu.expires.cpu, incr); 134 cputime_add(timer->it.cpu.expires.cpu, incr);
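bump_cpu_timer() advances the expiry past the current time in whole increments, but rather than stepping one increment at a time it doubles the step until it covers the gap and then walks back down, accumulating the overrun count in power-of-two chunks. The switch from cputime_le()/<= to cputime_lt()/< stops the loop from skipping the final step when the gap is an exact multiple of the increment. The same algorithm on plain integers, as a hedged sketch:

/* Advance *expires past 'now' in whole 'incr' steps, counting the
 * steps taken into *overrun in O(log n) iterations. */
static void bump_sketch(unsigned long long *expires, unsigned long long incr,
                        unsigned long long now, int *overrun)
{
        unsigned long long delta, step = incr;
        int i;

        if (now < *expires || !incr)
                return;
        delta = now + incr - *expires;          /* overshoot just past 'now' */

        /* Don't test (2*step < delta): doubling first could overflow. */
        for (i = 0; step < delta - step; i++)
                step <<= 1;

        for (; i >= 0; step >>= 1, i--) {
                if (delta < step)               /* '<', matching the fix above */
                        continue;
                *expires += step;
                *overrun += 1 << i;
                delta -= step;
        }
}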
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
380int posix_cpu_timer_del(struct k_itimer *timer) 380int posix_cpu_timer_del(struct k_itimer *timer)
381{ 381{
382 struct task_struct *p = timer->it.cpu.task; 382 struct task_struct *p = timer->it.cpu.task;
383 int ret = 0;
383 384
384 if (timer->it.cpu.firing) 385 if (likely(p != NULL)) {
385 return TIMER_RETRY;
386
387 if (unlikely(p == NULL))
388 return 0;
389
390 if (!list_empty(&timer->it.cpu.entry)) {
391 read_lock(&tasklist_lock); 386 read_lock(&tasklist_lock);
392 if (unlikely(p->signal == NULL)) { 387 if (unlikely(p->signal == NULL)) {
393 /* 388 /*
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer)
396 */ 391 */
397 BUG_ON(!list_empty(&timer->it.cpu.entry)); 392 BUG_ON(!list_empty(&timer->it.cpu.entry));
398 } else { 393 } else {
399 /*
400 * Take us off the task's timer list.
401 */
402 spin_lock(&p->sighand->siglock); 394 spin_lock(&p->sighand->siglock);
403 list_del(&timer->it.cpu.entry); 395 if (timer->it.cpu.firing)
396 ret = TIMER_RETRY;
397 else
398 list_del(&timer->it.cpu.entry);
404 spin_unlock(&p->sighand->siglock); 399 spin_unlock(&p->sighand->siglock);
405 } 400 }
406 read_unlock(&tasklist_lock); 401 read_unlock(&tasklist_lock);
402
403 if (!ret)
404 put_task_struct(p);
407 } 405 }
408 put_task_struct(p);
409 406
410 return 0; 407 return ret;
411} 408}
412 409
413/* 410/*
@@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head,
424 cputime_t ptime = cputime_add(utime, stime); 421 cputime_t ptime = cputime_add(utime, stime);
425 422
426 list_for_each_entry_safe(timer, next, head, entry) { 423 list_for_each_entry_safe(timer, next, head, entry) {
427 timer->task = NULL;
428 list_del_init(&timer->entry); 424 list_del_init(&timer->entry);
429 if (cputime_lt(timer->expires.cpu, ptime)) { 425 if (cputime_lt(timer->expires.cpu, ptime)) {
430 timer->expires.cpu = cputime_zero; 426 timer->expires.cpu = cputime_zero;
@@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head,
436 432
437 ++head; 433 ++head;
438 list_for_each_entry_safe(timer, next, head, entry) { 434 list_for_each_entry_safe(timer, next, head, entry) {
439 timer->task = NULL;
440 list_del_init(&timer->entry); 435 list_del_init(&timer->entry);
441 if (cputime_lt(timer->expires.cpu, utime)) { 436 if (cputime_lt(timer->expires.cpu, utime)) {
442 timer->expires.cpu = cputime_zero; 437 timer->expires.cpu = cputime_zero;
@@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head,
448 443
449 ++head; 444 ++head;
450 list_for_each_entry_safe(timer, next, head, entry) { 445 list_for_each_entry_safe(timer, next, head, entry) {
451 timer->task = NULL;
452 list_del_init(&timer->entry); 446 list_del_init(&timer->entry);
453 if (timer->expires.sched < sched_time) { 447 if (timer->expires.sched < sched_time) {
454 timer->expires.sched = 0; 448 timer->expires.sched = 0;
@@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p,
492 struct task_struct *t = p; 486 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live); 487 unsigned int nthreads = atomic_read(&p->signal->live);
494 488
489 if (!nthreads)
490 return;
491
495 switch (clock_idx) { 492 switch (clock_idx) {
496 default: 493 default:
497 BUG(); 494 BUG();
@@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
500 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 497 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
501 nthreads); 498 nthreads);
502 do { 499 do {
503 if (!unlikely(t->exit_state)) { 500 if (!unlikely(t->flags & PF_EXITING)) {
504 ticks = cputime_add(prof_ticks(t), left); 501 ticks = cputime_add(prof_ticks(t), left);
505 if (cputime_eq(t->it_prof_expires, 502 if (cputime_eq(t->it_prof_expires,
506 cputime_zero) || 503 cputime_zero) ||
@@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
515 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 512 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
516 nthreads); 513 nthreads);
517 do { 514 do {
518 if (!unlikely(t->exit_state)) { 515 if (!unlikely(t->flags & PF_EXITING)) {
519 ticks = cputime_add(virt_ticks(t), left); 516 ticks = cputime_add(virt_ticks(t), left);
520 if (cputime_eq(t->it_virt_expires, 517 if (cputime_eq(t->it_virt_expires,
521 cputime_zero) || 518 cputime_zero) ||
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
530 nsleft = expires.sched - val.sched; 527 nsleft = expires.sched - val.sched;
531 do_div(nsleft, nthreads); 528 do_div(nsleft, nthreads);
532 do { 529 do {
533 if (!unlikely(t->exit_state)) { 530 if (!unlikely(t->flags & PF_EXITING)) {
534 ns = t->sched_time + nsleft; 531 ns = t->sched_time + nsleft;
535 if (t->it_sched_expires == 0 || 532 if (t->it_sched_expires == 0 ||
536 t->it_sched_expires > ns) { 533 t->it_sched_expires > ns) {
@@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
569 struct cpu_timer_list *next; 566 struct cpu_timer_list *next;
570 unsigned long i; 567 unsigned long i;
571 568
569 if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
570 return;
571
572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
573 p->cpu_timers : p->signal->cpu_timers); 573 p->cpu_timers : p->signal->cpu_timers);
574 head += CPUCLOCK_WHICH(timer->it_clock); 574 head += CPUCLOCK_WHICH(timer->it_clock);
@@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
579 listpos = head; 579 listpos = head;
580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
581 list_for_each_entry(next, head, entry) { 581 list_for_each_entry(next, head, entry) {
582 if (next->expires.sched > nt->expires.sched) { 582 if (next->expires.sched > nt->expires.sched)
583 listpos = &next->entry;
584 break; 583 break;
585 } 584 listpos = &next->entry;
586 } 585 }
587 } else { 586 } else {
588 list_for_each_entry(next, head, entry) { 587 list_for_each_entry(next, head, entry) {
589 if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { 588 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
590 listpos = &next->entry;
591 break; 589 break;
592 } 590 listpos = &next->entry;
593 } 591 }
594 } 592 }
595 list_add(&nt->entry, listpos); 593 list_add(&nt->entry, listpos);
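The rewritten loops change where the insertion cursor stops: listpos is advanced past every timer that expires no later than the new one, and the loop breaks on the first later timer, so list_add(), which inserts after the given position, keeps the list in expiry order (the old cursor handling could leave the new timer on the wrong side of its neighbours). The idiom in isolation, on a plain integer-keyed node with illustrative names:

#include <linux/list.h>

struct sketch_timer {
        struct list_head entry;
        unsigned long long expires;
};

static void sorted_insert(struct list_head *head, struct sketch_timer *nt)
{
        struct list_head *pos = head;
        struct sketch_timer *next;

        list_for_each_entry(next, head, entry) {
                if (next->expires > nt->expires)
                        break;                  /* insert before this one */
                pos = &next->entry;             /* advance past earlier timers */
        }
        list_add(&nt->entry, pos);              /* list_add() inserts after 'pos' */
}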
@@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
733 * Disarm any old timer after extracting its expiry time. 731 * Disarm any old timer after extracting its expiry time.
734 */ 732 */
735 BUG_ON(!irqs_disabled()); 733 BUG_ON(!irqs_disabled());
734
735 ret = 0;
736 spin_lock(&p->sighand->siglock); 736 spin_lock(&p->sighand->siglock);
737 old_expires = timer->it.cpu.expires; 737 old_expires = timer->it.cpu.expires;
738 list_del_init(&timer->it.cpu.entry); 738 if (unlikely(timer->it.cpu.firing)) {
739 timer->it.cpu.firing = -1;
740 ret = TIMER_RETRY;
741 } else
742 list_del_init(&timer->it.cpu.entry);
739 spin_unlock(&p->sighand->siglock); 743 spin_unlock(&p->sighand->siglock);
740 744
741 /* 745 /*
@@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
783 } 787 }
784 } 788 }
785 789
786 if (unlikely(timer->it.cpu.firing)) { 790 if (unlikely(ret)) {
787 /* 791 /*
788 * We are colliding with the timer actually firing. 792 * We are colliding with the timer actually firing.
789 * Punt after filling in the timer's old value, and 793 * Punt after filling in the timer's old value, and
@@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 * it as an overrun (thanks to bump_cpu_timer above). 795 * it as an overrun (thanks to bump_cpu_timer above).
792 */ 796 */
793 read_unlock(&tasklist_lock); 797 read_unlock(&tasklist_lock);
794 timer->it.cpu.firing = -1;
795 ret = TIMER_RETRY;
796 goto out; 798 goto out;
797 } 799 }
798 800
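
[Editor's sketch] The posix_cpu_timer_set() hunks move the "timer is currently firing" check under the siglock: instead of unconditionally unlinking the timer, the setter marks it with firing = -1 and reports TIMER_RETRY so the caller backs off and tries again once the expiry path has finished. A compressed model of that handshake under an assumed lock (the constants mirror the patch, the surrounding scaffolding is invented for illustration):

        #include <stdio.h>

        #define TIMER_RETRY 1

        struct cpu_timer {
                int firing;     /* > 0 while the expiry path is delivering it */
                int queued;     /* still on the expiry list?                  */
                unsigned long expires;
        };

        /* Called with the (assumed) siglock held.  If the timer is in the
         * middle of firing we must not unlink it; instead tell the firing
         * code to drop it (firing = -1) and ask our caller to retry. */
        static int disarm_timer(struct cpu_timer *t)
        {
                if (t->firing) {
                        t->firing = -1;
                        return TIMER_RETRY;
                }
                t->queued = 0;          /* models list_del_init() */
                return 0;
        }

        int main(void)
        {
                struct cpu_timer idle = { 0, 1, 100 }, busy = { 1, 1, 100 };

                printf("idle: %d busy: %d\n",
                       disarm_timer(&idle), disarm_timer(&busy));
                return 0;       /* prints "idle: 0 busy: 1" */
        }
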
@@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
958static void check_thread_timers(struct task_struct *tsk, 960static void check_thread_timers(struct task_struct *tsk,
959 struct list_head *firing) 961 struct list_head *firing)
960{ 962{
963 int maxfire;
961 struct list_head *timers = tsk->cpu_timers; 964 struct list_head *timers = tsk->cpu_timers;
962 965
966 maxfire = 20;
963 tsk->it_prof_expires = cputime_zero; 967 tsk->it_prof_expires = cputime_zero;
964 while (!list_empty(timers)) { 968 while (!list_empty(timers)) {
965 struct cpu_timer_list *t = list_entry(timers->next, 969 struct cpu_timer_list *t = list_entry(timers->next,
966 struct cpu_timer_list, 970 struct cpu_timer_list,
967 entry); 971 entry);
968 if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 972 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
969 tsk->it_prof_expires = t->expires.cpu; 973 tsk->it_prof_expires = t->expires.cpu;
970 break; 974 break;
971 } 975 }
@@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk,
974 } 978 }
975 979
976 ++timers; 980 ++timers;
981 maxfire = 20;
977 tsk->it_virt_expires = cputime_zero; 982 tsk->it_virt_expires = cputime_zero;
978 while (!list_empty(timers)) { 983 while (!list_empty(timers)) {
979 struct cpu_timer_list *t = list_entry(timers->next, 984 struct cpu_timer_list *t = list_entry(timers->next,
980 struct cpu_timer_list, 985 struct cpu_timer_list,
981 entry); 986 entry);
982 if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 987 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
983 tsk->it_virt_expires = t->expires.cpu; 988 tsk->it_virt_expires = t->expires.cpu;
984 break; 989 break;
985 } 990 }
@@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk,
988 } 993 }
989 994
990 ++timers; 995 ++timers;
996 maxfire = 20;
991 tsk->it_sched_expires = 0; 997 tsk->it_sched_expires = 0;
992 while (!list_empty(timers)) { 998 while (!list_empty(timers)) {
993 struct cpu_timer_list *t = list_entry(timers->next, 999 struct cpu_timer_list *t = list_entry(timers->next,
994 struct cpu_timer_list, 1000 struct cpu_timer_list,
995 entry); 1001 entry);
996 if (tsk->sched_time < t->expires.sched) { 1002 if (!--maxfire || tsk->sched_time < t->expires.sched) {
997 tsk->it_sched_expires = t->expires.sched; 1003 tsk->it_sched_expires = t->expires.sched;
998 break; 1004 break;
999 } 1005 }
@@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk,
1010static void check_process_timers(struct task_struct *tsk, 1016static void check_process_timers(struct task_struct *tsk,
1011 struct list_head *firing) 1017 struct list_head *firing)
1012{ 1018{
1019 int maxfire;
1013 struct signal_struct *const sig = tsk->signal; 1020 struct signal_struct *const sig = tsk->signal;
1014 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1021 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1015 unsigned long long sched_time, sched_expires; 1022 unsigned long long sched_time, sched_expires;
@@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk,
1042 } while (t != tsk); 1049 } while (t != tsk);
1043 ptime = cputime_add(utime, stime); 1050 ptime = cputime_add(utime, stime);
1044 1051
1052 maxfire = 20;
1045 prof_expires = cputime_zero; 1053 prof_expires = cputime_zero;
1046 while (!list_empty(timers)) { 1054 while (!list_empty(timers)) {
1047 struct cpu_timer_list *t = list_entry(timers->next, 1055 struct cpu_timer_list *t = list_entry(timers->next,
1048 struct cpu_timer_list, 1056 struct cpu_timer_list,
1049 entry); 1057 entry);
1050 if (cputime_lt(ptime, t->expires.cpu)) { 1058 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
1051 prof_expires = t->expires.cpu; 1059 prof_expires = t->expires.cpu;
1052 break; 1060 break;
1053 } 1061 }
@@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk,
1056 } 1064 }
1057 1065
1058 ++timers; 1066 ++timers;
1067 maxfire = 20;
1059 virt_expires = cputime_zero; 1068 virt_expires = cputime_zero;
1060 while (!list_empty(timers)) { 1069 while (!list_empty(timers)) {
1061 struct cpu_timer_list *t = list_entry(timers->next, 1070 struct cpu_timer_list *t = list_entry(timers->next,
1062 struct cpu_timer_list, 1071 struct cpu_timer_list,
1063 entry); 1072 entry);
1064 if (cputime_lt(utime, t->expires.cpu)) { 1073 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
1065 virt_expires = t->expires.cpu; 1074 virt_expires = t->expires.cpu;
1066 break; 1075 break;
1067 } 1076 }
@@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk,
1070 } 1079 }
1071 1080
1072 ++timers; 1081 ++timers;
1082 maxfire = 20;
1073 sched_expires = 0; 1083 sched_expires = 0;
1074 while (!list_empty(timers)) { 1084 while (!list_empty(timers)) {
1075 struct cpu_timer_list *t = list_entry(timers->next, 1085 struct cpu_timer_list *t = list_entry(timers->next,
1076 struct cpu_timer_list, 1086 struct cpu_timer_list,
1077 entry); 1087 entry);
1078 if (sched_time < t->expires.sched) { 1088 if (!--maxfire || sched_time < t->expires.sched) {
1079 sched_expires = t->expires.sched; 1089 sched_expires = t->expires.sched;
1080 break; 1090 break;
1081 } 1091 }
@@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk,
1158 unsigned long long sched_left, sched; 1168 unsigned long long sched_left, sched;
1159 const unsigned int nthreads = atomic_read(&sig->live); 1169 const unsigned int nthreads = atomic_read(&sig->live);
1160 1170
1171 if (!nthreads)
1172 return;
1173
1161 prof_left = cputime_sub(prof_expires, utime); 1174 prof_left = cputime_sub(prof_expires, utime);
1162 prof_left = cputime_sub(prof_left, stime); 1175 prof_left = cputime_sub(prof_left, stime);
1163 prof_left = cputime_div(prof_left, nthreads); 1176 prof_left = cputime_div(prof_left, nthreads);
@@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk,
1194 1207
1195 do { 1208 do {
1196 t = next_thread(t); 1209 t = next_thread(t);
1197 } while (unlikely(t->exit_state)); 1210 } while (unlikely(t->flags & PF_EXITING));
1198 } while (t != tsk); 1211 } while (t != tsk);
1199 } 1212 }
1200} 1213}
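
[Editor's sketch] The check_thread_timers()/check_process_timers() hunks bound each tick's work with a maxfire counter: only a limited batch of queued timers is moved to the firing list per pass, and the first timer that is refused becomes the next wake-up target, so an enormous backlog cannot stall the tick. A standalone model of that cap over a sorted expiry array (the counter value of 20 comes from the patch; everything else is illustrative):

        #include <stdio.h>

        /* Walk a sorted expiry list and "fire" entries that are already due,
         * but stop once the maxfire budget runs out; the first entry we
         * refuse to fire becomes the new wake-up target, exactly like the
         * it_*_expires updates in the patch. */
        static unsigned long check_timers(const unsigned long *expires, int n,
                                          unsigned long now, int *fired)
        {
                int maxfire = 20;

                *fired = 0;
                for (int i = 0; i < n; i++) {
                        if (!--maxfire || now < expires[i])
                                return expires[i];      /* next expiry to arm */
                        (*fired)++;                     /* goes on "firing" list */
                }
                return 0;                               /* nothing left queued */
        }

        int main(void)
        {
                unsigned long q[40];
                int fired;

                for (int i = 0; i < 40; i++)
                        q[i] = i + 1;           /* all already expired at t = 100 */

                unsigned long next = check_timers(q, 40, 100, &fired);
                printf("fired %d, re-arm at %lu\n", fired, next);  /* fired 19, re-arm at 20 */
                return 0;
        }
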
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..dda3cda73c77 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
427 timr->sigq->info.si_code = SI_TIMER; 427 timr->sigq->info.si_code = SI_TIMER;
428 timr->sigq->info.si_tid = timr->it_id; 428 timr->sigq->info.si_tid = timr->it_id;
429 timr->sigq->info.si_value = timr->it_sigev_value; 429 timr->sigq->info.si_value = timr->it_sigev_value;
430
430 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 431 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
431 if (unlikely(timr->it_process->flags & PF_EXITING)) { 432 struct task_struct *leader;
432 timr->it_sigev_notify = SIGEV_SIGNAL; 433 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
433 put_task_struct(timr->it_process); 434 timr->it_process);
434 timr->it_process = timr->it_process->group_leader; 435
435 goto group; 436 if (likely(ret >= 0))
436 } 437 return ret;
437 return send_sigqueue(timr->it_sigev_signo, timr->sigq, 438
438 timr->it_process); 439 timr->it_sigev_notify = SIGEV_SIGNAL;
439 } 440 leader = timr->it_process->group_leader;
440 else { 441 put_task_struct(timr->it_process);
441 group: 442 timr->it_process = leader;
442 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
443 timr->it_process);
444 } 443 }
444
445 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
446 timr->it_process);
445} 447}
446EXPORT_SYMBOL_GPL(posix_timer_event); 448EXPORT_SYMBOL_GPL(posix_timer_event);
447 449
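
[Editor's sketch] The posix_timer_event() rewrite tries the thread-directed delivery first and only falls back to the group leader when send_sigqueue() actually fails, instead of sampling PF_EXITING up front (which was racy). A model of that "try the specific target, fall back to the group" shape with stubbed-out delivery functions (the stubs and return codes are invented for illustration):

        #include <stdio.h>

        /* Stand-ins for send_sigqueue()/send_group_sigqueue(); in this model
         * a negative return means the targeted thread could not take the
         * signal (e.g. it is exiting). */
        static int send_to_thread(int alive) { return alive ? 0 : -1; }
        static int send_to_group(void)       { return 0; }

        /* Returns 1 if the specific thread got the signal, 2 if delivery
         * fell through to the whole group. */
        static int deliver(int thread_directed, int thread_alive)
        {
                if (thread_directed) {
                        int ret = send_to_thread(thread_alive);

                        if (ret >= 0)
                                return 1;
                        /* thread is gone: retarget to the group below */
                }
                return send_to_group() == 0 ? 2 : -1;
        }

        int main(void)
        {
                printf("%d %d %d\n",
                       deliver(1, 1),   /* thread delivery works          */
                       deliver(1, 0),   /* thread gone -> group fallback  */
                       deliver(0, 0));  /* group-directed from the start  */
                return 0;               /* prints "1 2 2" */
        }
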
@@ -1155,7 +1157,7 @@ retry_delete:
1155} 1157}
1156 1158
1157/* 1159/*
1158 * This is called by __exit_signal, only when there are no more 1160 * This is called by do_exit or de_thread, only when there are no more
1159 * references to the shared signal_struct. 1161 * references to the shared signal_struct.
1160 */ 1162 */
1161void exit_itimers(struct signal_struct *sig) 1163void exit_itimers(struct signal_struct *sig)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 917066a5767c..46a5e5acff97 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,5 +1,6 @@
1config PM 1config PM
2 bool "Power Management support" 2 bool "Power Management support"
3 depends on !IA64_HP_SIM
3 ---help--- 4 ---help---
4 "Power Management" means that parts of your computer are shut 5 "Power Management" means that parts of your computer are shut
5 off or put into a power conserving "sleep" mode if they are not 6 off or put into a power conserving "sleep" mode if they are not
@@ -28,7 +29,7 @@ config PM_DEBUG
28 29
29config SOFTWARE_SUSPEND 30config SOFTWARE_SUSPEND
30 bool "Software Suspend" 31 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) 32 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP)
32 ---help--- 33 ---help---
33 Enable the possibility of suspending the machine. 34 Enable the possibility of suspending the machine.
34 It doesn't need APM. 35 It doesn't need APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d8bf054d036..761956e813f5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -17,12 +17,12 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h>
20 21
21#include "power.h" 22#include "power.h"
22 23
23 24
24extern suspend_disk_method_t pm_disk_mode; 25extern suspend_disk_method_t pm_disk_mode;
25extern struct pm_ops * pm_ops;
26 26
27extern int swsusp_suspend(void); 27extern int swsusp_suspend(void);
28extern int swsusp_write(void); 28extern int swsusp_write(void);
@@ -49,13 +49,11 @@ dev_t swsusp_resume_device;
49 49
50static void power_down(suspend_disk_method_t mode) 50static void power_down(suspend_disk_method_t mode)
51{ 51{
52 unsigned long flags;
53 int error = 0; 52 int error = 0;
54 53
55 local_irq_save(flags);
56 switch(mode) { 54 switch(mode) {
57 case PM_DISK_PLATFORM: 55 case PM_DISK_PLATFORM:
58 device_shutdown(); 56 kernel_power_off_prepare();
59 error = pm_ops->enter(PM_SUSPEND_DISK); 57 error = pm_ops->enter(PM_SUSPEND_DISK);
60 break; 58 break;
61 case PM_DISK_SHUTDOWN: 59 case PM_DISK_SHUTDOWN:
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id, 60 unsigned long id,
61 pm_callback callback) 61 pm_callback callback)
62{ 62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); 63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) { 64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type; 65 dev->type = type;
67 dev->id = id; 66 dev->id = id;
68 dev->callback = callback; 67 dev->callback = callback;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cd6a3493cc0d..6748de23e83c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,7 +1,7 @@
1#include <linux/suspend.h> 1#include <linux/suspend.h>
2#include <linux/utsname.h> 2#include <linux/utsname.h>
3 3
4/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but 4/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
5 we probably do not take enough locks for switching consoles, etc, 5 we probably do not take enough locks for switching consoles, etc,
6 so bad things might happen. 6 so bad things might happen.
7*/ 7*/
@@ -9,6 +9,9 @@
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif 10#endif
11 11
12#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \
13 - 4 - 3*sizeof(unsigned long) - sizeof(int) \
14 - sizeof(void *)) / sizeof(swp_entry_t))
12 15
13struct swsusp_info { 16struct swsusp_info {
14 struct new_utsname uts; 17 struct new_utsname uts;
@@ -18,7 +21,7 @@ struct swsusp_info {
18 unsigned long image_pages; 21 unsigned long image_pages;
19 unsigned long pagedir_pages; 22 unsigned long pagedir_pages;
20 suspend_pagedir_t * suspend_pagedir; 23 suspend_pagedir_t * suspend_pagedir;
21 swp_entry_t pagedir[768]; 24 swp_entry_t pagedir[MAX_PBES];
22} __attribute__((aligned(PAGE_SIZE))); 25} __attribute__((aligned(PAGE_SIZE)));
23 26
24 27
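
[Editor's sketch] MAX_PBES sizes the pagedir[] array to whatever is left of the page-aligned swsusp_info header once its fixed fields are accounted for, replacing the hard-coded 768. A userspace check of the same arithmetic with stand-in sizes (PAGE_SIZE, the utsname size and swp_entry_t size are illustrative; the real values are architecture dependent):

        #include <stdio.h>

        int main(void)
        {
                /* Illustrative stand-ins for the kernel types/constants. */
                const unsigned long PAGE_SIZE    = 4096;
                const unsigned long UTSNAME_SZ   = 390; /* ~struct new_utsname */
                const unsigned long SWP_ENTRY_SZ = sizeof(unsigned long);

                /* Mirrors the MAX_PBES expression: page minus the header
                 * fields, divided by the size of one pagedir entry. */
                unsigned long max_pbes =
                        (PAGE_SIZE - UTSNAME_SZ - 4 - 3 * sizeof(unsigned long)
                         - sizeof(int) - sizeof(void *)) / SWP_ENTRY_SZ;

                printf("MAX_PBES would be %lu on this layout\n", max_pbes);
                return 0;
        }
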
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..2d5c45676442 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -363,7 +363,7 @@ static void lock_swapdevices(void)
363} 363}
364 364
365/** 365/**
366 * write_swap_page - Write one page to a fresh swap location. 366 * write_page - Write one page to a fresh swap location.
367 * @addr: Address we're writing. 367 * @addr: Address we're writing.
368 * @loc: Place to store the entry we used. 368 * @loc: Place to store the entry we used.
369 * 369 *
@@ -402,15 +402,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
402static void data_free(void) 402static void data_free(void)
403{ 403{
404 swp_entry_t entry; 404 swp_entry_t entry;
405 int i; 405 struct pbe * p;
406 406
407 for (i = 0; i < nr_copy_pages; i++) { 407 for_each_pbe(p, pagedir_nosave) {
408 entry = (pagedir_nosave + i)->swap_address; 408 entry = p->swap_address;
409 if (entry.val) 409 if (entry.val)
410 swap_free(entry); 410 swap_free(entry);
411 else 411 else
412 break; 412 break;
413 (pagedir_nosave + i)->swap_address = (swp_entry_t){0};
414 } 413 }
415} 414}
416 415
@@ -863,6 +862,9 @@ static int alloc_image_pages(void)
863 return 0; 862 return 0;
864} 863}
865 864
865/* Free pages we allocated for suspend. Suspend pages are alocated
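 866 * (note: "alocated" in the original comment is a typo for "allocated")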
866 * before atomic copy, so we need to free them after resume.
867 */
866void swsusp_free(void) 868void swsusp_free(void)
867{ 869{
868 BUG_ON(PageNosave(virt_to_page(pagedir_save))); 870 BUG_ON(PageNosave(virt_to_page(pagedir_save)));
@@ -918,6 +920,7 @@ static int swsusp_alloc(void)
918 920
919 pagedir_nosave = NULL; 921 pagedir_nosave = NULL;
920 nr_copy_pages = calc_nr(nr_copy_pages); 922 nr_copy_pages = calc_nr(nr_copy_pages);
923 nr_copy_pages_check = nr_copy_pages;
921 924
922 pr_debug("suspend: (pages needed: %d + %d free: %d)\n", 925 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
923 nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); 926 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
@@ -928,6 +931,10 @@ static int swsusp_alloc(void)
928 if (!enough_swap()) 931 if (!enough_swap())
929 return -ENOSPC; 932 return -ENOSPC;
930 933
934 if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
935 !!(nr_copy_pages % PBES_PER_PAGE))
936 return -ENOSPC;
937
931 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { 938 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
932 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 939 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
933 return -ENOMEM; 940 return -ENOMEM;
@@ -940,7 +947,6 @@ static int swsusp_alloc(void)
940 return error; 947 return error;
941 } 948 }
942 949
943 nr_copy_pages_check = nr_copy_pages;
944 return 0; 950 return 0;
945} 951}
946 952
@@ -1059,6 +1065,7 @@ int swsusp_resume(void)
1059 BUG_ON(!error); 1065 BUG_ON(!error);
1060 restore_processor_state(); 1066 restore_processor_state();
1061 restore_highmem(); 1067 restore_highmem();
1068 touch_softlockup_watchdog();
1062 device_power_up(); 1069 device_power_up();
1063 local_irq_enable(); 1070 local_irq_enable();
1064 return error; 1071 return error;
@@ -1088,7 +1095,7 @@ static inline void eat_page(void *page)
1088 *eaten_memory = c; 1095 *eaten_memory = c;
1089} 1096}
1090 1097
1091static unsigned long get_usable_page(unsigned gfp_mask) 1098unsigned long get_usable_page(unsigned gfp_mask)
1092{ 1099{
1093 unsigned long m; 1100 unsigned long m;
1094 1101
@@ -1102,7 +1109,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
1102 return m; 1109 return m;
1103} 1110}
1104 1111
1105static void free_eaten_memory(void) 1112void free_eaten_memory(void)
1106{ 1113{
1107 unsigned long m; 1114 unsigned long m;
1108 void **c; 1115 void **c;
@@ -1212,8 +1219,9 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1212 free_pagedir(pblist); 1219 free_pagedir(pblist);
1213 free_eaten_memory(); 1220 free_eaten_memory();
1214 pblist = NULL; 1221 pblist = NULL;
1215 } 1222 /* Is this even worth handling? It should never ever happen, and we
1216 else 1223 have just lost user's state, anyway... */
1224 } else
1217 printk("swsusp: Relocated %d pages\n", rel); 1225 printk("swsusp: Relocated %d pages\n", rel);
1218 1226
1219 return pblist; 1227 return pblist;
@@ -1433,9 +1441,9 @@ static int read_pagedir(struct pbe *pblist)
1433 } 1441 }
1434 1442
1435 if (error) 1443 if (error)
1436 free_page((unsigned long)pblist); 1444 free_pagedir(pblist);
1437 1445 else
1438 BUG_ON(i != swsusp_info.pagedir_pages); 1446 BUG_ON(i != swsusp_info.pagedir_pages);
1439 1447
1440 return error; 1448 return error;
1441} 1449}
@@ -1473,11 +1481,12 @@ static int read_suspend_image(void)
1473 /* Allocate memory for the image and read the data from swap */ 1481 /* Allocate memory for the image and read the data from swap */
1474 1482
1475 error = check_pagedir(pagedir_nosave); 1483 error = check_pagedir(pagedir_nosave);
1476 free_eaten_memory(); 1484
1477 if (!error) 1485 if (!error)
1478 error = data_read(pagedir_nosave); 1486 error = data_read(pagedir_nosave);
1479 1487
1480 if (error) { /* We fail cleanly */ 1488 if (error) { /* We fail cleanly */
1489 free_eaten_memory();
1481 for_each_pbe (p, pagedir_nosave) 1490 for_each_pbe (p, pagedir_nosave)
1482 if (p->address) { 1491 if (p->address) {
1483 free_page(p->address); 1492 free_page(p->address);
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..4b8f0f9230a4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -488,6 +488,11 @@ static int __init printk_time_setup(char *str)
488 488
489__setup("time", printk_time_setup); 489__setup("time", printk_time_setup);
490 490
491__attribute__((weak)) unsigned long long printk_clock(void)
492{
493 return sched_clock();
494}
495
491/* 496/*
492 * This is printk. It can be called from any context. We want it to work. 497 * This is printk. It can be called from any context. We want it to work.
493 * 498 *
@@ -514,6 +519,9 @@ asmlinkage int printk(const char *fmt, ...)
514 return r; 519 return r;
515} 520}
516 521
522/* cpu currently holding logbuf_lock */
523static volatile unsigned int printk_cpu = UINT_MAX;
524
517asmlinkage int vprintk(const char *fmt, va_list args) 525asmlinkage int vprintk(const char *fmt, va_list args)
518{ 526{
519 unsigned long flags; 527 unsigned long flags;
@@ -522,11 +530,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
522 static char printk_buf[1024]; 530 static char printk_buf[1024];
523 static int log_level_unknown = 1; 531 static int log_level_unknown = 1;
524 532
525 if (unlikely(oops_in_progress)) 533 preempt_disable();
534 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
535 /* If a crash is occurring during printk() on this CPU,
536 * make sure we can't deadlock */
526 zap_locks(); 537 zap_locks();
527 538
528 /* This stops the holder of console_sem just where we want him */ 539 /* This stops the holder of console_sem just where we want him */
529 spin_lock_irqsave(&logbuf_lock, flags); 540 spin_lock_irqsave(&logbuf_lock, flags);
541 printk_cpu = smp_processor_id();
530 542
531 /* Emit the output into the temporary buffer */ 543 /* Emit the output into the temporary buffer */
532 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 544 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -558,7 +570,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
558 loglev_char = default_message_loglevel 570 loglev_char = default_message_loglevel
559 + '0'; 571 + '0';
560 } 572 }
561 t = sched_clock(); 573 t = printk_clock();
562 nanosec_rem = do_div(t, 1000000000); 574 nanosec_rem = do_div(t, 1000000000);
563 tlen = sprintf(tbuf, 575 tlen = sprintf(tbuf,
564 "<%c>[%5lu.%06lu] ", 576 "<%c>[%5lu.%06lu] ",
@@ -595,6 +607,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 * CPU until it is officially up. We shouldn't be calling into 607 * CPU until it is officially up. We shouldn't be calling into
596 * random console drivers on a CPU which doesn't exist yet.. 608 * random console drivers on a CPU which doesn't exist yet..
597 */ 609 */
610 printk_cpu = UINT_MAX;
598 spin_unlock_irqrestore(&logbuf_lock, flags); 611 spin_unlock_irqrestore(&logbuf_lock, flags);
599 goto out; 612 goto out;
600 } 613 }
@@ -604,6 +617,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
604 * We own the drivers. We can drop the spinlock and let 617 * We own the drivers. We can drop the spinlock and let
605 * release_console_sem() print the text 618 * release_console_sem() print the text
606 */ 619 */
620 printk_cpu = UINT_MAX;
607 spin_unlock_irqrestore(&logbuf_lock, flags); 621 spin_unlock_irqrestore(&logbuf_lock, flags);
608 console_may_schedule = 0; 622 console_may_schedule = 0;
609 release_console_sem(); 623 release_console_sem();
@@ -613,9 +627,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
613 * allows the semaphore holder to proceed and to call the 627 * allows the semaphore holder to proceed and to call the
614 * console drivers with the output which we just produced. 628 * console drivers with the output which we just produced.
615 */ 629 */
630 printk_cpu = UINT_MAX;
616 spin_unlock_irqrestore(&logbuf_lock, flags); 631 spin_unlock_irqrestore(&logbuf_lock, flags);
617 } 632 }
618out: 633out:
634 preempt_enable();
619 return printed_len; 635 return printed_len;
620} 636}
621EXPORT_SYMBOL(printk); 637EXPORT_SYMBOL(printk);
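
[Editor's sketch] The printk() changes record which CPU currently owns logbuf_lock in printk_cpu and only call zap_locks() when an oops is in progress on that same CPU, i.e. when printk has recursed into itself while holding the lock; any other CPU just waits as usual. A single-threaded model of that "am I already the lock owner?" test (the flag-based lock and names are stand-ins, not the kernel's spinlock):

        #include <stdio.h>

        #define NO_OWNER -1

        static int logbuf_locked;
        static int printk_cpu = NO_OWNER;  /* "cpu" currently holding the lock */

        static void zap_locks(void) { logbuf_locked = 0; }

        /* Model of the patched vprintk() entry: break the log lock only when
         * the oops is happening on the CPU that already owns it -- the one
         * case where waiting would deadlock. */
        static void vprintk_model(int cpu, int oops_in_progress)
        {
                if (oops_in_progress && printk_cpu == cpu)
                        zap_locks();

                while (logbuf_locked)
                        ;                       /* spin_lock_irqsave() stand-in */
                logbuf_locked = 1;
                printk_cpu = cpu;

                /* ... format and store the message ... */

                printk_cpu = NO_OWNER;
                logbuf_locked = 0;              /* spin_unlock_irqrestore() stand-in */
        }

        int main(void)
        {
                vprintk_model(0, 0);            /* normal path */

                /* Simulate recursion: cpu 0 oopses while it holds the lock. */
                logbuf_locked = 1;
                printk_cpu = 0;
                vprintk_model(0, 1);            /* zaps the lock instead of spinning */

                printf("no deadlock\n");
                return 0;
        }
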
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
118 return ret; 118 return ret;
119} 119}
120 120
121static int may_attach(struct task_struct *task)
122{
123 if (!task->mm)
124 return -EPERM;
125 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) ||
127 (current->uid != task->uid) ||
128 (current->gid != task->egid) ||
129 (current->gid != task->sgid) ||
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM;
132 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM;
135
136 return security_ptrace(current, task);
137}
138
139int ptrace_may_attach(struct task_struct *task)
140{
141 int err;
142 task_lock(task);
143 err = may_attach(task);
144 task_unlock(task);
145 return !err;
146}
147
121int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
122{ 149{
123 int retval; 150 int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
127 goto bad; 154 goto bad;
128 if (task == current) 155 if (task == current)
129 goto bad; 156 goto bad;
130 if (!task->mm)
131 goto bad;
132 if(((current->uid != task->euid) ||
133 (current->uid != task->suid) ||
134 (current->uid != task->uid) ||
135 (current->gid != task->egid) ||
136 (current->gid != task->sgid) ||
137 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
138 goto bad;
139 smp_rmb();
140 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
141 goto bad;
142 /* the same process cannot be attached many times */ 157 /* the same process cannot be attached many times */
143 if (task->ptrace & PT_PTRACED) 158 if (task->ptrace & PT_PTRACED)
144 goto bad; 159 goto bad;
145 retval = security_ptrace(current, task); 160 retval = may_attach(task);
146 if (retval) 161 if (retval)
147 goto bad; 162 goto bad;
148 163
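
[Editor's sketch] The ptrace hunk hoists the permission checks out of ptrace_attach() into may_attach(), and adds a ptrace_may_attach() wrapper that takes the task lock and collapses the result to a boolean, so other callers can reuse the same policy without duplicating it. A toy version of that "one policy function, two entry points" refactor (the credential check, struct and locking are stand-ins):

        #include <pthread.h>
        #include <stdio.h>

        struct task {
                pthread_mutex_t lock;
                int uid;
                int dumpable;
        };

        /* The single place the policy lives; returns 0 or an -EPERM-style
         * error.  Callers are expected to hold task->lock. */
        static int may_attach(const struct task *tracer, const struct task *task)
        {
                if (tracer->uid != task->uid && tracer->uid != 0)
                        return -1;
                if (!task->dumpable && tracer->uid != 0)
                        return -1;
                return 0;
        }

        /* Boolean convenience wrapper, mirroring ptrace_may_attach(). */
        static int ptrace_may_attach(struct task *tracer, struct task *task)
        {
                int err;

                pthread_mutex_lock(&task->lock);
                err = may_attach(tracer, task);
                pthread_mutex_unlock(&task->lock);
                return !err;
        }

        int main(void)
        {
                struct task root  = { PTHREAD_MUTEX_INITIALIZER, 0, 1 };
                struct task user  = { PTHREAD_MUTEX_INITIALIZER, 1000, 1 };
                struct task other = { PTHREAD_MUTEX_INITIALIZER, 1001, 0 };

                printf("%d %d\n", ptrace_may_attach(&user, &other),   /* 0 */
                                  ptrace_may_attach(&root, &other));  /* 1 */
                return 0;
        }
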
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f436993bd590..2559d4b8f23f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/percpu.h> 45#include <linux/percpu.h>
46#include <linux/notifier.h> 46#include <linux/notifier.h>
47#include <linux/rcupdate.h> 47#include <linux/rcupdate.h>
48#include <linux/rcuref.h>
48#include <linux/cpu.h> 49#include <linux/cpu.h>
49 50
50/* Definition for rcupdate control block. */ 51/* Definition for rcupdate control block. */
@@ -70,7 +71,20 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
70 71
71/* Fake initialization required by compiler */ 72/* Fake initialization required by compiler */
72static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
73static int maxbatch = 10; 74static int maxbatch = 10000;
75
76#ifndef __HAVE_ARCH_CMPXCHG
77/*
78 * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
79 * 32 bit atomic_t implementations, and a hash function similar to that
80 * for our refcounting needs.
 82 * Can't help multiprocessors which do not have cmpxchg :(
82 */
83
84spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
85 [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
86};
87#endif
74 88
75/** 89/**
76 * call_rcu - Queue an RCU callback for invocation after a grace period. 90 * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -95,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head,
95 rdp = &__get_cpu_var(rcu_data); 109 rdp = &__get_cpu_var(rcu_data);
96 *rdp->nxttail = head; 110 *rdp->nxttail = head;
97 rdp->nxttail = &head->next; 111 rdp->nxttail = &head->next;
112
113 if (unlikely(++rdp->count > 10000))
114 set_need_resched();
115
98 local_irq_restore(flags); 116 local_irq_restore(flags);
99} 117}
100 118
@@ -126,6 +144,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
126 rdp = &__get_cpu_var(rcu_bh_data); 144 rdp = &__get_cpu_var(rcu_bh_data);
127 *rdp->nxttail = head; 145 *rdp->nxttail = head;
128 rdp->nxttail = &head->next; 146 rdp->nxttail = &head->next;
147 rdp->count++;
148/*
149 * Should we directly call rcu_do_batch() here ?
150 * if (unlikely(rdp->count > 10000))
151 * rcu_do_batch(rdp);
152 */
129 local_irq_restore(flags); 153 local_irq_restore(flags);
130} 154}
131 155
@@ -143,6 +167,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
143 next = rdp->donelist = list->next; 167 next = rdp->donelist = list->next;
144 list->func(list); 168 list->func(list);
145 list = next; 169 list = next;
170 rdp->count--;
146 if (++count >= maxbatch) 171 if (++count >= maxbatch)
147 break; 172 break;
148 } 173 }
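
[Editor's sketch] On SMP machines without a usable cmpxchg, the new rcuref code falls back to an array of spinlocks: the counter's address is hashed to pick one of RCUREF_HASH_SIZE locks, and the "atomic" operation runs under that lock. A userspace model of the hashed-lock trick protecting plain reference counts (the hash function, sizes and pthread locks are illustrative):

        #include <pthread.h>
        #include <stdint.h>
        #include <stdio.h>

        #define HASH_SIZE 16

        static pthread_mutex_t ref_hash[HASH_SIZE];

        /* Pick a lock based on the counter's address, like __rcuref_hash:
         * unrelated counters usually map to different locks, so some
         * concurrency remains even without a hardware cmpxchg. */
        static pthread_mutex_t *lock_for(const void *addr)
        {
                uintptr_t h = (uintptr_t)addr >> 4;     /* crude hash */

                return &ref_hash[h % HASH_SIZE];
        }

        static void ref_get(int *ref)
        {
                pthread_mutex_t *l = lock_for(ref);

                pthread_mutex_lock(l);
                (*ref)++;
                pthread_mutex_unlock(l);
        }

        static int ref_put(int *ref)            /* returns 1 when it hit zero */
        {
                pthread_mutex_t *l = lock_for(ref);
                int zero;

                pthread_mutex_lock(l);
                zero = (--*ref == 0);
                pthread_mutex_unlock(l);
                return zero;
        }

        int main(void)
        {
                int refcount = 1;

                for (int i = 0; i < HASH_SIZE; i++)
                        pthread_mutex_init(&ref_hash[i], NULL);

                ref_get(&refcount);
                int a = ref_put(&refcount);
                int b = ref_put(&refcount);
                printf("%d %d\n", a, b);        /* 0 1 */
                return 0;
        }
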
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
430 */ 430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{ 432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); 433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 434
435 if (res) { 435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name; 436 res->name = name;
438 res->start = start; 437 res->start = start;
439 res->end = start + n - 1; 438 res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..1e5cafdf4e27 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
294 294
295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
296{ 296{
297#ifdef CONFIG_DEBUG_SPINLOCK
298 /* this is a valid case when another task releases the spinlock */
299 rq->lock.owner = current;
300#endif
297 spin_unlock_irq(&rq->lock); 301 spin_unlock_irq(&rq->lock);
298} 302}
299 303
@@ -875,7 +879,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
875 * smp_call_function() if an IPI is sent by the same process we are 879 * smp_call_function() if an IPI is sent by the same process we are
876 * waiting to become inactive. 880 * waiting to become inactive.
877 */ 881 */
878void wait_task_inactive(task_t * p) 882void wait_task_inactive(task_t *p)
879{ 883{
880 unsigned long flags; 884 unsigned long flags;
881 runqueue_t *rq; 885 runqueue_t *rq;
@@ -966,8 +970,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
966 int local_group; 970 int local_group;
967 int i; 971 int i;
968 972
973 /* Skip over this group if it has no CPUs allowed */
974 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
975 goto nextgroup;
976
969 local_group = cpu_isset(this_cpu, group->cpumask); 977 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971 978
972 /* Tally up the load of all CPUs in the group */ 979 /* Tally up the load of all CPUs in the group */
973 avg_load = 0; 980 avg_load = 0;
@@ -992,6 +999,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
992 min_load = avg_load; 999 min_load = avg_load;
993 idlest = group; 1000 idlest = group;
994 } 1001 }
1002nextgroup:
995 group = group->next; 1003 group = group->next;
996 } while (group != sd->groups); 1004 } while (group != sd->groups);
997 1005
@@ -1003,13 +1011,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1003/* 1011/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group. 1012 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */ 1013 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu) 1014static int
1015find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1007{ 1016{
1017 cpumask_t tmp;
1008 unsigned long load, min_load = ULONG_MAX; 1018 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1; 1019 int idlest = -1;
1010 int i; 1020 int i;
1011 1021
1012 for_each_cpu_mask(i, group->cpumask) { 1022 /* Traverse only the allowed CPUs */
1023 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1024
1025 for_each_cpu_mask(i, tmp) {
1013 load = source_load(i, 0); 1026 load = source_load(i, 0);
1014 1027
1015 if (load < min_load || (load == min_load && i == this_cpu)) { 1028 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1065,7 @@ static int sched_balance_self(int cpu, int flag)
1052 if (!group) 1065 if (!group)
1053 goto nextlevel; 1066 goto nextlevel;
1054 1067
1055 new_cpu = find_idlest_cpu(group, cpu); 1068 new_cpu = find_idlest_cpu(group, t, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu) 1069 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel; 1070 goto nextlevel;
1058 1071
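
[Editor's sketch] find_idlest_group()/find_idlest_cpu() now honour the task's cpus_allowed mask: groups with no permitted CPU are skipped, and the per-CPU scan runs over the intersection of the group mask and the task mask. A bitmask model of that filtering, with a 64-bit word standing in for cpumask_t and a made-up load table:

        #include <stdint.h>
        #include <stdio.h>

        static const unsigned long load[8] = { 90, 10, 50, 5, 70, 30, 20, 40 };

        /* Scan only CPUs that are both in the group and allowed for the task,
         * mirroring the cpus_and(tmp, group->cpumask, p->cpus_allowed) step. */
        static int find_idlest_cpu(uint64_t group_mask, uint64_t allowed)
        {
                uint64_t tmp = group_mask & allowed;
                unsigned long min_load = ~0UL;
                int idlest = -1;

                for (int cpu = 0; cpu < 8; cpu++) {
                        if (!(tmp & (1ULL << cpu)))
                                continue;
                        if (load[cpu] < min_load) {
                                min_load = load[cpu];
                                idlest = cpu;
                        }
                }
                return idlest;  /* -1: no allowed CPU, skip this group */
        }

        int main(void)
        {
                printf("%d\n", find_idlest_cpu(0x0F, 0xFF));    /* cpu 3 (load 5)  */
                printf("%d\n", find_idlest_cpu(0x0F, 0xF4));    /* cpu 2 (load 50) */
                printf("%d\n", find_idlest_cpu(0x0F, 0xF0));    /* -1: group skipped */
                return 0;
        }
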
@@ -1127,7 +1140,7 @@ static inline int wake_idle(int cpu, task_t *p)
1127 * 1140 *
1128 * returns failure only if the task is already active. 1141 * returns failure only if the task is already active.
1129 */ 1142 */
1130static int try_to_wake_up(task_t * p, unsigned int state, int sync) 1143static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1131{ 1144{
1132 int cpu, this_cpu, success = 0; 1145 int cpu, this_cpu, success = 0;
1133 unsigned long flags; 1146 unsigned long flags;
@@ -1252,6 +1265,16 @@ out_activate:
1252 } 1265 }
1253 1266
1254 /* 1267 /*
1268 * Tasks that have marked their sleep as noninteractive get
1269 * woken up without updating their sleep average. (i.e. their
1270 * sleep is handled in a priority-neutral manner, no priority
1271 * boost and no penalty.)
1272 */
1273 if (old_state & TASK_NONINTERACTIVE)
1274 __activate_task(p, rq);
1275 else
1276 activate_task(p, rq, cpu == this_cpu);
1277 /*
1255 * Sync wakeups (i.e. those types of wakeups where the waker 1278 * Sync wakeups (i.e. those types of wakeups where the waker
1256 * has indicated that it will leave the CPU in short order) 1279 * has indicated that it will leave the CPU in short order)
1257 * don't trigger a preemption, if the woken up task will run on 1280 * don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1282,6 @@ out_activate:
1259 * the waker guarantees that the freshly woken up task is going 1282 * the waker guarantees that the freshly woken up task is going
1260 * to be considered on this CPU.) 1283 * to be considered on this CPU.)
1261 */ 1284 */
1262 activate_task(p, rq, cpu == this_cpu);
1263 if (!sync || cpu != this_cpu) { 1285 if (!sync || cpu != this_cpu) {
1264 if (TASK_PREEMPTS_CURR(p, rq)) 1286 if (TASK_PREEMPTS_CURR(p, rq))
1265 resched_task(rq->curr); 1287 resched_task(rq->curr);
@@ -1274,7 +1296,7 @@ out:
1274 return success; 1296 return success;
1275} 1297}
1276 1298
1277int fastcall wake_up_process(task_t * p) 1299int fastcall wake_up_process(task_t *p)
1278{ 1300{
1279 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1301 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1280 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1302 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1375,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1353 * that must be done for every newly created context, then puts the task 1375 * that must be done for every newly created context, then puts the task
1354 * on the runqueue and wakes it. 1376 * on the runqueue and wakes it.
1355 */ 1377 */
1356void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) 1378void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1357{ 1379{
1358 unsigned long flags; 1380 unsigned long flags;
1359 int this_cpu, cpu; 1381 int this_cpu, cpu;
@@ -1436,7 +1458,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1436 * artificially, because any timeslice recovered here 1458 * artificially, because any timeslice recovered here
1437 * was given away by the parent in the first place.) 1459 * was given away by the parent in the first place.)
1438 */ 1460 */
1439void fastcall sched_exit(task_t * p) 1461void fastcall sched_exit(task_t *p)
1440{ 1462{
1441 unsigned long flags; 1463 unsigned long flags;
1442 runqueue_t *rq; 1464 runqueue_t *rq;
@@ -1478,6 +1500,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1478 1500
1479/** 1501/**
1480 * finish_task_switch - clean up after a task-switch 1502 * finish_task_switch - clean up after a task-switch
1503 * @rq: runqueue associated with task-switch
1481 * @prev: the thread we just switched away from. 1504 * @prev: the thread we just switched away from.
1482 * 1505 *
1483 * finish_task_switch must be called after the context switch, paired 1506 * finish_task_switch must be called after the context switch, paired
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1752 */ 1775 */
1753static inline 1776static inline
1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1777int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned) 1778 struct sched_domain *sd, enum idle_type idle,
1779 int *all_pinned)
1756{ 1780{
1757 /* 1781 /*
1758 * We do not migrate tasks that are: 1782 * We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
1882 */ 1906 */
1883static struct sched_group * 1907static struct sched_group *
1884find_busiest_group(struct sched_domain *sd, int this_cpu, 1908find_busiest_group(struct sched_domain *sd, int this_cpu,
1885 unsigned long *imbalance, enum idle_type idle) 1909 unsigned long *imbalance, enum idle_type idle, int *sd_idle)
1886{ 1910{
1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1911 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1912 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1913 unsigned long max_pull;
1889 int load_idx; 1914 int load_idx;
1890 1915
1891 max_load = this_load = total_load = total_pwr = 0; 1916 max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1907 avg_load = 0; 1932 avg_load = 0;
1908 1933
1909 for_each_cpu_mask(i, group->cpumask) { 1934 for_each_cpu_mask(i, group->cpumask) {
1935 if (*sd_idle && !idle_cpu(i))
1936 *sd_idle = 0;
1937
1910 /* Bias balancing toward cpus of our domain */ 1938 /* Bias balancing toward cpus of our domain */
1911 if (local_group) 1939 if (local_group)
1912 load = target_load(i, load_idx); 1940 load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1932 group = group->next; 1960 group = group->next;
1933 } while (group != sd->groups); 1961 } while (group != sd->groups);
1934 1962
1935 if (!busiest || this_load >= max_load) 1963 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
1936 goto out_balanced; 1964 goto out_balanced;
1937 1965
1938 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1966 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1952 * by pulling tasks to us. Be careful of negative numbers as they'll 1980 * by pulling tasks to us. Be careful of negative numbers as they'll
1953 * appear as very large values with unsigned longs. 1981 * appear as very large values with unsigned longs.
1954 */ 1982 */
1983
1984 /* Don't want to pull so many tasks that a group would go idle */
1985 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
1986
1955 /* How much load to actually move to equalise the imbalance */ 1987 /* How much load to actually move to equalise the imbalance */
1956 *imbalance = min((max_load - avg_load) * busiest->cpu_power, 1988 *imbalance = min(max_pull * busiest->cpu_power,
1957 (avg_load - this_load) * this->cpu_power) 1989 (avg_load - this_load) * this->cpu_power)
1958 / SCHED_LOAD_SCALE; 1990 / SCHED_LOAD_SCALE;
1959 1991
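
[Editor's sketch] The find_busiest_group() change clamps how much load a balance pass may pull: max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE), so the busiest group is never drained below one full unit of load, and the final imbalance is still the minimum of what the busiest group can give and what this group can take. A small numeric check of that formula (SCHED_LOAD_SCALE of 128, equal cpu_power on both sides and the sample loads are illustrative):

        #include <stdio.h>

        #define SCHED_LOAD_SCALE 128UL

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        /* cpu_power of both groups is taken as SCHED_LOAD_SCALE here so the
         * scaling cancels out and the numbers stay readable. */
        static unsigned long imbalance(unsigned long max_load,
                                       unsigned long avg_load,
                                       unsigned long this_load)
        {
                unsigned long max_pull = min_ul(max_load - avg_load,
                                                max_load - SCHED_LOAD_SCALE);

                return min_ul(max_pull * SCHED_LOAD_SCALE,
                              (avg_load - this_load) * SCHED_LOAD_SCALE)
                        / SCHED_LOAD_SCALE;
        }

        int main(void)
        {
                /* Well above SCHED_LOAD_SCALE the clamp changes nothing (10),
                 * but when the busiest group is barely above one unit of load
                 * the pull is capped so it is not drained below it (2). */
                printf("%lu\n", imbalance(160, 150, 140));      /* 10 */
                printf("%lu\n", imbalance(130, 120, 110));      /* 2  */
                return 0;
        }
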
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2050 unsigned long imbalance; 2082 unsigned long imbalance;
2051 int nr_moved, all_pinned = 0; 2083 int nr_moved, all_pinned = 0;
2052 int active_balance = 0; 2084 int active_balance = 0;
2085 int sd_idle = 0;
2086
2087 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
2088 sd_idle = 1;
2053 2089
2054 spin_lock(&this_rq->lock);
2055 schedstat_inc(sd, lb_cnt[idle]); 2090 schedstat_inc(sd, lb_cnt[idle]);
2056 2091
2057 group = find_busiest_group(sd, this_cpu, &imbalance, idle); 2092 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
2058 if (!group) { 2093 if (!group) {
2059 schedstat_inc(sd, lb_nobusyg[idle]); 2094 schedstat_inc(sd, lb_nobusyg[idle]);
2060 goto out_balanced; 2095 goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2078 * still unbalanced. nr_moved simply stays zero, so it is 2113 * still unbalanced. nr_moved simply stays zero, so it is
2079 * correctly treated as an imbalance. 2114 * correctly treated as an imbalance.
2080 */ 2115 */
2081 double_lock_balance(this_rq, busiest); 2116 double_rq_lock(this_rq, busiest);
2082 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2117 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2083 imbalance, sd, idle, 2118 imbalance, sd, idle, &all_pinned);
2084 &all_pinned); 2119 double_rq_unlock(this_rq, busiest);
2085 spin_unlock(&busiest->lock);
2086 2120
2087 /* All tasks on this runqueue were pinned by CPU affinity */ 2121 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned)) 2122 if (unlikely(all_pinned))
2089 goto out_balanced; 2123 goto out_balanced;
2090 } 2124 }
2091 2125
2092 spin_unlock(&this_rq->lock);
2093
2094 if (!nr_moved) { 2126 if (!nr_moved) {
2095 schedstat_inc(sd, lb_failed[idle]); 2127 schedstat_inc(sd, lb_failed[idle]);
2096 sd->nr_balance_failed++; 2128 sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2130 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2099 2131
2100 spin_lock(&busiest->lock); 2132 spin_lock(&busiest->lock);
2133
2134 /* don't kick the migration_thread, if the curr
2135 * task on busiest cpu can't be moved to this_cpu
2136 */
2137 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2138 spin_unlock(&busiest->lock);
2139 all_pinned = 1;
2140 goto out_one_pinned;
2141 }
2142
2101 if (!busiest->active_balance) { 2143 if (!busiest->active_balance) {
2102 busiest->active_balance = 1; 2144 busiest->active_balance = 1;
2103 busiest->push_cpu = this_cpu; 2145 busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2130 sd->balance_interval *= 2; 2172 sd->balance_interval *= 2;
2131 } 2173 }
2132 2174
2175 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2176 return -1;
2133 return nr_moved; 2177 return nr_moved;
2134 2178
2135out_balanced: 2179out_balanced:
2136 spin_unlock(&this_rq->lock);
2137
2138 schedstat_inc(sd, lb_balanced[idle]); 2180 schedstat_inc(sd, lb_balanced[idle]);
2139 2181
2140 sd->nr_balance_failed = 0; 2182 sd->nr_balance_failed = 0;
2183
2184out_one_pinned:
2141 /* tune up the balancing interval */ 2185 /* tune up the balancing interval */
2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2186 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval)) 2187 (sd->balance_interval < sd->max_interval))
2144 sd->balance_interval *= 2; 2188 sd->balance_interval *= 2;
2145 2189
2190 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2191 return -1;
2146 return 0; 2192 return 0;
2147} 2193}
2148 2194
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2160 runqueue_t *busiest = NULL; 2206 runqueue_t *busiest = NULL;
2161 unsigned long imbalance; 2207 unsigned long imbalance;
2162 int nr_moved = 0; 2208 int nr_moved = 0;
2209 int sd_idle = 0;
2210
2211 if (sd->flags & SD_SHARE_CPUPOWER)
2212 sd_idle = 1;
2163 2213
2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2214 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2215 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
2166 if (!group) { 2216 if (!group) {
2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2217 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2168 goto out_balanced; 2218 goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2176 2226
2177 BUG_ON(busiest == this_rq); 2227 BUG_ON(busiest == this_rq);
2178 2228
2179 /* Attempt to move tasks */
2180 double_lock_balance(this_rq, busiest);
2181
2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2229 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2183 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2230
2231 nr_moved = 0;
2232 if (busiest->nr_running > 1) {
2233 /* Attempt to move tasks */
2234 double_lock_balance(this_rq, busiest);
2235 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2184 imbalance, sd, NEWLY_IDLE, NULL); 2236 imbalance, sd, NEWLY_IDLE, NULL);
2185 if (!nr_moved) 2237 spin_unlock(&busiest->lock);
2238 }
2239
2240 if (!nr_moved) {
2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2241 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else 2242 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2243 return -1;
2244 } else
2188 sd->nr_balance_failed = 0; 2245 sd->nr_balance_failed = 0;
2189 2246
2190 spin_unlock(&busiest->lock);
2191 return nr_moved; 2247 return nr_moved;
2192 2248
2193out_balanced: 2249out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2250 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2251 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2252 return -1;
2195 sd->nr_balance_failed = 0; 2253 sd->nr_balance_failed = 0;
2196 return 0; 2254 return 0;
2197} 2255}
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2316 2374
2317 if (j - sd->last_balance >= interval) { 2375 if (j - sd->last_balance >= interval) {
2318 if (load_balance(this_cpu, this_rq, sd, idle)) { 2376 if (load_balance(this_cpu, this_rq, sd, idle)) {
2319 /* We've pulled tasks over so no longer idle */ 2377 /*
2378 * We've pulled tasks over so either we're no
2379 * longer idle, or one of our SMT siblings is
2380 * not idle.
2381 */
2320 idle = NOT_IDLE; 2382 idle = NOT_IDLE;
2321 } 2383 }
2322 sd->last_balance += interval; 2384 sd->last_balance += interval;
@@ -2575,6 +2637,13 @@ out:
2575} 2637}
2576 2638
2577#ifdef CONFIG_SCHED_SMT 2639#ifdef CONFIG_SCHED_SMT
2640static inline void wakeup_busy_runqueue(runqueue_t *rq)
2641{
2642 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2643 if (rq->curr == rq->idle && rq->nr_running)
2644 resched_task(rq->idle);
2645}
2646
2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2647static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2579{ 2648{
2580 struct sched_domain *tmp, *sd = NULL; 2649 struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2608 for_each_cpu_mask(i, sibling_map) { 2677 for_each_cpu_mask(i, sibling_map) {
2609 runqueue_t *smt_rq = cpu_rq(i); 2678 runqueue_t *smt_rq = cpu_rq(i);
2610 2679
2611 /* 2680 wakeup_busy_runqueue(smt_rq);
2612 * If an SMT sibling task is sleeping due to priority
2613 * reasons wake it up now.
2614 */
2615 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2616 resched_task(smt_rq->idle);
2617 } 2681 }
2618 2682
2619 for_each_cpu_mask(i, sibling_map) 2683 for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2624 */ 2688 */
2625} 2689}
2626 2690
2691/*
 2692 * number of 'lost' timeslices this task won't be able to fully
2693 * utilize, if another task runs on a sibling. This models the
2694 * slowdown effect of other tasks running on siblings:
2695 */
2696static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2697{
2698 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2699}
2700
2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2701static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2628{ 2702{
2629 struct sched_domain *tmp, *sd = NULL; 2703 struct sched_domain *tmp, *sd = NULL;
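
[Editor's sketch] smt_slice() models how much of a timeslice a task effectively keeps when an SMT sibling is busy: p->time_slice scaled by (100 - per_cpu_gain) / 100. Worked in C with example numbers (the per_cpu_gain of 25 and the 100-tick slice are illustrative, not taken from a specific sched_domain setup):

        #include <stdio.h>

        /* Mirrors smt_slice(): the share of a timeslice that survives when a
         * sibling thread eats per_cpu_gain percent of the core. */
        static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
        {
                return time_slice * (100 - per_cpu_gain) / 100;
        }

        int main(void)
        {
                /* A 100-tick slice with per_cpu_gain = 25 -> 75 effective ticks,
                 * which dependent_sleeper() compares against the other task's
                 * full timeslice. */
                printf("%lu\n", smt_slice(100, 25));
                return 0;
        }
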
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2667 runqueue_t *smt_rq = cpu_rq(i); 2741 runqueue_t *smt_rq = cpu_rq(i);
2668 task_t *smt_curr = smt_rq->curr; 2742 task_t *smt_curr = smt_rq->curr;
2669 2743
2744 /* Kernel threads do not participate in dependent sleeping */
2745 if (!p->mm || !smt_curr->mm || rt_task(p))
2746 goto check_smt_task;
2747
2670 /* 2748 /*
2671 * If a user task with lower static priority than the 2749 * If a user task with lower static priority than the
2672 * running task on the SMT sibling is trying to schedule, 2750 * running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2675 * task from using an unfair proportion of the 2753 * task from using an unfair proportion of the
2676 * physical cpu's resources. -ck 2754 * physical cpu's resources. -ck
2677 */ 2755 */
2678 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > 2756 if (rt_task(smt_curr)) {
2679 task_timeslice(p) || rt_task(smt_curr)) && 2757 /*
2680 p->mm && smt_curr->mm && !rt_task(p)) 2758 * With real time tasks we run non-rt tasks only
2681 ret = 1; 2759 * per_cpu_gain% of the time.
2760 */
2761 if ((jiffies % DEF_TIMESLICE) >
2762 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2763 ret = 1;
2764 } else
2765 if (smt_curr->static_prio < p->static_prio &&
2766 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2767 smt_slice(smt_curr, sd) > task_timeslice(p))
2768 ret = 1;
2769
2770check_smt_task:
2771 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2772 rt_task(smt_curr))
2773 continue;
2774 if (!p->mm) {
2775 wakeup_busy_runqueue(smt_rq);
2776 continue;
2777 }
2682 2778
2683 /* 2779 /*
2684 * Reschedule a lower priority task on the SMT sibling, 2780 * Reschedule a lower priority task on the SMT sibling for
2685 * or wake it up if it has been put to sleep for priority 2781 * it to be put to sleep, or wake it up if it has been put to
2686 * reasons. 2782 * sleep for priority reasons to see if it should run now.
2687 */ 2783 */
2688 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > 2784 if (rt_task(p)) {
2689 task_timeslice(smt_curr) || rt_task(p)) && 2785 if ((jiffies % DEF_TIMESLICE) >
2690 smt_curr->mm && p->mm && !rt_task(smt_curr)) || 2786 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2691 (smt_curr == smt_rq->idle && smt_rq->nr_running)) 2787 resched_task(smt_curr);
2692 resched_task(smt_curr); 2788 } else {
2789 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2790 smt_slice(p, sd) > task_timeslice(smt_curr))
2791 resched_task(smt_curr);
2792 else
2793 wakeup_busy_runqueue(smt_rq);
2794 }
2693 } 2795 }
2694out_unlock: 2796out_unlock:
2695 for_each_cpu_mask(i, sibling_map) 2797 for_each_cpu_mask(i, sibling_map)
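
[Editor's sketch] When a real-time task runs on one sibling, the rewritten dependent_sleeper() lets the other sibling run normal tasks only per_cpu_gain percent of the time, using (jiffies % DEF_TIMESLICE) as a cheap rotating window. A model of that gate (DEF_TIMESLICE of 100 and per_cpu_gain of 25 are illustrative values):

        #include <stdio.h>

        #define DEF_TIMESLICE 100

        /* Returns 1 when the non-RT sibling should stay off the CPU because
         * the current tick falls outside the per_cpu_gain window. */
        static int must_yield(unsigned long jiffies, int per_cpu_gain)
        {
                return (jiffies % DEF_TIMESLICE) >
                       (per_cpu_gain * DEF_TIMESLICE / 100);
        }

        int main(void)
        {
                int yield_ticks = 0;

                for (unsigned long j = 0; j < DEF_TIMESLICE; j++)
                        yield_ticks += must_yield(j, 25);

                /* roughly three quarters of every 100 ticks are denied to the
                 * non-RT task while the RT sibling is running */
                printf("%d/%d ticks yielded\n", yield_ticks, DEF_TIMESLICE);
                return 0;
        }
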
@@ -2887,6 +2989,7 @@ switch_tasks:
2887 if (next == rq->idle) 2989 if (next == rq->idle)
2888 schedstat_inc(rq, sched_goidle); 2990 schedstat_inc(rq, sched_goidle);
2889 prefetch(next); 2991 prefetch(next);
2992 prefetch_stack(next);
2890 clear_tsk_need_resched(prev); 2993 clear_tsk_need_resched(prev);
2891 rcu_qsctr_inc(task_cpu(prev)); 2994 rcu_qsctr_inc(task_cpu(prev));
2892 2995
@@ -3014,7 +3117,8 @@ need_resched:
3014 3117
3015#endif /* CONFIG_PREEMPT */ 3118#endif /* CONFIG_PREEMPT */
3016 3119
3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3120int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3121 void *key)
3018{ 3122{
3019 task_t *p = curr->private; 3123 task_t *p = curr->private;
3020 return try_to_wake_up(p, mode, sync); 3124 return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3056 * @key: is directly passed to the wakeup function 3160 * @key: is directly passed to the wakeup function
3057 */ 3161 */
3058void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3162void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3059 int nr_exclusive, void *key) 3163 int nr_exclusive, void *key)
3060{ 3164{
3061 unsigned long flags; 3165 unsigned long flags;
3062 3166
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3088 * 3192 *
3089 * On UP it can prevent extra preemption. 3193 * On UP it can prevent extra preemption.
3090 */ 3194 */
3091void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3195void fastcall
3196__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3092{ 3197{
3093 unsigned long flags; 3198 unsigned long flags;
3094 int sync = 1; 3199 int sync = 1;
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3279 3384
3280EXPORT_SYMBOL(interruptible_sleep_on); 3385EXPORT_SYMBOL(interruptible_sleep_on);
3281 3386
3282long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3387long fastcall __sched
3388interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3283{ 3389{
3284 SLEEP_ON_VAR 3390 SLEEP_ON_VAR
3285 3391
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3498 * @policy: new policy. 3604 * @policy: new policy.
3499 * @param: structure containing the new RT priority. 3605 * @param: structure containing the new RT priority.
3500 */ 3606 */
3501int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) 3607int sched_setscheduler(struct task_struct *p, int policy,
3608 struct sched_param *param)
3502{ 3609{
3503 int retval; 3610 int retval;
3504 int oldprio, oldpolicy = -1; 3611 int oldprio, oldpolicy = -1;
@@ -3518,7 +3625,7 @@ recheck:
3518 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3625 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3519 */ 3626 */
3520 if (param->sched_priority < 0 || 3627 if (param->sched_priority < 0 ||
3521 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3628 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3522 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3629 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3523 return -EINVAL; 3630 return -EINVAL;
3524 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3631 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3688,8 @@ recheck:
3581} 3688}
3582EXPORT_SYMBOL_GPL(sched_setscheduler); 3689EXPORT_SYMBOL_GPL(sched_setscheduler);
3583 3690
3584static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3691static int
3692do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3585{ 3693{
3586 int retval; 3694 int retval;
3587 struct sched_param lparam; 3695 struct sched_param lparam;
@@ -3771,6 +3879,7 @@ EXPORT_SYMBOL(cpu_present_map);
3771 3879
3772#ifndef CONFIG_SMP 3880#ifndef CONFIG_SMP
3773cpumask_t cpu_online_map = CPU_MASK_ALL; 3881cpumask_t cpu_online_map = CPU_MASK_ALL;
3882EXPORT_SYMBOL_GPL(cpu_online_map);
3774cpumask_t cpu_possible_map = CPU_MASK_ALL; 3883cpumask_t cpu_possible_map = CPU_MASK_ALL;
3775#endif 3884#endif
3776 3885
@@ -3848,7 +3957,7 @@ asmlinkage long sys_sched_yield(void)
3848 if (rt_task(current)) 3957 if (rt_task(current))
3849 target = rq->active; 3958 target = rq->active;
3850 3959
3851 if (current->array->nr_active == 1) { 3960 if (array->nr_active == 1) {
3852 schedstat_inc(rq, yld_act_empty); 3961 schedstat_inc(rq, yld_act_empty);
3853 if (!rq->expired->nr_active) 3962 if (!rq->expired->nr_active)
3854 schedstat_inc(rq, yld_both_empty); 3963 schedstat_inc(rq, yld_both_empty);
@@ -3912,7 +4021,7 @@ EXPORT_SYMBOL(cond_resched);
3912 * operations here to prevent schedule() from being called twice (once via 4021 * operations here to prevent schedule() from being called twice (once via
3913 * spin_unlock(), once by hand). 4022 * spin_unlock(), once by hand).
3914 */ 4023 */
3915int cond_resched_lock(spinlock_t * lock) 4024int cond_resched_lock(spinlock_t *lock)
3916{ 4025{
3917 int ret = 0; 4026 int ret = 0;
3918 4027
@@ -4095,7 +4204,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
4095 return list_entry(p->sibling.next,struct task_struct,sibling); 4204 return list_entry(p->sibling.next,struct task_struct,sibling);
4096} 4205}
4097 4206
4098static void show_task(task_t * p) 4207static void show_task(task_t *p)
4099{ 4208{
4100 task_t *relative; 4209 task_t *relative;
4101 unsigned state; 4210 unsigned state;
@@ -4121,7 +4230,7 @@ static void show_task(task_t * p)
4121#endif 4230#endif
4122#ifdef CONFIG_DEBUG_STACK_USAGE 4231#ifdef CONFIG_DEBUG_STACK_USAGE
4123 { 4232 {
4124 unsigned long * n = (unsigned long *) (p->thread_info+1); 4233 unsigned long *n = (unsigned long *) (p->thread_info+1);
4125 while (!*n) 4234 while (!*n)
4126 n++; 4235 n++;
4127 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 4236 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4439,7 @@ out:
4330 * thread migration by bumping thread off CPU then 'pushing' onto 4439 * thread migration by bumping thread off CPU then 'pushing' onto
4331 * another runqueue. 4440 * another runqueue.
4332 */ 4441 */
4333static int migration_thread(void * data) 4442static int migration_thread(void *data)
4334{ 4443{
4335 runqueue_t *rq; 4444 runqueue_t *rq;
4336 int cpu = (long)data; 4445 int cpu = (long)data;
@@ -4779,7 +4888,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4888 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4780 * hold the hotplug lock. 4889 * hold the hotplug lock.
4781 */ 4890 */
4782void cpu_attach_domain(struct sched_domain *sd, int cpu) 4891static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4783{ 4892{
4784 runqueue_t *rq = cpu_rq(cpu); 4893 runqueue_t *rq = cpu_rq(cpu);
4785 struct sched_domain *tmp; 4894 struct sched_domain *tmp;
@@ -4802,7 +4911,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4802} 4911}
4803 4912
4804/* cpus with isolated domains */ 4913/* cpus with isolated domains */
4805cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4914static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4806 4915
4807/* Setup the mask of cpus configured for isolated domains */ 4916/* Setup the mask of cpus configured for isolated domains */
4808static int __init isolated_cpu_setup(char *str) 4917static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4939,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4830 * covered by the given span, and will set each group's ->cpumask correctly, 4939 * covered by the given span, and will set each group's ->cpumask correctly,
4831 * and ->cpu_power to 0. 4940 * and ->cpu_power to 0.
4832 */ 4941 */
4833void init_sched_build_groups(struct sched_group groups[], 4942static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4834 cpumask_t span, int (*group_fn)(int cpu)) 4943 int (*group_fn)(int cpu))
4835{ 4944{
4836 struct sched_group *first = NULL, *last = NULL; 4945 struct sched_group *first = NULL, *last = NULL;
4837 cpumask_t covered = CPU_MASK_NONE; 4946 cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4973,85 @@ void init_sched_build_groups(struct sched_group groups[],
4864 last->next = first; 4973 last->next = first;
4865} 4974}
4866 4975
4976#define SD_NODES_PER_DOMAIN 16
4867 4977
4868#ifdef ARCH_HAS_SCHED_DOMAIN 4978#ifdef CONFIG_NUMA
4869extern void build_sched_domains(const cpumask_t *cpu_map); 4979/**
4870extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4980 * find_next_best_node - find the next node to include in a sched_domain
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4981 * @node: node whose sched_domain we're building
4872#else 4982 * @used_nodes: nodes already in the sched_domain
4983 *
4984 * Find the next node to include in a given scheduling domain. Simply
4985 * finds the closest node not already in the @used_nodes map.
4986 *
4987 * Should use nodemask_t.
4988 */
4989static int find_next_best_node(int node, unsigned long *used_nodes)
4990{
4991 int i, n, val, min_val, best_node = 0;
4992
4993 min_val = INT_MAX;
4994
4995 for (i = 0; i < MAX_NUMNODES; i++) {
4996 /* Start at @node */
4997 n = (node + i) % MAX_NUMNODES;
4998
4999 if (!nr_cpus_node(n))
5000 continue;
5001
5002 /* Skip already used nodes */
5003 if (test_bit(n, used_nodes))
5004 continue;
5005
5006 /* Simple min distance search */
5007 val = node_distance(node, n);
5008
5009 if (val < min_val) {
5010 min_val = val;
5011 best_node = n;
5012 }
5013 }
5014
5015 set_bit(best_node, used_nodes);
5016 return best_node;
5017}
5018
5019/**
5020 * sched_domain_node_span - get a cpumask for a node's sched_domain
5021 * @node: node whose cpumask we're constructing
5022 * @size: number of nodes to include in this span
5023 *
5024 * Given a node, construct a good cpumask for its sched_domain to span. It
5025 * should be one that prevents unnecessary balancing, but also spreads tasks
5026 * out optimally.
5027 */
5028static cpumask_t sched_domain_node_span(int node)
5029{
5030 int i;
5031 cpumask_t span, nodemask;
5032 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5033
5034 cpus_clear(span);
5035 bitmap_zero(used_nodes, MAX_NUMNODES);
5036
5037 nodemask = node_to_cpumask(node);
5038 cpus_or(span, span, nodemask);
5039 set_bit(node, used_nodes);
5040
5041 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5042 int next_node = find_next_best_node(node, used_nodes);
5043 nodemask = node_to_cpumask(next_node);
5044 cpus_or(span, span, nodemask);
5045 }
5046
5047 return span;
5048}
5049#endif
5050
5051/*
5052 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5053 * can switch it on easily if needed.
5054 */
4873#ifdef CONFIG_SCHED_SMT 5055#ifdef CONFIG_SCHED_SMT
4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5056static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4875static struct sched_group sched_group_cpus[NR_CPUS]; 5057static struct sched_group sched_group_cpus[NR_CPUS];
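The find_next_best_node()/sched_domain_node_span() helpers added above describe a simple greedy construction: starting from a node, repeatedly add the closest node not yet in the span, up to SD_NODES_PER_DOMAIN nodes. As a rough illustration only (not part of the patch), here is a user-space sketch of the same idea, where MAX_NODES, SPAN_NODES and the distance[][] table are made-up stand-ins for MAX_NUMNODES, SD_NODES_PER_DOMAIN and node_distance():

/*
 * Illustrative sketch only -- a user-space analogue of the greedy
 * node-span construction above.  MAX_NODES, SPAN_NODES and the
 * distance[][] table are made-up stand-ins for MAX_NUMNODES,
 * SD_NODES_PER_DOMAIN and node_distance().
 */
#include <limits.h>
#include <stdio.h>

#define MAX_NODES  4
#define SPAN_NODES 3

static const int distance[MAX_NODES][MAX_NODES] = {
    { 10, 20, 30, 40 },
    { 20, 10, 20, 30 },
    { 30, 20, 10, 20 },
    { 40, 30, 20, 10 },
};

/* Pick the closest node to @node that is not yet marked in @used. */
static int next_best_node(int node, int *used)
{
    int i, best = -1, min_val = INT_MAX;

    for (i = 0; i < MAX_NODES; i++) {
        int n = (node + i) % MAX_NODES;    /* start the scan at @node */

        if (used[n])
            continue;
        if (distance[node][n] < min_val) {
            min_val = distance[node][n];
            best = n;
        }
    }
    if (best >= 0)
        used[best] = 1;
    return best;
}

int main(void)
{
    int used[MAX_NODES] = { 0 };
    int i, node = 1;

    used[node] = 1;
    printf("span for node %d: %d", node, node);
    for (i = 1; i < SPAN_NODES; i++)
        printf(" %d", next_best_node(node, used));
    printf("\n");    /* prints: span for node 1: 1 2 0 */
    return 0;
}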
@@ -4891,36 +5073,20 @@ static int cpu_to_phys_group(int cpu)
4891} 5073}
4892 5074
4893#ifdef CONFIG_NUMA 5075#ifdef CONFIG_NUMA
4894
4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4897static int cpu_to_node_group(int cpu)
4898{
4899 return cpu_to_node(cpu);
4900}
4901#endif
4902
4903#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4904/* 5076/*
4905 * The domains setup code relies on siblings not spanning 5077 * The init_sched_build_groups can't handle what we want to do with node
4906 * multiple nodes. Make sure the architecture has a proper 5078 * groups, so roll our own. Now each node has its own list of groups which
4907 * siblings map: 5079 * gets dynamically allocated.
4908 */ 5080 */
4909static void check_sibling_maps(void) 5081static DEFINE_PER_CPU(struct sched_domain, node_domains);
4910{ 5082static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
4911 int i, j;
4912 5083
4913 for_each_online_cpu(i) { 5084static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4914 for_each_cpu_mask(j, cpu_sibling_map[i]) { 5085static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
4915 if (cpu_to_node(i) != cpu_to_node(j)) { 5086
4916 printk(KERN_INFO "warning: CPU %d siblings map " 5087static int cpu_to_allnodes_group(int cpu)
4917 "to different node - isolating " 5088{
4918 "them.\n", i); 5089 return cpu_to_node(cpu);
4919 cpu_sibling_map[i] = cpumask_of_cpu(i);
4920 break;
4921 }
4922 }
4923 }
4924} 5090}
4925#endif 5091#endif
4926 5092
@@ -4928,9 +5094,24 @@ static void check_sibling_maps(void)
4928 * Build sched domains for a given set of cpus and attach the sched domains 5094 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus 5095 * to the individual cpus
4930 */ 5096 */
4931static void build_sched_domains(const cpumask_t *cpu_map) 5097void build_sched_domains(const cpumask_t *cpu_map)
4932{ 5098{
4933 int i; 5099 int i;
5100#ifdef CONFIG_NUMA
5101 struct sched_group **sched_group_nodes = NULL;
5102 struct sched_group *sched_group_allnodes = NULL;
5103
5104 /*
5105 * Allocate the per-node list of sched groups
5106 */
5107 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5108 GFP_ATOMIC);
5109 if (!sched_group_nodes) {
5110 printk(KERN_WARNING "Can not alloc sched group node list\n");
5111 return;
5112 }
5113 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5114#endif
4934 5115
4935 /* 5116 /*
4936 * Set up domains for cpus specified by the cpu_map. 5117 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5124,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4943 cpus_and(nodemask, nodemask, *cpu_map); 5124 cpus_and(nodemask, nodemask, *cpu_map);
4944 5125
4945#ifdef CONFIG_NUMA 5126#ifdef CONFIG_NUMA
5127 if (cpus_weight(*cpu_map)
5128 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5129 if (!sched_group_allnodes) {
5130 sched_group_allnodes
5131 = kmalloc(sizeof(struct sched_group)
5132 * MAX_NUMNODES,
5133 GFP_KERNEL);
5134 if (!sched_group_allnodes) {
5135 printk(KERN_WARNING
5136 "Can not alloc allnodes sched group\n");
5137 break;
5138 }
5139 sched_group_allnodes_bycpu[i]
5140 = sched_group_allnodes;
5141 }
5142 sd = &per_cpu(allnodes_domains, i);
5143 *sd = SD_ALLNODES_INIT;
5144 sd->span = *cpu_map;
5145 group = cpu_to_allnodes_group(i);
5146 sd->groups = &sched_group_allnodes[group];
5147 p = sd;
5148 } else
5149 p = NULL;
5150
4946 sd = &per_cpu(node_domains, i); 5151 sd = &per_cpu(node_domains, i);
4947 group = cpu_to_node_group(i);
4948 *sd = SD_NODE_INIT; 5152 *sd = SD_NODE_INIT;
4949 sd->span = *cpu_map; 5153 sd->span = sched_domain_node_span(cpu_to_node(i));
4950 sd->groups = &sched_group_nodes[group]; 5154 sd->parent = p;
5155 cpus_and(sd->span, sd->span, *cpu_map);
4951#endif 5156#endif
4952 5157
4953 p = sd; 5158 p = sd;
@@ -4972,7 +5177,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4972 5177
4973#ifdef CONFIG_SCHED_SMT 5178#ifdef CONFIG_SCHED_SMT
4974 /* Set up CPU (sibling) groups */ 5179 /* Set up CPU (sibling) groups */
4975 for_each_online_cpu(i) { 5180 for_each_cpu_mask(i, *cpu_map) {
4976 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5181 cpumask_t this_sibling_map = cpu_sibling_map[i];
4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5182 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4978 if (i != first_cpu(this_sibling_map)) 5183 if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5202,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4997 5202
4998#ifdef CONFIG_NUMA 5203#ifdef CONFIG_NUMA
4999 /* Set up node groups */ 5204 /* Set up node groups */
5000 init_sched_build_groups(sched_group_nodes, *cpu_map, 5205 if (sched_group_allnodes)
5001 &cpu_to_node_group); 5206 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5207 &cpu_to_allnodes_group);
5208
5209 for (i = 0; i < MAX_NUMNODES; i++) {
5210 /* Set up node groups */
5211 struct sched_group *sg, *prev;
5212 cpumask_t nodemask = node_to_cpumask(i);
5213 cpumask_t domainspan;
5214 cpumask_t covered = CPU_MASK_NONE;
5215 int j;
5216
5217 cpus_and(nodemask, nodemask, *cpu_map);
5218 if (cpus_empty(nodemask)) {
5219 sched_group_nodes[i] = NULL;
5220 continue;
5221 }
5222
5223 domainspan = sched_domain_node_span(i);
5224 cpus_and(domainspan, domainspan, *cpu_map);
5225
5226 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5227 sched_group_nodes[i] = sg;
5228 for_each_cpu_mask(j, nodemask) {
5229 struct sched_domain *sd;
5230 sd = &per_cpu(node_domains, j);
5231 sd->groups = sg;
5232 if (sd->groups == NULL) {
5233 /* Turn off balancing if we have no groups */
5234 sd->flags = 0;
5235 }
5236 }
5237 if (!sg) {
5238 printk(KERN_WARNING
5239 "Can not alloc domain group for node %d\n", i);
5240 continue;
5241 }
5242 sg->cpu_power = 0;
5243 sg->cpumask = nodemask;
5244 cpus_or(covered, covered, nodemask);
5245 prev = sg;
5246
5247 for (j = 0; j < MAX_NUMNODES; j++) {
5248 cpumask_t tmp, notcovered;
5249 int n = (i + j) % MAX_NUMNODES;
5250
5251 cpus_complement(notcovered, covered);
5252 cpus_and(tmp, notcovered, *cpu_map);
5253 cpus_and(tmp, tmp, domainspan);
5254 if (cpus_empty(tmp))
5255 break;
5256
5257 nodemask = node_to_cpumask(n);
5258 cpus_and(tmp, tmp, nodemask);
5259 if (cpus_empty(tmp))
5260 continue;
5261
5262 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5263 if (!sg) {
5264 printk(KERN_WARNING
5265 "Can not alloc domain group for node %d\n", j);
5266 break;
5267 }
5268 sg->cpu_power = 0;
5269 sg->cpumask = tmp;
5270 cpus_or(covered, covered, tmp);
5271 prev->next = sg;
5272 prev = sg;
5273 }
5274 prev->next = sched_group_nodes[i];
5275 }
5002#endif 5276#endif
5003 5277
5004 /* Calculate CPU power for physical packages and nodes */ 5278 /* Calculate CPU power for physical packages and nodes */
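The comment above notes that init_sched_build_groups() cannot express what is wanted for node groups, so the patch builds each node's groups as a dynamically allocated circular singly linked list (elements are appended via prev->next and the last element is pointed back at the head), which the next_sg loops later walk and, in arch_destroy_sched_domains(), free. A small user-space sketch of that list shape, with a hypothetical struct group standing in for struct sched_group:

/*
 * Illustrative sketch only -- the per-node groups above form a circular,
 * singly linked list: new elements are appended via prev->next and the
 * last element is pointed back at the head.  struct group is a made-up
 * stand-in for struct sched_group.
 */
#include <stdio.h>
#include <stdlib.h>

struct group {
    int id;
    struct group *next;
};

/* Build a ring of @n groups and return its head (NULL if nothing built). */
static struct group *build_ring(int n)
{
    struct group *head = NULL, *prev = NULL;
    int i;

    for (i = 0; i < n; i++) {
        struct group *g = malloc(sizeof(*g));

        if (!g)
            break;
        g->id = i;
        g->next = NULL;
        if (!head)
            head = g;
        else
            prev->next = g;
        prev = g;
    }
    if (prev)
        prev->next = head;    /* close the ring */
    return head;
}

/* Free every element, stopping once the walk returns to the head. */
static void free_ring(struct group *head)
{
    struct group *sg = head->next;

    while (sg != head) {
        struct group *old = sg;

        sg = sg->next;
        free(old);
    }
    free(head);
}

int main(void)
{
    struct group *head = build_ring(3), *sg = head;

    do {
        printf("group %d\n", sg->id);
        sg = sg->next;
    } while (sg != head);

    free_ring(head);
    return 0;
}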
@@ -5017,14 +5291,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5017 sd->groups->cpu_power = power; 5291 sd->groups->cpu_power = power;
5018 5292
5019#ifdef CONFIG_NUMA 5293#ifdef CONFIG_NUMA
5020 if (i == first_cpu(sd->groups->cpumask)) { 5294 sd = &per_cpu(allnodes_domains, i);
5021 /* Only add "power" once for each physical package. */ 5295 if (sd->groups) {
5022 sd = &per_cpu(node_domains, i); 5296 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5023 sd->groups->cpu_power += power; 5297 (cpus_weight(sd->groups->cpumask)-1) / 10;
5298 sd->groups->cpu_power = power;
5024 } 5299 }
5025#endif 5300#endif
5026 } 5301 }
5027 5302
5303#ifdef CONFIG_NUMA
5304 for (i = 0; i < MAX_NUMNODES; i++) {
5305 struct sched_group *sg = sched_group_nodes[i];
5306 int j;
5307
5308 if (sg == NULL)
5309 continue;
5310next_sg:
5311 for_each_cpu_mask(j, sg->cpumask) {
5312 struct sched_domain *sd;
5313 int power;
5314
5315 sd = &per_cpu(phys_domains, j);
5316 if (j != first_cpu(sd->groups->cpumask)) {
5317 /*
5318 * Only add "power" once for each
5319 * physical package.
5320 */
5321 continue;
5322 }
5323 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5324 (cpus_weight(sd->groups->cpumask)-1) / 10;
5325
5326 sg->cpu_power += power;
5327 }
5328 sg = sg->next;
5329 if (sg != sched_group_nodes[i])
5330 goto next_sg;
5331 }
5332#endif
5333
5028 /* Attach the domains */ 5334 /* Attach the domains */
5029 for_each_cpu_mask(i, *cpu_map) { 5335 for_each_cpu_mask(i, *cpu_map) {
5030 struct sched_domain *sd; 5336 struct sched_domain *sd;
@@ -5039,13 +5345,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5039/* 5345/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5346 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */ 5347 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map) 5348static void arch_init_sched_domains(const cpumask_t *cpu_map)
5043{ 5349{
5044 cpumask_t cpu_default_map; 5350 cpumask_t cpu_default_map;
5045 5351
5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5047 check_sibling_maps();
5048#endif
5049 /* 5352 /*
5050 * Setup mask for cpus without special case scheduling requirements. 5353 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to 5354 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5361,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5058 5361
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5362static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5060{ 5363{
5061 /* Do nothing: everything is statically allocated. */ 5364#ifdef CONFIG_NUMA
5062} 5365 int i;
5366 int cpu;
5063 5367
5064#endif /* ARCH_HAS_SCHED_DOMAIN */ 5368 for_each_cpu_mask(cpu, *cpu_map) {
5369 struct sched_group *sched_group_allnodes
5370 = sched_group_allnodes_bycpu[cpu];
5371 struct sched_group **sched_group_nodes
5372 = sched_group_nodes_bycpu[cpu];
5373
5374 if (sched_group_allnodes) {
5375 kfree(sched_group_allnodes);
5376 sched_group_allnodes_bycpu[cpu] = NULL;
5377 }
5378
5379 if (!sched_group_nodes)
5380 continue;
5381
5382 for (i = 0; i < MAX_NUMNODES; i++) {
5383 cpumask_t nodemask = node_to_cpumask(i);
5384 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5385
5386 cpus_and(nodemask, nodemask, *cpu_map);
5387 if (cpus_empty(nodemask))
5388 continue;
5389
5390 if (sg == NULL)
5391 continue;
5392 sg = sg->next;
5393next_sg:
5394 oldsg = sg;
5395 sg = sg->next;
5396 kfree(oldsg);
5397 if (oldsg != sched_group_nodes[i])
5398 goto next_sg;
5399 }
5400 kfree(sched_group_nodes);
5401 sched_group_nodes_bycpu[cpu] = NULL;
5402 }
5403#endif
5404}
5065 5405
5066/* 5406/*
5067 * Detach sched domains from a group of cpus specified in cpu_map 5407 * Detach sched domains from a group of cpus specified in cpu_map
@@ -5263,3 +5603,47 @@ void normalize_rt_tasks(void)
5263} 5603}
5264 5604
5265#endif /* CONFIG_MAGIC_SYSRQ */ 5605#endif /* CONFIG_MAGIC_SYSRQ */
5606
5607#ifdef CONFIG_IA64
5608/*
5609 * These functions are only useful for the IA64 MCA handling.
5610 *
5611 * They can only be called when the whole system has been
5612 * stopped - every CPU needs to be quiescent, and no scheduling
5613 * activity can take place. Using them for anything else would
5614 * be a serious bug, and as a result, they aren't even visible
5615 * under any other configuration.
5616 */
5617
5618/**
5619 * curr_task - return the current task for a given cpu.
5620 * @cpu: the processor in question.
5621 *
5622 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5623 */
5624task_t *curr_task(int cpu)
5625{
5626 return cpu_curr(cpu);
5627}
5628
5629/**
5630 * set_curr_task - set the current task for a given cpu.
5631 * @cpu: the processor in question.
5632 * @p: the task pointer to set.
5633 *
5634 * Description: This function must only be used when non-maskable interrupts
5635 * are serviced on a separate stack. It allows the architecture to switch the
5636 * notion of the current task on a cpu in a non-blocking manner. This function
 5637 * must be called with all CPUs synchronized, and interrupts disabled, and
 5638 * the caller must save the original value of the current task (see
5639 * curr_task() above) and restore that value before reenabling interrupts and
5640 * re-starting the system.
5641 *
5642 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5643 */
5644void set_curr_task(int cpu, task_t *p)
5645{
5646 cpu_curr(cpu) = p;
5647}
5648
5649#endif
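The kernel-doc above spells out the protocol for these IA64-only helpers: with the whole system stopped, save the result of curr_task(), install the replacement with set_curr_task(), and restore the saved task before interrupts are re-enabled. A user-space mock of that save/switch/restore sequence (task_t, curr_task() and set_curr_task() below are stand-ins, not the kernel implementations):

/*
 * Illustrative sketch only -- a user-space mock of the save/switch/
 * restore protocol documented above.  task_t, curr_task() and
 * set_curr_task() here are stand-ins, not the kernel implementations.
 */
#include <stdio.h>

typedef struct { const char *comm; } task_t;

static task_t idle = { "swapper" }, mca = { "mca-handler" };
static task_t *cpu_curr_task[1] = { &idle };

static task_t *curr_task(int cpu)             { return cpu_curr_task[cpu]; }
static void set_curr_task(int cpu, task_t *p) { cpu_curr_task[cpu] = p; }

int main(void)
{
    int cpu = 0;
    task_t *orig;

    /* The system is assumed stopped and interrupts disabled here. */
    orig = curr_task(cpu);          /* 1. save the original task       */
    set_curr_task(cpu, &mca);       /* 2. switch the notion of current */
    printf("current on cpu%d: %s\n", cpu, curr_task(cpu)->comm);
    set_curr_task(cpu, orig);       /* 3. restore before enabling irqs */
    printf("current on cpu%d: %s\n", cpu, curr_task(cpu)->comm);
    return 0;
}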
diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..f2b96b08fb44 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask)
262 return sig; 262 return sig;
263} 263}
264 264
265static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, 265static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
266 int override_rlimit) 266 int override_rlimit)
267{ 267{
268 struct sigqueue *q = NULL; 268 struct sigqueue *q = NULL;
@@ -397,20 +397,8 @@ void __exit_signal(struct task_struct *tsk)
397 flush_sigqueue(&tsk->pending); 397 flush_sigqueue(&tsk->pending);
398 if (sig) { 398 if (sig) {
399 /* 399 /*
400 * We are cleaning up the signal_struct here. We delayed 400 * We are cleaning up the signal_struct here.
401 * calling exit_itimers until after flush_sigqueue, just in
402 * case our thread-local pending queue contained a queued
403 * timer signal that would have been cleared in
404 * exit_itimers. When that called sigqueue_free, it would
405 * attempt to re-take the tasklist_lock and deadlock. This
406 * can never happen if we ensure that all queues the
407 * timer's signal might be queued on have been flushed
408 * first. The shared_pending queue, and our own pending
409 * queue are the only queues the timer could be on, since
410 * there are no other threads left in the group and timer
411 * signals are constrained to threads inside the group.
412 */ 401 */
413 exit_itimers(sig);
414 exit_thread_group_keys(sig); 402 exit_thread_group_keys(sig);
415 kmem_cache_free(signal_cachep, sig); 403 kmem_cache_free(signal_cachep, sig);
416 } 404 }
@@ -578,7 +566,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
578 * is to alert stop-signal processing code when another 566 * is to alert stop-signal processing code when another
579 * processor has come along and cleared the flag. 567 * processor has come along and cleared the flag.
580 */ 568 */
581 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 569 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
570 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
582 } 571 }
583 if ( signr && 572 if ( signr &&
584 ((info->si_code & __SI_MASK) == __SI_TIMER) && 573 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
@@ -678,7 +667,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
678 667
679/* forward decl */ 668/* forward decl */
680static void do_notify_parent_cldstop(struct task_struct *tsk, 669static void do_notify_parent_cldstop(struct task_struct *tsk,
681 struct task_struct *parent, 670 int to_self,
682 int why); 671 int why);
683 672
684/* 673/*
@@ -729,14 +718,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
729 p->signal->group_stop_count = 0; 718 p->signal->group_stop_count = 0;
730 p->signal->flags = SIGNAL_STOP_CONTINUED; 719 p->signal->flags = SIGNAL_STOP_CONTINUED;
731 spin_unlock(&p->sighand->siglock); 720 spin_unlock(&p->sighand->siglock);
732 if (p->ptrace & PT_PTRACED) 721 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
733 do_notify_parent_cldstop(p, p->parent,
734 CLD_STOPPED);
735 else
736 do_notify_parent_cldstop(
737 p->group_leader,
738 p->group_leader->real_parent,
739 CLD_STOPPED);
740 spin_lock(&p->sighand->siglock); 722 spin_lock(&p->sighand->siglock);
741 } 723 }
742 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 724 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +759,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
777 p->signal->flags = SIGNAL_STOP_CONTINUED; 759 p->signal->flags = SIGNAL_STOP_CONTINUED;
778 p->signal->group_exit_code = 0; 760 p->signal->group_exit_code = 0;
779 spin_unlock(&p->sighand->siglock); 761 spin_unlock(&p->sighand->siglock);
780 if (p->ptrace & PT_PTRACED) 762 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
781 do_notify_parent_cldstop(p, p->parent,
782 CLD_CONTINUED);
783 else
784 do_notify_parent_cldstop(
785 p->group_leader,
786 p->group_leader->real_parent,
787 CLD_CONTINUED);
788 spin_lock(&p->sighand->siglock); 763 spin_lock(&p->sighand->siglock);
789 } else { 764 } else {
790 /* 765 /*
@@ -950,34 +925,31 @@ force_sig_specific(int sig, struct task_struct *t)
950 * as soon as they're available, so putting the signal on the shared queue 925 * as soon as they're available, so putting the signal on the shared queue
951 * will be equivalent to sending it to one such thread. 926 * will be equivalent to sending it to one such thread.
952 */ 927 */
953#define wants_signal(sig, p, mask) \ 928static inline int wants_signal(int sig, struct task_struct *p)
954 (!sigismember(&(p)->blocked, sig) \ 929{
955 && !((p)->state & mask) \ 930 if (sigismember(&p->blocked, sig))
956 && !((p)->flags & PF_EXITING) \ 931 return 0;
957 && (task_curr(p) || !signal_pending(p))) 932 if (p->flags & PF_EXITING)
958 933 return 0;
934 if (sig == SIGKILL)
935 return 1;
936 if (p->state & (TASK_STOPPED | TASK_TRACED))
937 return 0;
938 return task_curr(p) || !signal_pending(p);
939}
959 940
960static void 941static void
961__group_complete_signal(int sig, struct task_struct *p) 942__group_complete_signal(int sig, struct task_struct *p)
962{ 943{
963 unsigned int mask;
964 struct task_struct *t; 944 struct task_struct *t;
965 945
966 /* 946 /*
967 * Don't bother traced and stopped tasks (but
968 * SIGKILL will punch through that).
969 */
970 mask = TASK_STOPPED | TASK_TRACED;
971 if (sig == SIGKILL)
972 mask = 0;
973
974 /*
975 * Now find a thread we can wake up to take the signal off the queue. 947 * Now find a thread we can wake up to take the signal off the queue.
976 * 948 *
977 * If the main thread wants the signal, it gets first crack. 949 * If the main thread wants the signal, it gets first crack.
978 * Probably the least surprising to the average bear. 950 * Probably the least surprising to the average bear.
979 */ 951 */
980 if (wants_signal(sig, p, mask)) 952 if (wants_signal(sig, p))
981 t = p; 953 t = p;
982 else if (thread_group_empty(p)) 954 else if (thread_group_empty(p))
983 /* 955 /*
@@ -995,7 +967,7 @@ __group_complete_signal(int sig, struct task_struct *p)
995 t = p->signal->curr_target = p; 967 t = p->signal->curr_target = p;
996 BUG_ON(t->tgid != p->tgid); 968 BUG_ON(t->tgid != p->tgid);
997 969
998 while (!wants_signal(sig, t, mask)) { 970 while (!wants_signal(sig, t)) {
999 t = next_thread(t); 971 t = next_thread(t);
1000 if (t == p->signal->curr_target) 972 if (t == p->signal->curr_target)
1001 /* 973 /*
@@ -1209,6 +1181,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1209 return error; 1181 return error;
1210} 1182}
1211 1183
1184/* like kill_proc_info(), but doesn't use uid/euid of "current" */
1185int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1186 uid_t uid, uid_t euid)
1187{
1188 int ret = -EINVAL;
1189 struct task_struct *p;
1190
1191 if (!valid_signal(sig))
1192 return ret;
1193
1194 read_lock(&tasklist_lock);
1195 p = find_task_by_pid(pid);
1196 if (!p) {
1197 ret = -ESRCH;
1198 goto out_unlock;
1199 }
1200 if ((!info || ((unsigned long)info != 1 &&
1201 (unsigned long)info != 2 && SI_FROMUSER(info)))
1202 && (euid != p->suid) && (euid != p->uid)
1203 && (uid != p->suid) && (uid != p->uid)) {
1204 ret = -EPERM;
1205 goto out_unlock;
1206 }
1207 if (sig && p->sighand) {
1208 unsigned long flags;
1209 spin_lock_irqsave(&p->sighand->siglock, flags);
1210 ret = __group_send_sig_info(sig, info, p);
1211 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1212 }
1213out_unlock:
1214 read_unlock(&tasklist_lock);
1215 return ret;
1216}
1217EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
1212 1218
1213/* 1219/*
1214 * kill_something_info() interprets pid in interesting ways just like kill(2). 1220 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1380,16 +1386,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1380 unsigned long flags; 1386 unsigned long flags;
1381 int ret = 0; 1387 int ret = 0;
1382 1388
1383 /*
1384 * We need the tasklist lock even for the specific
1385 * thread case (when we don't need to follow the group
1386 * lists) in order to avoid races with "p->sighand"
1387 * going away or changing from under us.
1388 */
1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1390 read_lock(&tasklist_lock); 1390 read_lock(&tasklist_lock);
1391
1392 if (unlikely(p->flags & PF_EXITING)) {
1393 ret = -1;
1394 goto out_err;
1395 }
1396
1391 spin_lock_irqsave(&p->sighand->siglock, flags); 1397 spin_lock_irqsave(&p->sighand->siglock, flags);
1392 1398
1393 if (unlikely(!list_empty(&q->list))) { 1399 if (unlikely(!list_empty(&q->list))) {
1394 /* 1400 /*
 1395 * If an SI_TIMER entry is already queued just increment 1401
@@ -1399,7 +1405,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1399 BUG(); 1405 BUG();
1400 q->info.si_overrun++; 1406 q->info.si_overrun++;
1401 goto out; 1407 goto out;
1402 } 1408 }
1403 /* Short-circuit ignored signals. */ 1409 /* Short-circuit ignored signals. */
1404 if (sig_ignored(p, sig)) { 1410 if (sig_ignored(p, sig)) {
1405 ret = 1; 1411 ret = 1;
@@ -1414,8 +1420,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1414 1420
1415out: 1421out:
1416 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1422 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1423out_err:
1417 read_unlock(&tasklist_lock); 1424 read_unlock(&tasklist_lock);
1418 return(ret); 1425
1426 return ret;
1419} 1427}
1420 1428
1421int 1429int
@@ -1542,14 +1550,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1542 spin_unlock_irqrestore(&psig->siglock, flags); 1550 spin_unlock_irqrestore(&psig->siglock, flags);
1543} 1551}
1544 1552
1545static void 1553static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
1546do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1547 int why)
1548{ 1554{
1549 struct siginfo info; 1555 struct siginfo info;
1550 unsigned long flags; 1556 unsigned long flags;
1557 struct task_struct *parent;
1551 struct sighand_struct *sighand; 1558 struct sighand_struct *sighand;
1552 1559
1560 if (to_self)
1561 parent = tsk->parent;
1562 else {
1563 tsk = tsk->group_leader;
1564 parent = tsk->real_parent;
1565 }
1566
1553 info.si_signo = SIGCHLD; 1567 info.si_signo = SIGCHLD;
1554 info.si_errno = 0; 1568 info.si_errno = 0;
1555 info.si_pid = tsk->pid; 1569 info.si_pid = tsk->pid;
@@ -1618,8 +1632,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1618 !(current->ptrace & PT_ATTACHED)) && 1632 !(current->ptrace & PT_ATTACHED)) &&
1619 (likely(current->parent->signal != current->signal) || 1633 (likely(current->parent->signal != current->signal) ||
1620 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1634 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1621 do_notify_parent_cldstop(current, current->parent, 1635 do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
1622 CLD_TRAPPED);
1623 read_unlock(&tasklist_lock); 1636 read_unlock(&tasklist_lock);
1624 schedule(); 1637 schedule();
1625 } else { 1638 } else {
@@ -1668,25 +1681,25 @@ void ptrace_notify(int exit_code)
1668static void 1681static void
1669finish_stop(int stop_count) 1682finish_stop(int stop_count)
1670{ 1683{
1684 int to_self;
1685
1671 /* 1686 /*
1672 * If there are no other threads in the group, or if there is 1687 * If there are no other threads in the group, or if there is
1673 * a group stop in progress and we are the last to stop, 1688 * a group stop in progress and we are the last to stop,
1674 * report to the parent. When ptraced, every thread reports itself. 1689 * report to the parent. When ptraced, every thread reports itself.
1675 */ 1690 */
1676 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { 1691 if (stop_count < 0 || (current->ptrace & PT_PTRACED))
1677 read_lock(&tasklist_lock); 1692 to_self = 1;
1678 do_notify_parent_cldstop(current, current->parent, 1693 else if (stop_count == 0)
1679 CLD_STOPPED); 1694 to_self = 0;
1680 read_unlock(&tasklist_lock); 1695 else
1681 } 1696 goto out;
1682 else if (stop_count == 0) {
1683 read_lock(&tasklist_lock);
1684 do_notify_parent_cldstop(current->group_leader,
1685 current->group_leader->real_parent,
1686 CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689 1697
1698 read_lock(&tasklist_lock);
1699 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1700 read_unlock(&tasklist_lock);
1701
1702out:
1690 schedule(); 1703 schedule();
1691 /* 1704 /*
1692 * Now we don't run again until continued. 1705 * Now we don't run again until continued.
@@ -1773,7 +1786,8 @@ do_signal_stop(int signr)
1773 * stop is always done with the siglock held, 1786 * stop is always done with the siglock held,
1774 * so this check has no races. 1787 * so this check has no races.
1775 */ 1788 */
1776 if (t->state < TASK_STOPPED) { 1789 if (!t->exit_state &&
1790 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1777 stop_count++; 1791 stop_count++;
1778 signal_wake_up(t, 0); 1792 signal_wake_up(t, 0);
1779 } 1793 }
@@ -2228,8 +2242,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2228 recalc_sigpending(); 2242 recalc_sigpending();
2229 spin_unlock_irq(&current->sighand->siglock); 2243 spin_unlock_irq(&current->sighand->siglock);
2230 2244
2231 current->state = TASK_INTERRUPTIBLE; 2245 timeout = schedule_timeout_interruptible(timeout);
2232 timeout = schedule_timeout(timeout);
2233 2246
2234 try_to_freeze(); 2247 try_to_freeze();
2235 spin_lock_irq(&current->sighand->siglock); 2248 spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..f766b2fc48be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void)
84 cpu = smp_processor_id(); 84 cpu = smp_processor_id();
85restart: 85restart:
86 /* Reset the pending bitmask before enabling irqs */ 86 /* Reset the pending bitmask before enabling irqs */
87 local_softirq_pending() = 0; 87 set_softirq_pending(0);
88 88
89 local_irq_enable(); 89 local_irq_enable();
90 90
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
 6 * this code detects soft lockups: incidents where, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
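The new detector works as a heartbeat: a per-CPU watchdog thread refreshes a timestamp roughly once a second, and the timer interrupt reports a soft lockup when that timestamp falls more than 10 seconds behind. A user-space analogue of the same scheme (the thread, the 10 second threshold and the polling interval below are illustrative, not kernel code):

/*
 * Illustrative sketch only -- a user-space analogue of the heartbeat
 * scheme above: one thread refreshes a timestamp, the main loop flags
 * a "lockup" when the timestamp goes stale.  The 10 second threshold
 * and 100 ms polling interval are arbitrary.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile time_t heartbeat;

static void *watchdog(void *unused)
{
    (void)unused;
    for (;;) {
        heartbeat = time(NULL);    /* like touch_softlockup_watchdog() */
        sleep(1);
    }
    return NULL;
}

/* Like softlockup_tick(): report if the watchdog fell too far behind. */
static void check_tick(void)
{
    if (time(NULL) > heartbeat + 10)
        fprintf(stderr, "BUG: soft lockup detected\n");
}

int main(void)
{
    pthread_t tid;

    heartbeat = time(NULL);
    pthread_create(&tid, NULL, watchdog, NULL);
    for (;;) {
        check_tick();
        usleep(100 * 1000);    /* stand-in for the timer interrupt */
    }
    return 0;
}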
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe17..0375fcd5921d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
3 * 3 *
4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com> 4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
5 * 5 *
6 * Copyright (2004) Ingo Molnar 6 * Copyright (2004, 2005) Ingo Molnar
7 *
8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
7 */ 10 */
8 11
9#include <linux/config.h> 12#include <linux/config.h>
@@ -17,12 +20,12 @@
17 * Generic declaration of the raw read_trylock() function, 20 * Generic declaration of the raw read_trylock() function,
18 * architectures are supposed to optimize this: 21 * architectures are supposed to optimize this:
19 */ 22 */
20int __lockfunc generic_raw_read_trylock(rwlock_t *lock) 23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
21{ 24{
22 _raw_read_lock(lock); 25 __raw_read_lock(lock);
23 return 1; 26 return 1;
24} 27}
25EXPORT_SYMBOL(generic_raw_read_trylock); 28EXPORT_SYMBOL(generic__raw_read_trylock);
26 29
27int __lockfunc _spin_trylock(spinlock_t *lock) 30int __lockfunc _spin_trylock(spinlock_t *lock)
28{ 31{
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
57} 60}
58EXPORT_SYMBOL(_write_trylock); 61EXPORT_SYMBOL(_write_trylock);
59 62
60#ifndef CONFIG_PREEMPT 63#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
61 64
62void __lockfunc _read_lock(rwlock_t *lock) 65void __lockfunc _read_lock(rwlock_t *lock)
63{ 66{
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
72 75
73 local_irq_save(flags); 76 local_irq_save(flags);
74 preempt_disable(); 77 preempt_disable();
75 _raw_spin_lock_flags(lock, flags); 78 _raw_spin_lock_flags(lock, &flags);
76 return flags; 79 return flags;
77} 80}
78EXPORT_SYMBOL(_spin_lock_irqsave); 81EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..2fa1ed18123c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,17 +361,35 @@ out_unlock:
361 return retval; 361 return retval;
362} 362}
363 363
364/**
365 * emergency_restart - reboot the system
366 *
 367 * Without shutting down any hardware or taking any locks,
 368 * reboot the system. This is called when we know we are in
 369 * trouble, so this is our best effort to reboot. This is
370 * safe to call in interrupt context.
371 */
364void emergency_restart(void) 372void emergency_restart(void)
365{ 373{
366 machine_emergency_restart(); 374 machine_emergency_restart();
367} 375}
368EXPORT_SYMBOL_GPL(emergency_restart); 376EXPORT_SYMBOL_GPL(emergency_restart);
369 377
370void kernel_restart(char *cmd) 378/**
379 * kernel_restart - reboot the system
380 *
381 * Shutdown everything and perform a clean reboot.
382 * This is not safe to call in interrupt context.
383 */
384void kernel_restart_prepare(char *cmd)
371{ 385{
372 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 386 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
373 system_state = SYSTEM_RESTART; 387 system_state = SYSTEM_RESTART;
374 device_shutdown(); 388 device_shutdown();
389}
390void kernel_restart(char *cmd)
391{
392 kernel_restart_prepare(cmd);
375 if (!cmd) { 393 if (!cmd) {
376 printk(KERN_EMERG "Restarting system.\n"); 394 printk(KERN_EMERG "Restarting system.\n");
377 } else { 395 } else {
@@ -382,6 +400,12 @@ void kernel_restart(char *cmd)
382} 400}
383EXPORT_SYMBOL_GPL(kernel_restart); 401EXPORT_SYMBOL_GPL(kernel_restart);
384 402
403/**
404 * kernel_kexec - reboot the system
405 *
406 * Move into place and start executing a preloaded standalone
407 * executable. If nothing was preloaded return an error.
408 */
385void kernel_kexec(void) 409void kernel_kexec(void)
386{ 410{
387#ifdef CONFIG_KEXEC 411#ifdef CONFIG_KEXEC
@@ -390,9 +414,7 @@ void kernel_kexec(void)
390 if (!image) { 414 if (!image) {
391 return; 415 return;
392 } 416 }
393 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); 417 kernel_restart_prepare(NULL);
394 system_state = SYSTEM_RESTART;
395 device_shutdown();
396 printk(KERN_EMERG "Starting new kernel\n"); 418 printk(KERN_EMERG "Starting new kernel\n");
397 machine_shutdown(); 419 machine_shutdown();
398 machine_kexec(image); 420 machine_kexec(image);
@@ -400,21 +422,39 @@ void kernel_kexec(void)
400} 422}
401EXPORT_SYMBOL_GPL(kernel_kexec); 423EXPORT_SYMBOL_GPL(kernel_kexec);
402 424
403void kernel_halt(void) 425/**
426 * kernel_halt - halt the system
427 *
428 * Shutdown everything and perform a clean system halt.
429 */
430void kernel_halt_prepare(void)
404{ 431{
405 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 432 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
406 system_state = SYSTEM_HALT; 433 system_state = SYSTEM_HALT;
407 device_shutdown(); 434 device_shutdown();
435}
436void kernel_halt(void)
437{
438 kernel_halt_prepare();
408 printk(KERN_EMERG "System halted.\n"); 439 printk(KERN_EMERG "System halted.\n");
409 machine_halt(); 440 machine_halt();
410} 441}
411EXPORT_SYMBOL_GPL(kernel_halt); 442EXPORT_SYMBOL_GPL(kernel_halt);
412 443
413void kernel_power_off(void) 444/**
445 * kernel_power_off - power_off the system
446 *
447 * Shutdown everything and perform a clean system power_off.
448 */
449void kernel_power_off_prepare(void)
414{ 450{
415 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 451 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
416 system_state = SYSTEM_POWER_OFF; 452 system_state = SYSTEM_POWER_OFF;
417 device_shutdown(); 453 device_shutdown();
454}
455void kernel_power_off(void)
456{
457 kernel_power_off_prepare();
418 printk(KERN_EMERG "Power down.\n"); 458 printk(KERN_EMERG "Power down.\n");
419 machine_power_off(); 459 machine_power_off();
420} 460}
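The kernel_restart_prepare()/kernel_halt_prepare()/kernel_power_off_prepare() helpers added above share one shape: run the reboot notifier chain with the right event, update system_state, shut devices down, and only then let the caller perform the machine-level action. A user-space mock of that notifier-then-act pattern (struct notifier, flush_disk_cache() and the event code are invented for illustration):

/*
 * Illustrative sketch only -- a user-space mock of the notifier-then-act
 * pattern used by the kernel_*_prepare() helpers above.  struct notifier,
 * flush_disk_cache() and the event code are invented for illustration.
 */
#include <stdio.h>

#define SYS_RESTART 1

struct notifier {
    int (*call)(int event);
    struct notifier *next;
};

static struct notifier *reboot_chain;

static void notifier_register(struct notifier *n)
{
    n->next = reboot_chain;
    reboot_chain = n;
}

/* Walk every registered callback in order, handing it the event code. */
static void notifier_call_chain(int event)
{
    struct notifier *n;

    for (n = reboot_chain; n; n = n->next)
        n->call(event);
}

static int flush_disk_cache(int event)
{
    printf("notifier: flushing caches for event %d\n", event);
    return 0;
}

static struct notifier disk_notifier = { .call = flush_disk_cache };

/* The "prepare" half: notify listeners and quiesce, but do not reboot. */
static void restart_prepare(void)
{
    notifier_call_chain(SYS_RESTART);
    printf("system_state = RESTART, shutting devices down\n");
}

int main(void)
{
    notifier_register(&disk_notifier);
    restart_prepare();
    printf("machine-level restart would happen here\n");
    return 0;
}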
@@ -1711,7 +1751,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1711 unsigned long arg4, unsigned long arg5) 1751 unsigned long arg4, unsigned long arg5)
1712{ 1752{
1713 long error; 1753 long error;
1714 int sig;
1715 1754
1716 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1755 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1717 if (error) 1756 if (error)
@@ -1719,19 +1758,17 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1719 1758
1720 switch (option) { 1759 switch (option) {
1721 case PR_SET_PDEATHSIG: 1760 case PR_SET_PDEATHSIG:
1722 sig = arg2; 1761 if (!valid_signal(arg2)) {
1723 if (!valid_signal(sig)) {
1724 error = -EINVAL; 1762 error = -EINVAL;
1725 break; 1763 break;
1726 } 1764 }
1727 current->pdeath_signal = sig; 1765 current->pdeath_signal = arg2;
1728 break; 1766 break;
1729 case PR_GET_PDEATHSIG: 1767 case PR_GET_PDEATHSIG:
1730 error = put_user(current->pdeath_signal, (int __user *)arg2); 1768 error = put_user(current->pdeath_signal, (int __user *)arg2);
1731 break; 1769 break;
1732 case PR_GET_DUMPABLE: 1770 case PR_GET_DUMPABLE:
1733 if (current->mm->dumpable) 1771 error = current->mm->dumpable;
1734 error = 1;
1735 break; 1772 break;
1736 case PR_SET_DUMPABLE: 1773 case PR_SET_DUMPABLE:
1737 if (arg2 < 0 || arg2 > 2) { 1774 if (arg2 < 0 || arg2 > 2) {
diff --git a/kernel/time.c b/kernel/time.c
index dd5ae1162a8f..40c2410ac99a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -570,6 +570,7 @@ void getnstimeofday(struct timespec *tv)
570 tv->tv_sec = x.tv_sec; 570 tv->tv_sec = x.tv_sec;
571 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; 571 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
572} 572}
573EXPORT_SYMBOL_GPL(getnstimeofday);
573#endif 574#endif
574 575
575#if (BITS_PER_LONG < 64) 576#if (BITS_PER_LONG < 64)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..3ba10fa35b60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
@@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1150 out: 1151 out:
1151 return timeout < 0 ? 0 : timeout; 1152 return timeout < 0 ? 0 : timeout;
1152} 1153}
1153
1154EXPORT_SYMBOL(schedule_timeout); 1154EXPORT_SYMBOL(schedule_timeout);
1155 1155
1156/*
1157 * We can use __set_current_state() here because schedule_timeout() calls
1158 * schedule() unconditionally.
1159 */
1160signed long __sched schedule_timeout_interruptible(signed long timeout)
1161{
1162 __set_current_state(TASK_INTERRUPTIBLE);
1163 return schedule_timeout(timeout);
1164}
1165EXPORT_SYMBOL(schedule_timeout_interruptible);
1166
1167signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1168{
1169 __set_current_state(TASK_UNINTERRUPTIBLE);
1170 return schedule_timeout(timeout);
1171}
1172EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1173
1156/* Thread ID - the internal kernel "pid" */ 1174/* Thread ID - the internal kernel "pid" */
1157asmlinkage long sys_gettid(void) 1175asmlinkage long sys_gettid(void)
1158{ 1176{
@@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
1169 if (!time_after(expire, now)) 1187 if (!time_after(expire, now))
1170 return 0; 1188 return 0;
1171 1189
1172 current->state = TASK_INTERRUPTIBLE; 1190 expire = schedule_timeout_interruptible(expire - now);
1173 expire = schedule_timeout(expire - now);
1174 1191
1175 ret = 0; 1192 ret = 0;
1176 if (expire) { 1193 if (expire) {
@@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
1198 return -EINVAL; 1215 return -EINVAL;
1199 1216
1200 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 1217 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1201 current->state = TASK_INTERRUPTIBLE; 1218 expire = schedule_timeout_interruptible(expire);
1202 expire = schedule_timeout(expire);
1203 1219
1204 ret = 0; 1220 ret = 0;
1205 if (expire) { 1221 if (expire) {
@@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1428 } 1444 }
1429} 1445}
1430 1446
1431static inline u64 time_interpolator_get_counter(void) 1447static inline u64 time_interpolator_get_counter(int writelock)
1432{ 1448{
1433 unsigned int src = time_interpolator->source; 1449 unsigned int src = time_interpolator->source;
1434 1450
@@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void)
1442 now = time_interpolator_get_cycles(src); 1458 now = time_interpolator_get_cycles(src);
1443 if (lcycle && time_after(lcycle, now)) 1459 if (lcycle && time_after(lcycle, now))
1444 return lcycle; 1460 return lcycle;
1461
1462 /* When holding the xtime write lock, there's no need
1463 * to add the overhead of the cmpxchg. Readers are
 1464 * forced to retry until the write lock is released.
1465 */
1466 if (writelock) {
1467 time_interpolator->last_cycle = now;
1468 return now;
1469 }
1445 /* Keep track of the last timer value returned. The use of cmpxchg here 1470 /* Keep track of the last timer value returned. The use of cmpxchg here
1446 * will cause contention in an SMP environment. 1471 * will cause contention in an SMP environment.
1447 */ 1472 */
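The comments above explain the trade-off: on the lockless read path, cmpxchg keeps last_cycle monotonic at the cost of SMP contention, while the writelock path can simply store the new value because readers retry until the write lock is dropped. A user-space sketch of the monotonic last-value idea, using GCC's __sync_val_compare_and_swap as a stand-in for the kernel's cmpxchg (the counter values in main() are arbitrary):

/*
 * Illustrative sketch only -- the lockless path above uses cmpxchg to keep
 * a monotonic "last cycle" value.  GCC's __sync_val_compare_and_swap is
 * used here as a stand-in for the kernel's cmpxchg; the counter values in
 * main() are arbitrary.
 */
#include <stdio.h>

static unsigned long last_cycle;

static unsigned long read_counter(unsigned long hw_now)
{
    unsigned long prev = last_cycle;

    /* If the hardware counter appears to go backwards, reuse the old value. */
    if (prev && prev > hw_now)
        return prev;

    /* Publish hw_now; if another reader raced us, still return hw_now. */
    (void)__sync_val_compare_and_swap(&last_cycle, prev, hw_now);
    return hw_now;
}

int main(void)
{
    printf("%lu\n", read_counter(100));    /* 100                        */
    printf("%lu\n", read_counter(90));     /* still 100: never goes back */
    printf("%lu\n", read_counter(120));    /* 120                        */
    return 0;
}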
@@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void)
1455void time_interpolator_reset(void) 1480void time_interpolator_reset(void)
1456{ 1481{
1457 time_interpolator->offset = 0; 1482 time_interpolator->offset = 0;
1458 time_interpolator->last_counter = time_interpolator_get_counter(); 1483 time_interpolator->last_counter = time_interpolator_get_counter(1);
1459} 1484}
1460 1485
1461#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1486#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void)
1467 return 0; 1492 return 0;
1468 1493
1469 return time_interpolator->offset + 1494 return time_interpolator->offset +
1470 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1495 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1471} 1496}
1472 1497
1473#define INTERPOLATOR_ADJUST 65536 1498#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec)
1490 * and the tuning logic insures that. 1515 * and the tuning logic insures that.
1491 */ 1516 */
1492 1517
1493 counter = time_interpolator_get_counter(); 1518 counter = time_interpolator_get_counter(1);
1494 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1519 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1495 1520
1496 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1521 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
@@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs)
1588{ 1613{
1589 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1614 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1590 1615
1591 while (timeout) { 1616 while (timeout)
1592 set_current_state(TASK_UNINTERRUPTIBLE); 1617 timeout = schedule_timeout_uninterruptible(timeout);
1593 timeout = schedule_timeout(timeout);
1594 }
1595} 1618}
1596 1619
1597EXPORT_SYMBOL(msleep); 1620EXPORT_SYMBOL(msleep);
@@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
1604{ 1627{
1605 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1628 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1606 1629
1607 while (timeout && !signal_pending(current)) { 1630 while (timeout && !signal_pending(current))
1608 set_current_state(TASK_INTERRUPTIBLE); 1631 timeout = schedule_timeout_interruptible(timeout);
1609 timeout = schedule_timeout(timeout);
1610 }
1611 return jiffies_to_msecs(timeout); 1632 return jiffies_to_msecs(timeout);
1612} 1633}
1613 1634
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
312 if (!wq) 312 if (!wq)
313 return NULL; 313 return NULL;
314 memset(wq, 0, sizeof(*wq));
315 314
316 wq->name = name; 315 wq->name = name;
317 /* We don't need the distraction of CPUs appearing and vanishing. */ 316 /* We don't need the distraction of CPUs appearing and vanishing. */
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
499 case CPU_UP_PREPARE: 498 case CPU_UP_PREPARE:
500 /* Create a new workqueue thread for it. */ 499 /* Create a new workqueue thread for it. */
501 list_for_each_entry(wq, &workqueues, list) { 500 list_for_each_entry(wq, &workqueues, list) {
502 if (create_workqueue_thread(wq, hotcpu) < 0) { 501 if (!create_workqueue_thread(wq, hotcpu)) {
503 printk("workqueue for %i failed\n", hotcpu); 502 printk("workqueue for %i failed\n", hotcpu);
504 return NOTIFY_BAD; 503 return NOTIFY_BAD;
505 } 504 }