Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c           10
-rw-r--r--  kernel/audit.h            1
-rw-r--r--  kernel/auditfilter.c     63
-rw-r--r--  kernel/auditsc.c        168
-rw-r--r--  kernel/cpuset.c          35
-rw-r--r--  kernel/delayacct.c       24
-rw-r--r--  kernel/exit.c             3
-rw-r--r--  kernel/fork.c            10
-rw-r--r--  kernel/futex.c          224
-rw-r--r--  kernel/futex_compat.c    34
-rw-r--r--  kernel/hrtimer.c          6
-rw-r--r--  kernel/irq/handle.c       5
-rw-r--r--  kernel/irq/manage.c      28
-rw-r--r--  kernel/kprobes.c          1
-rw-r--r--  kernel/panic.c            3
-rw-r--r--  kernel/power/Kconfig      6
-rw-r--r--  kernel/power/process.c   26
-rw-r--r--  kernel/printk.c           4
-rw-r--r--  kernel/rcupdate.c         4
-rw-r--r--  kernel/resource.c         9
-rw-r--r--  kernel/rtmutex.c          2
-rw-r--r--  kernel/sched.c           26
-rw-r--r--  kernel/signal.c          25
-rw-r--r--  kernel/softirq.c         22
-rw-r--r--  kernel/softlockup.c       4
-rw-r--r--  kernel/spinlock.c         2
-rw-r--r--  kernel/stop_machine.c     1
-rw-r--r--  kernel/taskstats.c       32
-rw-r--r--  kernel/timer.c           49
-rw-r--r--  kernel/workqueue.c       91
30 files changed, 602 insertions, 316 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index d417ca1db79b..963fd15c9621 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = {
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
-#ifdef CONFIG_AUDITSYSCALL
 	int i;
-#endif
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
@@ -717,10 +715,10 @@ static int __init audit_init(void)
 	audit_ih = inotify_init(&audit_inotify_ops);
 	if (IS_ERR(audit_ih))
 		audit_panic("cannot initialize inotify handle");
+#endif
 
 	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
 		INIT_LIST_HEAD(&audit_inode_hash[i]);
-#endif
 
 	return 0;
 }
@@ -1030,6 +1028,9 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	struct sk_buff *skb;
 	static const unsigned char *hex = "0123456789ABCDEF";
 
+	if (!ab)
+		return;
+
 	BUG_ON(!ab->skb);
 	skb = ab->skb;
 	avail = skb_tailroom(skb);
@@ -1062,6 +1063,9 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
 	unsigned char *ptr;
 	struct sk_buff *skb;
 
+	if (!ab)
+		return;
+
 	BUG_ON(!ab->skb);
 	skb = ab->skb;
 	avail = skb_tailroom(skb);
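A note on the two new "if (!ab) return;" guards: audit_log_start() can return NULL (for example under memory pressure or rate limiting), and audit_log_format()/audit_log_end() already tolerate a NULL buffer; the guards extend that contract to the hex and string helpers. A minimal calling sketch under that assumption:

#include <linux/audit.h>

static void log_example(const unsigned char *buf, size_t len)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	/* ab may be NULL here; each helper below must cope with
	 * that rather than dereference ab->skb and oops. */
	audit_log_format(ab, "op=example");
	audit_log_hex(ab, buf, len);	/* NULL-safe after this patch */
	audit_log_end(ab);
}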
diff --git a/kernel/audit.h b/kernel/audit.h
index 6aa33b848cf2..a3370232a390 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -104,6 +104,7 @@ static inline int audit_hash_ino(u32 ino)
 	return (ino & (AUDIT_INODE_BUCKETS-1));
 }
 
+extern int audit_match_class(int class, unsigned syscall);
 extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 extern int audit_compare_dname_path(const char *dname, const char *path,
 				    int *dirlen);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 5b4e16276ca0..a44879b0c72f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -302,6 +302,15 @@ int __init audit_register_class(int class, unsigned *list)
 	return 0;
 }
 
+int audit_match_class(int class, unsigned syscall)
+{
+	if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32)))
+		return 0;
+	if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
+		return 0;
+	return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
+}
+
 /* Common user-space to kernel rule translation. */
 static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 {
@@ -404,6 +413,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_PERS:
 		case AUDIT_ARCH:
 		case AUDIT_MSGTYPE:
+		case AUDIT_PPID:
 		case AUDIT_DEVMAJOR:
 		case AUDIT_DEVMINOR:
 		case AUDIT_EXIT:
@@ -413,6 +423,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 			break;
+		case AUDIT_PERM:
+			if (f->val & ~15)
+				goto exit_free;
+			break;
 		case AUDIT_INODE:
 			err = audit_to_inode(&entry->rule, f);
 			if (err)
@@ -442,6 +456,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_EQUAL:
 			break;
 		default:
+			err = -EINVAL;
 			goto exit_free;
 		}
 	}
@@ -566,6 +581,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			entry->rule.buflen += f->val;
 			entry->rule.filterkey = str;
 			break;
+		case AUDIT_PERM:
+			if (f->val & ~15)
+				goto exit_free;
+			break;
 		default:
 			goto exit_free;
 		}
@@ -579,6 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_EQUAL:
 			break;
 		default:
+			err = -EINVAL;
 			goto exit_free;
 		}
 	}
@@ -911,7 +931,7 @@ static void audit_update_watch(struct audit_parent *parent,
 	}
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-	audit_log_format(ab, "audit updated rules specifying watch=");
+	audit_log_format(ab, "audit updated rules specifying path=");
 	audit_log_untrustedstring(ab, owatch->path);
 	audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
 	audit_log_end(ab);
@@ -934,19 +954,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 	struct audit_watch *w, *nextw;
 	struct audit_krule *r, *nextr;
 	struct audit_entry *e;
+	struct audit_buffer *ab;
 
 	mutex_lock(&audit_filter_mutex);
 	parent->flags |= AUDIT_PARENT_INVALID;
 	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
 		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
 			e = container_of(r, struct audit_entry, rule);
+
+			ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+			audit_log_format(ab, "audit implicitly removed rule path=");
+			audit_log_untrustedstring(ab, w->path);
+			if (r->filterkey) {
+				audit_log_format(ab, " key=");
+				audit_log_untrustedstring(ab, r->filterkey);
+			} else
+				audit_log_format(ab, " key=(null)");
+			audit_log_format(ab, " list=%d", r->listnr);
+			audit_log_end(ab);
+
 			list_del(&r->rlist);
 			list_del_rcu(&e->list);
 			call_rcu(&e->rcu, audit_free_rule_rcu);
-
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "audit implicitly removed rule from list=%d\n",
-				  AUDIT_FILTER_EXIT);
 		}
 		audit_remove_watch(w);
 	}
@@ -1134,6 +1163,14 @@ static inline int audit_add_rule(struct audit_entry *entry,
 	struct audit_watch *watch = entry->rule.watch;
 	struct nameidata *ndp, *ndw;
 	int h, err, putnd_needed = 0;
+#ifdef CONFIG_AUDITSYSCALL
+	int dont_count = 0;
+
+	/* If either of these, don't count towards total */
+	if (entry->rule.listnr == AUDIT_FILTER_USER ||
+		entry->rule.listnr == AUDIT_FILTER_TYPE)
+		dont_count = 1;
+#endif
 
 	if (inode_f) {
 		h = audit_hash_ino(inode_f->val);
@@ -1174,6 +1211,10 @@ static inline int audit_add_rule(struct audit_entry *entry,
 	} else {
 		list_add_tail_rcu(&entry->list, list);
 	}
+#ifdef CONFIG_AUDITSYSCALL
+	if (!dont_count)
+		audit_n_rules++;
+#endif
 	mutex_unlock(&audit_filter_mutex);
 
 	if (putnd_needed)
@@ -1198,6 +1239,14 @@ static inline int audit_del_rule(struct audit_entry *entry,
 	struct audit_watch *watch, *tmp_watch = entry->rule.watch;
 	LIST_HEAD(inotify_list);
 	int h, ret = 0;
+#ifdef CONFIG_AUDITSYSCALL
+	int dont_count = 0;
+
+	/* If either of these, don't count towards total */
+	if (entry->rule.listnr == AUDIT_FILTER_USER ||
+		entry->rule.listnr == AUDIT_FILTER_TYPE)
+		dont_count = 1;
+#endif
 
 	if (inode_f) {
 		h = audit_hash_ino(inode_f->val);
@@ -1235,6 +1284,10 @@ static inline int audit_del_rule(struct audit_entry *entry,
 	list_del_rcu(&e->list);
 	call_rcu(&e->rcu, audit_free_rule_rcu);
 
+#ifdef CONFIG_AUDITSYSCALL
+	if (!dont_count)
+		audit_n_rules--;
+#endif
 	mutex_unlock(&audit_filter_mutex);
 
 	if (!list_empty(&inotify_list))
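For reference when reading the two "f->val & ~15" checks: the AUDIT_PERM rule field is a 4-bit access-type mask, so any value outside 0..15 is rejected. A sketch of the bit values this series introduces in include/linux/audit.h:

/* Access-type bits accepted in an AUDIT_PERM rule field. */
#define AUDIT_PERM_EXEC		1
#define AUDIT_PERM_WRITE	2
#define AUDIT_PERM_READ		4
#define AUDIT_PERM_ATTR		8

/* f->val & ~15 being non-zero means a bit outside these four was
 * requested, so the rule translation code bails out. */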
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ae40ac8c39e7..1bd8827a0102 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -85,6 +85,9 @@ extern int audit_enabled;
 /* Indicates that audit should log the full pathname. */
 #define AUDIT_NAME_FULL -1
 
+/* number of audit rules */
+int audit_n_rules;
+
 /* When fs/namei.c:getname() is called, we store the pointer in name and
  * we don't let putname() free it (instead we free all of the saved
  * pointers at syscall exit time).
@@ -174,6 +177,7 @@ struct audit_aux_data_path {
 
 /* The per-task audit context. */
 struct audit_context {
+	int		    dummy;	/* must be the first element */
 	int		    in_syscall;	/* 1 if task is in a syscall */
 	enum audit_state    state;
 	unsigned int	    serial;     /* serial number for record */
@@ -205,6 +209,54 @@ struct audit_context {
 #endif
 };
 
+#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
+static inline int open_arg(int flags, int mask)
+{
+	int n = ACC_MODE(flags);
+	if (flags & (O_TRUNC | O_CREAT))
+		n |= AUDIT_PERM_WRITE;
+	return n & mask;
+}
+
+static int audit_match_perm(struct audit_context *ctx, int mask)
+{
+	unsigned n = ctx->major;
+	switch (audit_classify_syscall(ctx->arch, n)) {
+	case 0:	/* native */
+		if ((mask & AUDIT_PERM_WRITE) &&
+		     audit_match_class(AUDIT_CLASS_WRITE, n))
+			return 1;
+		if ((mask & AUDIT_PERM_READ) &&
+		     audit_match_class(AUDIT_CLASS_READ, n))
+			return 1;
+		if ((mask & AUDIT_PERM_ATTR) &&
+		     audit_match_class(AUDIT_CLASS_CHATTR, n))
+			return 1;
+		return 0;
+	case 1: /* 32bit on biarch */
+		if ((mask & AUDIT_PERM_WRITE) &&
+		     audit_match_class(AUDIT_CLASS_WRITE_32, n))
+			return 1;
+		if ((mask & AUDIT_PERM_READ) &&
+		     audit_match_class(AUDIT_CLASS_READ_32, n))
+			return 1;
+		if ((mask & AUDIT_PERM_ATTR) &&
+		     audit_match_class(AUDIT_CLASS_CHATTR_32, n))
+			return 1;
+		return 0;
+	case 2: /* open */
+		return mask & ACC_MODE(ctx->argv[1]);
+	case 3: /* openat */
+		return mask & ACC_MODE(ctx->argv[2]);
+	case 4: /* socketcall */
+		return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
+	case 5: /* execve */
+		return mask & AUDIT_PERM_EXEC;
+	default:
+		return 0;
+	}
+}
+
 /* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
@@ -393,6 +445,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 			/* ignore this field for filtering */
 			result = 1;
 			break;
+		case AUDIT_PERM:
+			result = audit_match_perm(ctx, f->val);
+			break;
 		}
 
 		if (!result)
@@ -514,7 +569,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 	context->return_valid = return_valid;
 	context->return_code  = return_code;
 
-	if (context->in_syscall && !context->auditable) {
+	if (context->in_syscall && !context->dummy && !context->auditable) {
 		enum audit_state state;
 
 		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
@@ -530,17 +585,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 	}
 
 get_context:
-	context->pid = tsk->pid;
-	context->ppid = sys_getppid();	/* sic.  tsk == current in all cases */
-	context->uid = tsk->uid;
-	context->gid = tsk->gid;
-	context->euid = tsk->euid;
-	context->suid = tsk->suid;
-	context->fsuid = tsk->fsuid;
-	context->egid = tsk->egid;
-	context->sgid = tsk->sgid;
-	context->fsgid = tsk->fsgid;
-	context->personality = tsk->personality;
+
 	tsk->audit_context = NULL;
 	return context;
 }
@@ -749,6 +794,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	const char *tty;
 
 	/* tsk == current */
+	context->pid = tsk->pid;
+	context->ppid = sys_getppid();	/* sic.  tsk == current in all cases */
+	context->uid = tsk->uid;
+	context->gid = tsk->gid;
+	context->euid = tsk->euid;
+	context->suid = tsk->suid;
+	context->fsuid = tsk->fsuid;
+	context->egid = tsk->egid;
+	context->sgid = tsk->sgid;
+	context->fsgid = tsk->fsgid;
+	context->personality = tsk->personality;
 
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
 	if (!ab)
@@ -1066,7 +1122,8 @@ void audit_syscall_entry(int arch, int major,
 	context->argv[3]    = a4;
 
 	state = context->state;
-	if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+	context->dummy = !audit_n_rules;
+	if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
 		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
 	if (likely(state == AUDIT_DISABLED))
 		return;
@@ -1199,14 +1256,18 @@ void audit_putname(const char *name)
 #endif
 }
 
-static void audit_inode_context(int idx, const struct inode *inode)
+/* Copy inode data into an audit_names. */
+static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
 {
-	struct audit_context *context = current->audit_context;
-
-	selinux_get_inode_sid(inode, &context->names[idx].osid);
+	name->ino   = inode->i_ino;
+	name->dev   = inode->i_sb->s_dev;
+	name->mode  = inode->i_mode;
+	name->uid   = inode->i_uid;
+	name->gid   = inode->i_gid;
+	name->rdev  = inode->i_rdev;
+	selinux_get_inode_sid(inode, &name->osid);
 }
 
-
 /**
  * audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1240,20 +1301,14 @@ void __audit_inode(const char *name, const struct inode *inode)
 		++context->ino_count;
 #endif
 	}
-	context->names[idx].ino  = inode->i_ino;
-	context->names[idx].dev  = inode->i_sb->s_dev;
-	context->names[idx].mode = inode->i_mode;
-	context->names[idx].uid  = inode->i_uid;
-	context->names[idx].gid  = inode->i_gid;
-	context->names[idx].rdev = inode->i_rdev;
-	audit_inode_context(idx, inode);
+	audit_copy_inode(&context->names[idx], inode);
 }
 
 /**
  * audit_inode_child - collect inode info for created/removed objects
  * @dname: inode's dentry name
 * @inode: inode being audited
- * @pino: inode number of dentry parent
+ * @parent: inode of dentry parent
  *
  * For syscalls that create or remove filesystem objects, audit_inode
  * can only collect information for the filesystem object's parent.
@@ -1264,7 +1319,7 @@ void __audit_inode(const char *name, const struct inode *inode)
  * unsuccessful attempts.
  */
 void __audit_inode_child(const char *dname, const struct inode *inode,
-			 unsigned long pino)
+			 const struct inode *parent)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
@@ -1278,7 +1333,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
 	if (!dname)
 		goto update_context;
 	for (idx = 0; idx < context->name_count; idx++)
-		if (context->names[idx].ino == pino) {
+		if (context->names[idx].ino == parent->i_ino) {
 			const char *name = context->names[idx].name;
 
 			if (!name)
@@ -1302,16 +1357,47 @@ update_context:
 	context->names[idx].name_len = AUDIT_NAME_FULL;
 	context->names[idx].name_put = 0;	/* don't call __putname() */
 
-	if (inode) {
-		context->names[idx].ino  = inode->i_ino;
-		context->names[idx].dev  = inode->i_sb->s_dev;
-		context->names[idx].mode = inode->i_mode;
-		context->names[idx].uid  = inode->i_uid;
-		context->names[idx].gid  = inode->i_gid;
-		context->names[idx].rdev = inode->i_rdev;
-		audit_inode_context(idx, inode);
-	} else
-		context->names[idx].ino  = (unsigned long)-1;
+	if (!inode)
+		context->names[idx].ino = (unsigned long)-1;
+	else
+		audit_copy_inode(&context->names[idx], inode);
+
+	/* A parent was not found in audit_names, so copy the inode data for the
+	 * provided parent. */
+	if (!found_name) {
+		idx = context->name_count++;
+#if AUDIT_DEBUG
+		context->ino_count++;
+#endif
+		audit_copy_inode(&context->names[idx], parent);
+	}
+}
+
+/**
+ * audit_inode_update - update inode info for last collected name
+ * @inode: inode being audited
+ *
+ * When open() is called on an existing object with the O_CREAT flag, the inode
+ * data audit initially collects is incorrect.  This additional hook ensures
+ * audit has the inode data for the actual object to be opened.
+ */
+void __audit_inode_update(const struct inode *inode)
+{
+	struct audit_context *context = current->audit_context;
+	int idx;
+
+	if (!context->in_syscall || !inode)
+		return;
+
+	if (context->name_count == 0) {
+		context->name_count++;
+#if AUDIT_DEBUG
+		context->ino_count++;
+#endif
+	}
+	idx = context->name_count - 1;
+
+	audit_copy_inode(&context->names[idx], inode);
 }
 
 /**
@@ -1642,7 +1728,7 @@ int audit_bprm(struct linux_binprm *bprm)
 	unsigned long p, next;
 	void *to;
 
-	if (likely(!audit_enabled || !context))
+	if (likely(!audit_enabled || !context || context->dummy))
 		return 0;
 
 	ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
@@ -1680,7 +1766,7 @@ int audit_socketcall(int nargs, unsigned long *args)
 	struct audit_aux_data_socketcall *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!context))
+	if (likely(!context || context->dummy))
 		return 0;
 
 	ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
@@ -1708,7 +1794,7 @@ int audit_sockaddr(int len, void *a)
 	struct audit_aux_data_sockaddr *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (likely(!context))
+	if (likely(!context || context->dummy))
 		return 0;
 
 	ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
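The ACC_MODE string-indexing trick above deserves a gloss: the octal escapes "\004\002\006\006" encode read (4), write (2), and read|write (6), indexed by the two O_ACCMODE bits, which lines up with AUDIT_PERM_READ/AUDIT_PERM_WRITE. A standalone user-space sketch of the same table:

#include <fcntl.h>
#include <stdio.h>

/* Index the 4-byte table with the open() access-mode bits:
 * O_RDONLY(0) -> 4, O_WRONLY(1) -> 2, O_RDWR(2) -> 6. */
#define ACC_MODE(x) ("\004\002\006\006"[(x) & O_ACCMODE])

int main(void)
{
	printf("O_RDONLY -> %d\n", ACC_MODE(O_RDONLY));	/* 4 */
	printf("O_WRONLY -> %d\n", ACC_MODE(O_WRONLY));	/* 2 */
	printf("O_RDWR   -> %d\n", ACC_MODE(O_RDWR));	/* 6 */
	return 0;
}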
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1a649f2bb9bb..4ea6f0dc2fc5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -816,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	struct cpuset trialcs;
 	int retval, cpus_unchanged;
 
+	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
 	if (retval < 0)
@@ -2033,6 +2037,33 @@ out:
 	return err;
 }
 
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no effect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This handles CPU hotplug (cpuhp) events.  If someday Memory
+ * Nodes can be hotplugged (dynamically changing node_online_map)
+ * then we should handle that too, perhaps in a similar way.
+ */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	top_cpuset.cpus_allowed = cpu_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+
+	return 0;
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2043,6 +2074,8 @@ void __init cpuset_init_smp(void)
 {
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_online_map;
+
+	hotcpu_notifier(cpuset_handle_cpuhp, 0);
 }
 
 /**
@@ -2387,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
-	int overlap = 0;		/* do cpusets overlap? */
+	int overlap = 1;		/* do cpusets overlap? */
 
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
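hotcpu_notifier() here is the 2.6-era convenience wrapper that registers a CPU-hotplug callback (and compiles away without CONFIG_HOTPLUG_CPU); the handler above simply resynchronizes top_cpuset with cpu_online_map on every event. A minimal sketch of the same registration pattern, with hypothetical names:

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

/* Hypothetical handler: refresh cached state on any CPU up/down
 * event; like cpuset_handle_cpuhp(), it ignores the exact phase
 * (CPU_ONLINE, CPU_DEAD, ...) passed in 'phase'. */
static int example_cpuhp(struct notifier_block *nb,
			 unsigned long phase, void *cpu)
{
	/* take whatever locks guard the cached copy, then
	 * refresh it from cpu_online_map */
	return NOTIFY_OK;
}

static int __init example_init(void)
{
	hotcpu_notifier(example_cpuhp, 0);	/* priority 0 */
	return 0;
}
__initcall(example_init);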
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index f05392d64267..36752f124c6a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,15 +19,15 @@
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
 
-int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
+int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
 kmem_cache_t *delayacct_cache;
 
-static int __init delayacct_setup_enable(char *str)
+static int __init delayacct_setup_disable(char *str)
 {
-	delayacct_on = 1;
+	delayacct_on = 0;
 	return 1;
 }
-__setup("delayacct", delayacct_setup_enable);
+__setup("nodelayacct", delayacct_setup_disable);
 
 void delayacct_init(void)
 {
@@ -41,24 +41,11 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
-	spin_lock_init(&tsk->delays_lock);
-	/* No need to acquire tsk->delays_lock for allocation here unless
-	   __delayacct_tsk_init called after tsk is attached to tasklist
-	*/
 	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
 }
 
-void __delayacct_tsk_exit(struct task_struct *tsk)
-{
-	struct task_delay_info *delays = tsk->delays;
-	spin_lock(&tsk->delays_lock);
-	tsk->delays = NULL;
-	spin_unlock(&tsk->delays_lock);
-	kmem_cache_free(delayacct_cache, delays);
-}
-
 /*
  * Start accounting for a delay statistic using
  * its starting timestamp (@start)
@@ -118,8 +105,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	struct timespec ts;
 	unsigned long t1,t2,t3;
 
-	spin_lock(&tsk->delays_lock);
-
 	/* Though tsk->delays accessed later, early exit avoids
 	 * unnecessary returning of other data
 	 */
@@ -161,7 +146,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	spin_unlock(&tsk->delays->lock);
 
 done:
-	spin_unlock(&tsk->delays_lock);
 	return 0;
 }
 
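The net effect of the first hunk is a default flip: delay accounting is now on unless the administrator boots with "nodelayacct". The __setup() pattern it uses is generic; a sketch with hypothetical names:

#include <linux/cache.h>
#include <linux/init.h>

static int example_on __read_mostly = 1;	/* on by default */

/* "noexample" on the kernel command line turns the feature off;
 * returning 1 tells the option parser the argument was consumed. */
static int __init example_setup_disable(char *str)
{
	example_on = 0;
	return 1;
}
__setup("noexample", example_setup_disable);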
diff --git a/kernel/exit.c b/kernel/exit.c
index dba194a8d416..d891883420f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,7 +908,6 @@ fastcall NORET_TYPE void do_exit(long code)
 	audit_free(tsk);
 	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
 	taskstats_exit_free(tidstats);
-	delayacct_tsk_exit(tsk);
 
 	exit_mm(tsk);
 
@@ -1054,7 +1053,7 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
 	 * Do not consider thread group leaders that are
 	 * in a non-empty thread group:
 	 */
-	if (current->tgid != p->tgid && delay_group_leader(p))
+	if (delay_group_leader(p))
 		return 2;
 
 	if (security_task_wait(p))
diff --git a/kernel/fork.c b/kernel/fork.c
index 1b0f7b1e0881..f9b014e3e700 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -117,6 +117,7 @@ void __put_task_struct(struct task_struct *tsk)
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
+	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
@@ -1011,7 +1012,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	retval = -EFAULT;
 	if (clone_flags & CLONE_PARENT_SETTID)
 		if (put_user(p->pid, parent_tidptr))
-			goto bad_fork_cleanup;
+			goto bad_fork_cleanup_delays_binfmt;
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
@@ -1277,7 +1278,8 @@ bad_fork_cleanup_policy:
 bad_fork_cleanup_cpuset:
 #endif
 	cpuset_exit(p);
-bad_fork_cleanup:
+bad_fork_cleanup_delays_binfmt:
+	delayacct_tsk_free(p);
 	if (p->binfmt)
 		module_put(p->binfmt->module);
 bad_fork_cleanup_put_domain:
@@ -1387,8 +1389,10 @@ long do_fork(unsigned long clone_flags,
 
 		if (clone_flags & CLONE_VFORK) {
 			wait_for_completion(&vfork);
-			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+				current->ptrace_message = nr;
 				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+			}
 		}
 	} else {
 		free_pid(pid);
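Storing the child's pid in current->ptrace_message before the PTRACE_EVENT_VFORK_DONE notification means a tracer can now retrieve it with PTRACE_GETEVENTMSG, matching the other fork events. A user-space sketch of the tracer side (assumes the tracee is already stopped at that event):

#include <sys/ptrace.h>
#include <sys/types.h>

/* After a PTRACE_EVENT_VFORK_DONE stop on 'tracee', the event
 * message now carries the vforked child's pid. */
static long vfork_done_child_pid(pid_t tracee)
{
	unsigned long msg = 0;

	if (ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg) == -1)
		return -1;
	return (long)msg;
}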
diff --git a/kernel/futex.c b/kernel/futex.c
index cf0c8e21d1ab..9d260e838cff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt)
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
 
-	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
 	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
 		return -EFAULT;
 
@@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 		p = NULL;
 		goto out_unlock;
 	}
-	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+	if (p->exit_state != 0) {
 		p = NULL;
 		goto out_unlock;
 	}
@@ -415,15 +415,15 @@ out_unlock:
  */
 void exit_pi_state_list(struct task_struct *curr)
 {
-	struct futex_hash_bucket *hb;
 	struct list_head *next, *head = &curr->pi_state_list;
 	struct futex_pi_state *pi_state;
+	struct futex_hash_bucket *hb;
 	union futex_key key;
 
 	/*
 	 * We are a ZOMBIE and nobody can enqueue itself on
 	 * pi_state_list anymore, but we have to be careful
-	 * versus waiters unqueueing themselfs
+	 * versus waiters unqueueing themselves:
 	 */
 	spin_lock_irq(&curr->pi_lock);
 	while (!list_empty(head)) {
@@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr)
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
 		key = pi_state->key;
+		hb = hash_futex(&key);
 		spin_unlock_irq(&curr->pi_lock);
 
-		hb = hash_futex(&key);
 		spin_lock(&hb->lock);
 
 		spin_lock_irq(&curr->pi_lock);
+		/*
+		 * We dropped the pi-lock, so re-check whether this
+		 * task still owns the PI-state:
+		 */
 		if (head->next != next) {
 			spin_unlock(&hb->lock);
 			continue;
 		}
 
-		list_del_init(&pi_state->list);
-
 		WARN_ON(pi_state->owner != curr);
-
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
 		pi_state->owner = NULL;
 		spin_unlock_irq(&curr->pi_lock);
 
@@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	head = &hb->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
-		if (match_futex (&this->key, &me->key)) {
+		if (match_futex(&this->key, &me->key)) {
 			/*
 			 * Another waiter already exists - bump up
 			 * the refcount and return its pi_state:
@@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			if (unlikely(!pi_state))
 				return -EINVAL;
 
+			WARN_ON(!atomic_read(&pi_state->refcount));
+
 			atomic_inc(&pi_state->refcount);
 			me->pi_state = pi_state;
 
@@ -490,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	}
 
 	/*
-	 * We are the first waiter - try to look up the real owner and
-	 * attach the new pi_state to it:
+	 * We are the first waiter - try to look up the real owner and attach
+	 * the new pi_state to it, but bail out when the owner died bit is set
+	 * and TID = 0:
 	 */
 	pid = uval & FUTEX_TID_MASK;
+	if (!pid && (uval & FUTEX_OWNER_DIED))
+		return -ESRCH;
 	p = futex_find_get_task(pid);
 	if (!p)
 		return -ESRCH;
@@ -510,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	pi_state->key = me->key;
 
 	spin_lock_irq(&p->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
 	spin_unlock_irq(&p->pi_lock);
@@ -573,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * kept enabled while there is PI state around. We must also
 	 * preserve the owner died bit.)
 	 */
-	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
-
-	inc_preempt_count();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
-
-	if (curval == -EFAULT)
-		return -EFAULT;
-	if (curval != uval)
-		return -EINVAL;
-
-	list_del_init(&pi_state->owner->pi_state_list);
-	list_add(&pi_state->list, &new_owner->pi_state_list);
-	pi_state->owner = new_owner;
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		newval = FUTEX_WAITERS | new_owner->pid;
+
+		inc_preempt_count();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+		dec_preempt_count();
+		if (curval == -EFAULT)
+			return -EFAULT;
+		if (curval != uval)
+			return -EINVAL;
+	}
+
+	spin_lock_irq(&pi_state->owner->pi_lock);
+	WARN_ON(list_empty(&pi_state->list));
+	list_del_init(&pi_state->list);
+	spin_unlock_irq(&pi_state->owner->pi_lock);
+
+	spin_lock_irq(&new_owner->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	spin_unlock_irq(&new_owner->pi_lock);
+
 	rt_mutex_unlock(&pi_state->pi_mutex);
 
 	return 0;
@@ -729,8 +747,10 @@ retry:
 	 */
 	if (attempt++) {
 		if (futex_handle_fault((unsigned long)uaddr2,
-					attempt))
+					attempt)) {
+			ret = -EFAULT;
 			goto out;
+		}
 		goto retry;
 	}
 
@@ -930,6 +950,7 @@ static int unqueue_me(struct futex_q *q)
 	/* In the common case we don't take the spinlock, which is nice. */
 retry:
 	lock_ptr = q->lock_ptr;
+	barrier();
 	if (lock_ptr != 0) {
 		spin_lock(lock_ptr);
 		/*
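The lone barrier() addition is subtle: without it, the compiler may legally re-read q->lock_ptr after the NULL test (a concurrent unqueue can set it NULL in between), so the test and the later spin_lock() could see different values. A sketch of the pattern under that assumption, with a hypothetical structure:

#include <linux/compiler.h>
#include <linux/spinlock.h>

struct waiter { spinlock_t *lock_ptr; };	/* hypothetical */

static void lock_waiter(struct waiter *w)
{
	spinlock_t *lock_ptr = w->lock_ptr;

	/* Force the single load above to be the value used below;
	 * a refetch of w->lock_ptr could yield NULL meanwhile. */
	barrier();
	if (lock_ptr)
		spin_lock(lock_ptr);
}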
@@ -1099,9 +1120,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
-			    struct hrtimer_sleeper *to)
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+			 long nsec, int trylock)
 {
+	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
@@ -1111,6 +1133,13 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 	if (refill_pi_state_cache())
 		return -ENOMEM;
 
+	if (sec != MAX_SCHEDULE_TIMEOUT) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires = ktime_set(sec, nsec);
+	}
+
 	q.pi_state = NULL;
 retry:
 	down_read(&curr->mm->mmap_sem);
@@ -1236,6 +1265,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 	/* Owner died? */
 	if (q.pi_state->owner != NULL) {
 		spin_lock_irq(&q.pi_state->owner->pi_lock);
+		WARN_ON(list_empty(&q.pi_state->list));
 		list_del_init(&q.pi_state->list);
 		spin_unlock_irq(&q.pi_state->owner->pi_lock);
 	} else
@@ -1244,6 +1274,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 	q.pi_state->owner = current;
 
 	spin_lock_irq(&current->pi_lock);
+	WARN_ON(!list_empty(&q.pi_state->list));
 	list_add(&q.pi_state->list, &current->pi_state_list);
 	spin_unlock_irq(&current->pi_lock);
 
@@ -1284,7 +1315,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 	if (!detect && ret == -EDEADLK && 0)
 		force_sig(SIGKILL, current);
 
-	return ret;
+	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 out_unlock_release_sem:
 	queue_unlock(&q, hb);
@@ -1301,9 +1332,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt))
+		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+			ret = -EFAULT;
 			goto out_unlock_release_sem;
-
+		}
 		goto retry_locked;
 	}
 
@@ -1318,76 +1350,6 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 }
 
 /*
- * Restart handler
- */
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
-	struct hrtimer_sleeper timeout, *to = NULL;
-	int ret;
-
-	restart->fn = do_no_restart_syscall;
-
-	if (restart->arg2 || restart->arg3) {
-		to = &timeout;
-		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
-		hrtimer_init_sleeper(to, current);
-		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
-			(u64) restart->arg0;
-	}
-
-	pr_debug("lock_pi restart: %p, %d (%d)\n",
-		 (u32 __user *)restart->arg0, current->pid);
-
-	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
-			       0, to);
-
-	if (ret != -EINTR)
-		return ret;
-
-	restart->fn = futex_lock_pi_restart;
-
-	/* The other values are filled in */
-	return -ERESTART_RESTARTBLOCK;
-}
-
-/*
- * Called from the syscall entry below.
- */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
-			 long nsec, int trylock)
-{
-	struct hrtimer_sleeper timeout, *to = NULL;
-	struct restart_block *restart;
-	int ret;
-
-	if (sec != MAX_SCHEDULE_TIMEOUT) {
-		to = &timeout;
-		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
-		hrtimer_init_sleeper(to, current);
-		to->timer.expires = ktime_set(sec, nsec);
-	}
-
-	ret = do_futex_lock_pi(uaddr, detect, trylock, to);
-
-	if (ret != -EINTR)
-		return ret;
-
-	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
-
-	restart = &current_thread_info()->restart_block;
-	restart->fn = futex_lock_pi_restart;
-	restart->arg0 = (unsigned long) uaddr;
-	restart->arg1 = detect;
-	if (to) {
-		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
-		restart->arg3 = to->timer.expires.tv64 >> 32;
-	} else
-		restart->arg2 = restart->arg3 = 0;
-
-	return -ERESTART_RESTARTBLOCK;
-}
-
-/*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
@@ -1427,9 +1389,11 @@ retry_locked:
 	 * again. If it succeeds then we can return without waking
 	 * anyone else up:
 	 */
-	inc_preempt_count();
-	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-	dec_preempt_count();
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		inc_preempt_count();
+		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+		dec_preempt_count();
+	}
 
 	if (unlikely(uval == -EFAULT))
 		goto pi_faulted;
@@ -1462,9 +1426,11 @@ retry_locked:
 	/*
 	 * No waiters - kernel unlocks the futex:
 	 */
-	ret = unlock_futex_pi(uaddr, uval);
-	if (ret == -EFAULT)
-		goto pi_faulted;
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		ret = unlock_futex_pi(uaddr, uval);
+		if (ret == -EFAULT)
+			goto pi_faulted;
+	}
 
 out_unlock:
 	spin_unlock(&hb->lock);
@@ -1481,9 +1447,10 @@ pi_faulted:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt))
+		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+			ret = -EFAULT;
 			goto out_unlock;
-
+		}
 		goto retry_locked;
 	}
 
@@ -1683,9 +1650,9 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
 {
-	u32 uval, nval;
+	u32 uval, nval, mval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -1702,21 +1669,45 @@ retry:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
-						     uval | FUTEX_OWNER_DIED);
+		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
 		if (nval == -EFAULT)
 			return -1;
 
 		if (nval != uval)
 			goto retry;
 
-		if (uval & FUTEX_WAITERS)
-			futex_wake(uaddr, 1);
+		/*
+		 * Wake robust non-PI futexes here. The wakeup of
+		 * PI futexes happens in exit_pi_state():
+		 */
+		if (!pi) {
+			if (uval & FUTEX_WAITERS)
+				futex_wake(uaddr, 1);
+		}
 	}
 	return 0;
 }
 
 /*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+				     struct robust_list __user **head, int *pi)
+{
+	unsigned long uentry;
+
+	if (get_user(uentry, (unsigned long *)head))
+		return -EFAULT;
+
+	*entry = (void *)(uentry & ~1UL);
+	*pi = uentry & 1;
+
+	return 0;
+}
+
+/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
  *
@@ -1726,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned long futex_offset;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
 	 * sys_set_robust_list()):
 	 */
-	if (get_user(entry, &head->list.next))
+	if (fetch_robust_entry(&entry, &head->list.next, &pi))
 		return;
 	/*
 	 * Fetch the relative futex offset:
@@ -1744,10 +1735,11 @@ void exit_robust_list(struct task_struct *curr)
 	 * Fetch any possibly pending lock-add first, and handle it
 	 * if it exists:
 	 */
-	if (get_user(pending, &head->list_op_pending))
+	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 		return;
+
 	if (pending)
-		handle_futex_death((void *)pending + futex_offset, curr);
+		handle_futex_death((void *)pending + futex_offset, curr, pip);
 
 	while (entry != &head->list) {
 		/*
@@ -1756,12 +1748,12 @@ void exit_robust_list(struct task_struct *curr)
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
-						curr))
+						curr, pi))
 				return;
 		/*
 		 * Fetch the next entry in the list:
 		 */
-		if (get_user(entry, &entry->next))
+		if (fetch_robust_entry(&entry, &entry->next, &pi))
 			return;
 		/*
		 * Avoid excessively long or circular lists:
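The robust-list changes thread one bit of metadata through each user-space list pointer: bit 0 marks the entry as a PI futex, and fetch_robust_entry() strips it before the pointer is used; handle_futex_death() then leaves PI wakeups to exit_pi_state_list(). A user-space sketch of the encoding side (helper name hypothetical; struct robust_list comes from linux/futex.h):

#include <linux/futex.h>
#include <stdint.h>

/* Tag a robust-list entry as a PI futex by setting bit 0 of the
 * stored pointer, mirroring what the kernel decodes. */
static inline struct robust_list *tag_robust_entry(struct robust_list *entry,
						   int is_pi)
{
	return (struct robust_list *)((uintptr_t)entry | (is_pi ? 1UL : 0UL));
}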
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1d92b441fb7..c5cca3f65cb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
12 12
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14 14
15
16/*
17 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
18 */
19static inline int
20fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
21 compat_uptr_t *head, int *pi)
22{
23 if (get_user(*uentry, head))
24 return -EFAULT;
25
26 *entry = compat_ptr((*uentry) & ~1);
27 *pi = (unsigned int)(*uentry) & 1;
28
29 return 0;
30}
31
15/* 32/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!) 33 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters. 34 * and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
22{ 39{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list; 40 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending; 41 struct robust_list __user *entry, *pending;
42 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
25 compat_uptr_t uentry, upending; 43 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset; 44 compat_long_t futex_offset;
28 45
29 /* 46 /*
30 * Fetch the list head (which was registered earlier, via 47 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()): 48 * sys_set_robust_list()):
32 */ 49 */
33 if (get_user(uentry, &head->list.next)) 50 if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
34 return; 51 return;
35 entry = compat_ptr(uentry);
36 /* 52 /*
37 * Fetch the relative futex offset: 53 * Fetch the relative futex offset:
38 */ 54 */
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
42 * Fetch any possibly pending lock-add first, and handle it 58 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists: 59 * if it exists:
44 */ 60 */
45 if (get_user(upending, &head->list_op_pending)) 61 if (fetch_robust_entry(&upending, &pending,
62 &head->list_op_pending, &pip))
46 return; 63 return;
47 pending = compat_ptr(upending);
48 if (upending) 64 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr); 65 handle_futex_death((void *)pending + futex_offset, curr, pip);
50 66
51 while (compat_ptr(uentry) != &head->list) { 67 while (compat_ptr(uentry) != &head->list) {
52 /* 68 /*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
55 */ 71 */
56 if (entry != pending) 72 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset, 73 if (handle_futex_death((void *)entry + futex_offset,
58 curr)) 74 curr, pi))
59 return; 75 return;
60 76
61 /* 77 /*
62 * Fetch the next entry in the list: 78 * Fetch the next entry in the list:
63 */ 79 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next)) 80 if (fetch_robust_entry(&uentry, &entry,
81 (compat_uptr_t *)&entry->next, &pi))
65 return; 82 return;
66 entry = compat_ptr(uentry);
67 /* 83 /*
68 * Avoid excessively long or circular lists: 84 * Avoid excessively long or circular lists:
69 */ 85 */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d17766d40dab..21c38a7e666b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -187,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
187{ 187{
188 struct hrtimer_base *new_base; 188 struct hrtimer_base *new_base;
189 189
190 new_base = &__get_cpu_var(hrtimer_bases[base->index]); 190 new_base = &__get_cpu_var(hrtimer_bases)[base->index];
191 191
192 if (base != new_base) { 192 if (base != new_base) {
193 /* 193 /*
@@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu)
835} 835}
836#endif /* CONFIG_HOTPLUG_CPU */ 836#endif /* CONFIG_HOTPLUG_CPU */
837 837
838static int __devinit hrtimer_cpu_notify(struct notifier_block *self, 838static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
839 unsigned long action, void *hcpu) 839 unsigned long action, void *hcpu)
840{ 840{
841 long cpu = (long)hcpu; 841 long cpu = (long)hcpu;
@@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
859 return NOTIFY_OK; 859 return NOTIFY_OK;
860} 860}
861 861
862static struct notifier_block __devinitdata hrtimers_nb = { 862static struct notifier_block __cpuinitdata hrtimers_nb = {
863 .notifier_call = hrtimer_cpu_notify, 863 .notifier_call = hrtimer_cpu_notify,
864}; 864};
865 865
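The one-line fix in switch_hrtimer_base() is about macro scope: __get_cpu_var() should be handed the per-cpu symbol itself, with any array indexing applied to its result. A sketch with a hypothetical per-cpu array:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_bases[4]);	/* hypothetical */

static int *get_base(int idx)
{
	/* resolve this CPU's copy of the array first, then index it */
	return &__get_cpu_var(example_bases)[idx];
}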
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index fc4e906aedbd..48a53f68af96 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -20,6 +20,11 @@
20 20
21/** 21/**
22 * handle_bad_irq - handle spurious and unhandled irqs 22 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number
24 * @desc: description of the interrupt
25 * @regs: pointer to a register structure
26 *
27 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
23 */ 28 */
24void fastcall 29void fastcall
25handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) 30handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4e461438e48b..92be519eff26 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq);
137 * @irq: interrupt to control 137 * @irq: interrupt to control
138 * @on: enable/disable power management wakeup 138 * @on: enable/disable power management wakeup
139 * 139 *
140 * Enable/disable power management wakeup mode 140 * Enable/disable power management wakeup mode, which is
141 * disabled by default. Enables and disables must match,
142 * just as they match for non-wakeup mode support.
143 *
144 * Wakeup mode lets this IRQ wake the system from sleep
145 * states like "suspend to RAM".
141 */ 146 */
142int set_irq_wake(unsigned int irq, unsigned int on) 147int set_irq_wake(unsigned int irq, unsigned int on)
143{ 148{
144 struct irq_desc *desc = irq_desc + irq; 149 struct irq_desc *desc = irq_desc + irq;
145 unsigned long flags; 150 unsigned long flags;
146 int ret = -ENXIO; 151 int ret = -ENXIO;
152 int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
147 153
154 /* wakeup-capable irqs can be shared between drivers that
155 * don't need to have the same sleep mode behaviors.
156 */
148 spin_lock_irqsave(&desc->lock, flags); 157 spin_lock_irqsave(&desc->lock, flags);
149 if (desc->chip->set_wake) 158 if (on) {
159 if (desc->wake_depth++ == 0)
160 desc->status |= IRQ_WAKEUP;
161 else
162 set_wake = NULL;
163 } else {
164 if (desc->wake_depth == 0) {
165 printk(KERN_WARNING "Unbalanced IRQ %d "
166 "wake disable\n", irq);
167 WARN_ON(1);
168 } else if (--desc->wake_depth == 0)
169 desc->status &= ~IRQ_WAKEUP;
170 else
171 set_wake = NULL;
172 }
173 if (set_wake)
150 ret = desc->chip->set_wake(irq, on); 174 ret = desc->chip->set_wake(irq, on);
151 spin_unlock_irqrestore(&desc->lock, flags); 175 spin_unlock_irqrestore(&desc->lock, flags);
152 return ret; 176 return ret;
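
The rewritten set_irq_wake() adds per-descriptor reference counting (wake_depth) so that wakeup-capable IRQs shared between drivers behave: only the 0->1 enable and the final 1->0 disable reach the chip's ->set_wake() hook, and an unbalanced disable warns instead of underflowing. A hedged sketch of the balanced usage this implies, with hypothetical driver names:

static int mydrv_suspend(struct platform_device *pdev, pm_message_t state)
{
	struct mydrv *drv = platform_get_drvdata(pdev);

	if (device_may_wakeup(&pdev->dev))
		set_irq_wake(drv->irq, 1);	/* must be balanced ... */
	return 0;
}

static int mydrv_resume(struct platform_device *pdev)
{
	struct mydrv *drv = platform_get_drvdata(pdev);

	if (device_may_wakeup(&pdev->dev))
		set_irq_wake(drv->irq, 0);	/* ... by this disable */
	return 0;
}
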
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 64aab081153b..3f57dfdc8f92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
393static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 393static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
394{ 394{
395 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
396 flush_insn_slot(ap);
396 ap->addr = p->addr; 397 ap->addr = p->addr;
397 ap->pre_handler = aggr_pre_handler; 398 ap->pre_handler = aggr_pre_handler;
398 ap->fault_handler = aggr_fault_handler; 399 ap->fault_handler = aggr_fault_handler;
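
The added flush_insn_slot() call is about instruction-cache coherency: copy_kprobe() has just duplicated the saved instruction copy, and on architectures with split I/D caches the instruction side must observe the write before the copy can be single-stepped. The general requirement, as a sketch (flush_icache_range() shown for illustration; flush_insn_slot() is the arch hook that encapsulates whatever is needed, possibly nothing on x86):

	/* write an instruction copy through the data side ... */
	memcpy(slot, insn, len);
	/* ... then make the instruction side coherent before execution */
	flush_icache_range((unsigned long)slot, (unsigned long)slot + len);
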
diff --git a/kernel/panic.c b/kernel/panic.c
index d8a0bca21233..8010b9b17aca 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
18#include <linux/interrupt.h> 18#include <linux/interrupt.h>
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/debug_locks.h>
21 22
22int panic_on_oops; 23int panic_on_oops;
23int tainted; 24int tainted;
@@ -172,7 +173,7 @@ const char *print_tainted(void)
172 173
173void add_taint(unsigned flag) 174void add_taint(unsigned flag)
174{ 175{
175 debug_locks_off(); /* can't trust the integrity of the kernel anymore */ 176 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
176 tainted |= flag; 177 tainted |= flag;
177} 178}
178EXPORT_SYMBOL(add_taint); 179EXPORT_SYMBOL(add_taint);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ae44a70aae8a..619ecabf7c58 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -56,7 +56,7 @@ config PM_TRACE
56 56
57config SOFTWARE_SUSPEND 57config SOFTWARE_SUSPEND
58 bool "Software Suspend" 58 bool "Software Suspend"
59 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 59 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
60 ---help--- 60 ---help---
61 Enable the possibility of suspending the machine. 61 Enable the possibility of suspending the machine.
62 It doesn't need ACPI or APM. 62 It doesn't need ACPI or APM.
@@ -78,6 +78,10 @@ config SOFTWARE_SUSPEND
78 78
79 For more information take a look at <file:Documentation/power/swsusp.txt>. 79 For more information take a look at <file:Documentation/power/swsusp.txt>.
80 80
81 (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386:
82 we need identity mapping for resume to work, and that is trivial
83 to get with 4MB pages, but less than trivial on PAE.)
84
81config PM_STD_PARTITION 85config PM_STD_PARTITION
82 string "Default resume partition" 86 string "Default resume partition"
83 depends on SOFTWARE_SUSPEND 87 depends on SOFTWARE_SUSPEND
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b2a5f671d6cd..72e72d2c61e6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p)
66 } 66 }
67} 67}
68 68
69static void cancel_freezing(struct task_struct *p)
70{
71 unsigned long flags;
72
73 if (freezing(p)) {
74 pr_debug(" clean up: %s\n", p->comm);
75 do_not_freeze(p);
76 spin_lock_irqsave(&p->sighand->siglock, flags);
77 recalc_sigpending_tsk(p);
78 spin_unlock_irqrestore(&p->sighand->siglock, flags);
79 }
80}
81
69/* 0 = success, else # of processes that we failed to stop */ 82/* 0 = success, else # of processes that we failed to stop */
70int freeze_processes(void) 83int freeze_processes(void)
71{ 84{
72 int todo, nr_user, user_frozen; 85 int todo, nr_user, user_frozen;
73 unsigned long start_time; 86 unsigned long start_time;
74 struct task_struct *g, *p; 87 struct task_struct *g, *p;
75 unsigned long flags;
76 88
77 printk( "Stopping tasks: " ); 89 printk( "Stopping tasks: " );
78 start_time = jiffies; 90 start_time = jiffies;
@@ -85,6 +97,10 @@ int freeze_processes(void)
85 continue; 97 continue;
86 if (frozen(p)) 98 if (frozen(p))
87 continue; 99 continue;
100 if (p->state == TASK_TRACED && frozen(p->parent)) {
101 cancel_freezing(p);
102 continue;
103 }
88 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 104 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
89 /* The task is a user-space one. 105 /* The task is a user-space one.
90 * Freeze it unless there's a vfork completion 106 * Freeze it unless there's a vfork completion
@@ -126,13 +142,7 @@ int freeze_processes(void)
126 do_each_thread(g, p) { 142 do_each_thread(g, p) {
127 if (freezeable(p) && !frozen(p)) 143 if (freezeable(p) && !frozen(p))
128 printk(KERN_ERR " %s\n", p->comm); 144 printk(KERN_ERR " %s\n", p->comm);
129 if (freezing(p)) { 145 cancel_freezing(p);
130 pr_debug(" clean up: %s\n", p->comm);
131 p->flags &= ~PF_FREEZE;
132 spin_lock_irqsave(&p->sighand->siglock, flags);
133 recalc_sigpending_tsk(p);
134 spin_unlock_irqrestore(&p->sighand->siglock, flags);
135 }
136 } while_each_thread(g, p); 146 } while_each_thread(g, p);
137 read_unlock(&tasklist_lock); 147 read_unlock(&tasklist_lock);
138 return todo; 148 return todo;
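
cancel_freezing() depends on the freezing()/do_not_freeze() helpers, which are not part of this diff. A sketch consistent with the open-coded lines it replaces (the deleted p->flags &= ~PF_FREEZE above), offered as an assumption about their definitions:

static inline int freezing(struct task_struct *p)
{
	return p->flags & PF_FREEZE;	/* a freeze request is pending */
}

static inline void do_not_freeze(struct task_struct *p)
{
	p->flags &= ~PF_FREEZE;		/* withdraw the request */
}
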
diff --git a/kernel/printk.c b/kernel/printk.c
index 65ca0688f86f..1149365e989e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -799,6 +799,9 @@ void release_console_sem(void)
799 up(&secondary_console_sem); 799 up(&secondary_console_sem);
800 return; 800 return;
801 } 801 }
802
803 console_may_schedule = 0;
804
802 for ( ; ; ) { 805 for ( ; ; ) {
803 spin_lock_irqsave(&logbuf_lock, flags); 806 spin_lock_irqsave(&logbuf_lock, flags);
804 wake_klogd |= log_start - log_end; 807 wake_klogd |= log_start - log_end;
@@ -812,7 +815,6 @@ void release_console_sem(void)
812 local_irq_restore(flags); 815 local_irq_restore(flags);
813 } 816 }
814 console_locked = 0; 817 console_locked = 0;
815 console_may_schedule = 0;
816 up(&console_sem); 818 up(&console_sem);
817 spin_unlock_irqrestore(&logbuf_lock, flags); 819 spin_unlock_irqrestore(&logbuf_lock, flags);
818 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { 820 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
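
Clearing console_may_schedule before the print loop, rather than after it, matters because console code can poll that flag via console_conditional_schedule() while release_console_sem() is feeding the drivers under logbuf_lock with interrupts disabled, where scheduling would be illegal. The helper, roughly (a sketch from memory of this era's printk.c; it is not shown in the diff):

void __sched console_conditional_schedule(void)
{
	if (console_may_schedule)
		cond_resched();
}
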
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 759805c9859a..436ab35f6fa7 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
549} 549}
550 550
551static int __devinit rcu_cpu_notify(struct notifier_block *self, 551static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
552 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
553{ 553{
554 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
565 return NOTIFY_OK; 565 return NOTIFY_OK;
566} 566}
567 567
568static struct notifier_block __devinitdata rcu_nb = { 568static struct notifier_block __cpuinitdata rcu_nb = {
569 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
570}; 570};
571 571
diff --git a/kernel/resource.c b/kernel/resource.c
index 0dd3a857579e..46286434af80 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -244,6 +244,7 @@ int find_next_system_ram(struct resource *res)
244 244
245 start = res->start; 245 start = res->start;
246 end = res->end; 246 end = res->end;
247 BUG_ON(start >= end);
247 248
248 read_lock(&resource_lock); 249 read_lock(&resource_lock);
249 for (p = iomem_resource.child; p ; p = p->sibling) { 250 for (p = iomem_resource.child; p ; p = p->sibling) {
@@ -254,15 +255,17 @@ int find_next_system_ram(struct resource *res)
254 p = NULL; 255 p = NULL;
255 break; 256 break;
256 } 257 }
257 if (p->start >= start) 258 if ((p->end >= start) && (p->start < end))
258 break; 259 break;
259 } 260 }
260 read_unlock(&resource_lock); 261 read_unlock(&resource_lock);
261 if (!p) 262 if (!p)
262 return -1; 263 return -1;
263 /* copy data */ 264 /* copy data */
264 res->start = p->start; 265 if (res->start < p->start)
265 res->end = p->end; 266 res->start = p->start;
267 if (res->end > p->end)
268 res->end = p->end;
266 return 0; 269 return 0;
267} 270}
268#endif 271#endif
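
With the widened overlap test and the clipping at the end, find_next_system_ram() now returns the intersection of the caller's request with the first overlapping child resource instead of the resource's own full extent. A hedged caller fragment in the memory-hotplug style (field values illustrative; the IORESOURCE_MEM filter is an assumption about how System RAM is flagged):

	struct resource res;

	res.start = start_pfn << PAGE_SHIFT;
	res.end   = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
	res.flags = IORESOURCE_MEM;		/* assumed filter */

	if (find_next_system_ram(&res) < 0)
		return 0;	/* no System RAM overlaps the request */

	/* res.start/res.end now lie within both the request and the
	 * matched resource, never outside the caller's window */
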
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index d2ef13b485e7..3e13a1e5856f 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -7,6 +7,8 @@
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen 9 * Copyright (C) 2006 Esben Nielsen
10 *
11 * See Documentation/rt-mutex-design.txt for details.
10 */ 12 */
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/module.h> 14#include <linux/module.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index b44b9a43b0fc..a234fbee1238 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4162,10 +4162,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4162 read_unlock_irq(&tasklist_lock); 4162 read_unlock_irq(&tasklist_lock);
4163 return -ESRCH; 4163 return -ESRCH;
4164 } 4164 }
4165 get_task_struct(p);
4166 read_unlock_irq(&tasklist_lock);
4167 retval = sched_setscheduler(p, policy, &lparam); 4165 retval = sched_setscheduler(p, policy, &lparam);
4168 put_task_struct(p); 4166 read_unlock_irq(&tasklist_lock);
4169 4167
4170 return retval; 4168 return retval;
4171} 4169}
@@ -4456,9 +4454,9 @@ asmlinkage long sys_sched_yield(void)
4456 return 0; 4454 return 0;
4457} 4455}
4458 4456
4459static inline int __resched_legal(void) 4457static inline int __resched_legal(int expected_preempt_count)
4460{ 4458{
4461 if (unlikely(preempt_count())) 4459 if (unlikely(preempt_count() != expected_preempt_count))
4462 return 0; 4460 return 0;
4463 if (unlikely(system_state != SYSTEM_RUNNING)) 4461 if (unlikely(system_state != SYSTEM_RUNNING))
4464 return 0; 4462 return 0;
@@ -4484,7 +4482,7 @@ static void __cond_resched(void)
4484 4482
4485int __sched cond_resched(void) 4483int __sched cond_resched(void)
4486{ 4484{
4487 if (need_resched() && __resched_legal()) { 4485 if (need_resched() && __resched_legal(0)) {
4488 __cond_resched(); 4486 __cond_resched();
4489 return 1; 4487 return 1;
4490 } 4488 }
@@ -4510,7 +4508,7 @@ int cond_resched_lock(spinlock_t *lock)
4510 ret = 1; 4508 ret = 1;
4511 spin_lock(lock); 4509 spin_lock(lock);
4512 } 4510 }
4513 if (need_resched() && __resched_legal()) { 4511 if (need_resched() && __resched_legal(1)) {
4514 spin_release(&lock->dep_map, 1, _THIS_IP_); 4512 spin_release(&lock->dep_map, 1, _THIS_IP_);
4515 _raw_spin_unlock(lock); 4513 _raw_spin_unlock(lock);
4516 preempt_enable_no_resched(); 4514 preempt_enable_no_resched();
@@ -4526,7 +4524,7 @@ int __sched cond_resched_softirq(void)
4526{ 4524{
4527 BUG_ON(!in_softirq()); 4525 BUG_ON(!in_softirq());
4528 4526
4529 if (need_resched() && __resched_legal()) { 4527 if (need_resched() && __resched_legal(0)) {
4530 raw_local_irq_disable(); 4528 raw_local_irq_disable();
4531 _local_bh_enable(); 4529 _local_bh_enable();
4532 raw_local_irq_enable(); 4530 raw_local_irq_enable();
@@ -6494,7 +6492,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6494 for (i = 0; i < MAX_NUMNODES; i++) 6492 for (i = 0; i < MAX_NUMNODES; i++)
6495 init_numa_sched_groups_power(sched_group_nodes[i]); 6493 init_numa_sched_groups_power(sched_group_nodes[i]);
6496 6494
6497 init_numa_sched_groups_power(sched_group_allnodes); 6495 if (sched_group_allnodes) {
6496 int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
6497 struct sched_group *sg = &sched_group_allnodes[group];
6498
6499 init_numa_sched_groups_power(sg);
6500 }
6498#endif 6501#endif
6499 6502
6500 /* Attach the domains */ 6503 /* Attach the domains */
@@ -6761,6 +6764,11 @@ void __init sched_init(void)
6761 } 6764 }
6762 6765
6763 set_load_weight(&init_task); 6766 set_load_weight(&init_task);
6767
6768#ifdef CONFIG_RT_MUTEXES
6769 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6770#endif
6771
6764 /* 6772 /*
6765 * The boot idle thread does lazy MMU switching as well: 6773 * The boot idle thread does lazy MMU switching as well:
6766 */ 6774 */
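
The reworked __resched_legal() encodes the preempt count a caller expects at a legal reschedule point: 0 for cond_resched() and cond_resched_softirq(), 1 for cond_resched_lock(), because on CONFIG_PREEMPT kernels the still-held spinlock contributes exactly one level. A hedged usage sketch of cond_resched_lock() with hypothetical names, showing why a caller must assume the lock was dropped when it returns nonzero:

static void scan_all(void)
{
	struct myobj *obj;

restart:
	spin_lock(&mylist_lock);
	list_for_each_entry(obj, &mylist, node) {
		do_light_work(obj);
		if (cond_resched_lock(&mylist_lock)) {
			/* the lock was dropped and retaken: the list
			 * may have changed under us, so start over */
			spin_unlock(&mylist_lock);
			goto restart;
		}
	}
	spin_unlock(&mylist_lock);
}
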
diff --git a/kernel/signal.c b/kernel/signal.c
index 7fe874d12fae..bfdb5686fa3e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -791,22 +791,31 @@ out:
791/* 791/*
792 * Force a signal that the process can't ignore: if necessary 792 * Force a signal that the process can't ignore: if necessary
793 * we unblock the signal and change any SIG_IGN to SIG_DFL. 793 * we unblock the signal and change any SIG_IGN to SIG_DFL.
794 *
795 * Note: If we unblock the signal, we always reset it to SIG_DFL,
796 * since we do not want to have a signal handler that was blocked
797 * be invoked when user space had explicitly blocked it.
798 *
799 * We don't want to have recursive SIGSEGV's etc, for example.
794 */ 800 */
795
796int 801int
797force_sig_info(int sig, struct siginfo *info, struct task_struct *t) 802force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
798{ 803{
799 unsigned long int flags; 804 unsigned long int flags;
800 int ret; 805 int ret, blocked, ignored;
806 struct k_sigaction *action;
801 807
802 spin_lock_irqsave(&t->sighand->siglock, flags); 808 spin_lock_irqsave(&t->sighand->siglock, flags);
803 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { 809 action = &t->sighand->action[sig-1];
804 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; 810 ignored = action->sa.sa_handler == SIG_IGN;
805 } 811 blocked = sigismember(&t->blocked, sig);
806 if (sigismember(&t->blocked, sig)) { 812 if (blocked || ignored) {
807 sigdelset(&t->blocked, sig); 813 action->sa.sa_handler = SIG_DFL;
814 if (blocked) {
815 sigdelset(&t->blocked, sig);
816 recalc_sigpending_tsk(t);
817 }
808 } 818 }
809 recalc_sigpending_tsk(t);
810 ret = specific_send_sig_info(sig, info, t); 819 ret = specific_send_sig_info(sig, info, t);
811 spin_unlock_irqrestore(&t->sighand->siglock, flags); 820 spin_unlock_irqrestore(&t->sighand->siglock, flags);
812 821
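
The rewrite makes force_sig_info() reset the handler to SIG_DFL whenever the signal was blocked, not only when it was ignored, so a handler the task had blocked can never run in response to a forced signal (the recursive-SIGSEGV case called out in the new comment). A typical caller, sketched in the style of an arch page-fault handler (names illustrative):

	struct siginfo info;

	info.si_signo = SIGSEGV;
	info.si_errno = 0;
	info.si_code  = SEGV_MAPERR;
	info.si_addr  = (void __user *)fault_address;
	force_sig_info(SIGSEGV, &info, current);
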
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0f08a84ae307..3789ca98197c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void)
65 * This one is for softirq.c-internal use, 65 * This one is for softirq.c-internal use,
66 * where hardirqs are disabled legitimately: 66 * where hardirqs are disabled legitimately:
67 */ 67 */
68#ifdef CONFIG_TRACE_IRQFLAGS
68static void __local_bh_disable(unsigned long ip) 69static void __local_bh_disable(unsigned long ip)
69{ 70{
70 unsigned long flags; 71 unsigned long flags;
@@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip)
80 trace_softirqs_off(ip); 81 trace_softirqs_off(ip);
81 raw_local_irq_restore(flags); 82 raw_local_irq_restore(flags);
82} 83}
84#else /* !CONFIG_TRACE_IRQFLAGS */
85static inline void __local_bh_disable(unsigned long ip)
86{
87 add_preempt_count(SOFTIRQ_OFFSET);
88 barrier();
89}
90#endif /* CONFIG_TRACE_IRQFLAGS */
83 91
84void local_bh_disable(void) 92void local_bh_disable(void)
85{ 93{
@@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable);
121 129
122void local_bh_enable(void) 130void local_bh_enable(void)
123{ 131{
132#ifdef CONFIG_TRACE_IRQFLAGS
124 unsigned long flags; 133 unsigned long flags;
125 134
126 WARN_ON_ONCE(in_irq()); 135 WARN_ON_ONCE(in_irq());
136#endif
127 WARN_ON_ONCE(irqs_disabled()); 137 WARN_ON_ONCE(irqs_disabled());
128 138
139#ifdef CONFIG_TRACE_IRQFLAGS
129 local_irq_save(flags); 140 local_irq_save(flags);
141#endif
130 /* 142 /*
131 * Are softirqs going to be turned on now: 143 * Are softirqs going to be turned on now:
132 */ 144 */
@@ -142,18 +154,22 @@ void local_bh_enable(void)
142 do_softirq(); 154 do_softirq();
143 155
144 dec_preempt_count(); 156 dec_preempt_count();
157#ifdef CONFIG_TRACE_IRQFLAGS
145 local_irq_restore(flags); 158 local_irq_restore(flags);
159#endif
146 preempt_check_resched(); 160 preempt_check_resched();
147} 161}
148EXPORT_SYMBOL(local_bh_enable); 162EXPORT_SYMBOL(local_bh_enable);
149 163
150void local_bh_enable_ip(unsigned long ip) 164void local_bh_enable_ip(unsigned long ip)
151{ 165{
166#ifdef CONFIG_TRACE_IRQFLAGS
152 unsigned long flags; 167 unsigned long flags;
153 168
154 WARN_ON_ONCE(in_irq()); 169 WARN_ON_ONCE(in_irq());
155 170
156 local_irq_save(flags); 171 local_irq_save(flags);
172#endif
157 /* 173 /*
158 * Are softirqs going to be turned on now: 174 * Are softirqs going to be turned on now:
159 */ 175 */
@@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip)
169 do_softirq(); 185 do_softirq();
170 186
171 dec_preempt_count(); 187 dec_preempt_count();
188#ifdef CONFIG_TRACE_IRQFLAGS
172 local_irq_restore(flags); 189 local_irq_restore(flags);
190#endif
173 preempt_check_resched(); 191 preempt_check_resched();
174} 192}
175EXPORT_SYMBOL(local_bh_enable_ip); 193EXPORT_SYMBOL(local_bh_enable_ip);
@@ -547,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu)
547} 565}
548#endif /* CONFIG_HOTPLUG_CPU */ 566#endif /* CONFIG_HOTPLUG_CPU */
549 567
550static int __devinit cpu_callback(struct notifier_block *nfb, 568static int __cpuinit cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 569 unsigned long action,
552 void *hcpu) 570 void *hcpu)
553{ 571{
@@ -587,7 +605,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
587 return NOTIFY_OK; 605 return NOTIFY_OK;
588} 606}
589 607
590static struct notifier_block __devinitdata cpu_nfb = { 608static struct notifier_block __cpuinitdata cpu_nfb = {
591 .notifier_call = cpu_callback 609 .notifier_call = cpu_callback
592}; 610};
593 611
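
After this change the irq-save/restore bracketing in the bh-disable/enable paths exists only when CONFIG_TRACE_IRQFLAGS has to record the softirq-off transitions; otherwise bumping the preempt count behind a compiler barrier suffices. The exported entry point keeps the same shape in both configurations, e.g. (a sketch consistent with the hunks; the body is not shown in the diff):

void local_bh_disable(void)
{
	__local_bh_disable((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(local_bh_disable);
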
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 6b76caa22981..03e6a2b0b787 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int __devinit 107static int __cpuinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
142 return NOTIFY_OK; 142 return NOTIFY_OK;
143} 143}
144 144
145static struct notifier_block __devinitdata cpu_nfb = { 145static struct notifier_block __cpuinitdata cpu_nfb = {
146 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
147}; 147};
148 148
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index bfd6ad9c0330..fb524b009eef 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(_write_trylock);
72 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 72 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
73 */ 73 */
74#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ 74#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
75 defined(CONFIG_PROVE_LOCKING) 75 defined(CONFIG_DEBUG_LOCK_ALLOC)
76 76
77void __lockfunc _read_lock(rwlock_t *lock) 77void __lockfunc _read_lock(rwlock_t *lock)
78{ 78{
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..51cacd111dbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -111,7 +111,6 @@ static int stop_machine(void)
111 /* If some failed, kill them all. */ 111 /* If some failed, kill them all. */
112 if (ret < 0) { 112 if (ret < 0) {
113 stopmachine_set_state(STOPMACHINE_EXIT); 113 stopmachine_set_state(STOPMACHINE_EXIT);
114 up(&stopmachine_mutex);
115 return ret; 114 return ret;
116 } 115 }
117 116
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f45179ce028e..e78187657330 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
121/* 121/*
122 * Send taskstats data in @skb to listeners registered for @cpu's exit data 122 * Send taskstats data in @skb to listeners registered for @cpu's exit data
123 */ 123 */
124static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 124static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
125{ 125{
126 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 126 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
127 struct listener_list *listeners; 127 struct listener_list *listeners;
128 struct listener *s, *tmp; 128 struct listener *s, *tmp;
129 struct sk_buff *skb_next, *skb_cur = skb; 129 struct sk_buff *skb_next, *skb_cur = skb;
130 void *reply = genlmsg_data(genlhdr); 130 void *reply = genlmsg_data(genlhdr);
131 int rc, ret, delcount = 0; 131 int rc, delcount = 0;
132 132
133 rc = genlmsg_end(skb, reply); 133 rc = genlmsg_end(skb, reply);
134 if (rc < 0) { 134 if (rc < 0) {
135 nlmsg_free(skb); 135 nlmsg_free(skb);
136 return rc; 136 return;
137 } 137 }
138 138
139 rc = 0; 139 rc = 0;
140 listeners = &per_cpu(listener_array, cpu); 140 listeners = &per_cpu(listener_array, cpu);
141 down_read(&listeners->sem); 141 down_read(&listeners->sem);
142 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 142 list_for_each_entry(s, &listeners->list, list) {
143 skb_next = NULL; 143 skb_next = NULL;
144 if (!list_is_last(&s->list, &listeners->list)) { 144 if (!list_is_last(&s->list, &listeners->list)) {
145 skb_next = skb_clone(skb_cur, GFP_KERNEL); 145 skb_next = skb_clone(skb_cur, GFP_KERNEL);
146 if (!skb_next) { 146 if (!skb_next)
147 nlmsg_free(skb_cur);
148 rc = -ENOMEM;
149 break; 147 break;
150 }
151 } 148 }
152 ret = genlmsg_unicast(skb_cur, s->pid); 149 rc = genlmsg_unicast(skb_cur, s->pid);
153 if (ret == -ECONNREFUSED) { 150 if (rc == -ECONNREFUSED) {
154 s->valid = 0; 151 s->valid = 0;
155 delcount++; 152 delcount++;
156 rc = ret;
157 } 153 }
158 skb_cur = skb_next; 154 skb_cur = skb_next;
159 } 155 }
160 up_read(&listeners->sem); 156 up_read(&listeners->sem);
161 157
158 if (skb_cur)
159 nlmsg_free(skb_cur);
160
162 if (!delcount) 161 if (!delcount)
163 return rc; 162 return;
164 163
165 /* Delete invalidated entries */ 164 /* Delete invalidated entries */
166 down_write(&listeners->sem); 165 down_write(&listeners->sem);
@@ -171,13 +170,12 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
171 } 170 }
172 } 171 }
173 up_write(&listeners->sem); 172 up_write(&listeners->sem);
174 return rc;
175} 173}
176 174
177static int fill_pid(pid_t pid, struct task_struct *pidtsk, 175static int fill_pid(pid_t pid, struct task_struct *pidtsk,
178 struct taskstats *stats) 176 struct taskstats *stats)
179{ 177{
180 int rc; 178 int rc = 0;
181 struct task_struct *tsk = pidtsk; 179 struct task_struct *tsk = pidtsk;
182 180
183 if (!pidtsk) { 181 if (!pidtsk) {
@@ -196,12 +194,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
196 * Each accounting subsystem adds calls to its functions to 194 * Each accounting subsystem adds calls to its functions to
197 * fill in relevant parts of struct taskstats as follows 195 * fill in relevant parts of struct taskstats as follows
198 * 196 *
199 * rc = per-task-foo(stats, tsk); 197 * per-task-foo(stats, tsk);
200 * if (rc)
201 * goto err;
202 */ 198 */
203 199
204 rc = delayacct_add_tsk(stats, tsk); 200 delayacct_add_tsk(stats, tsk);
205 stats->version = TASKSTATS_VERSION; 201 stats->version = TASKSTATS_VERSION;
206 202
207 /* Define err: label here if needed */ 203 /* Define err: label here if needed */
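
The send_cpu_listeners() rework leans on socket-buffer ownership rules: genlmsg_unicast() consumes its skb (netlink delivery normally frees the buffer even on error), every listener except the last gets a clone, and the single trailing nlmsg_free(skb_cur) now covers both the clone-failure break and a final undelivered buffer. A condensed sketch of that ownership flow, abridged from the hunk above with the comments added:

	list_for_each_entry(s, &listeners->list, list) {
		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next)
				break;		/* skb_cur freed below */
		}
		rc = genlmsg_unicast(skb_cur, s->pid);	/* consumes skb_cur */
		skb_cur = skb_next;
	}
	if (skb_cur)
		nlmsg_free(skb_cur);	/* last buffer never handed off */
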
diff --git a/kernel/timer.c b/kernel/timer.c
index 05809c2e2fd6..1d7dd6267c2d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
84 84
85tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases); 86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; 87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
88 88
89static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
90 struct timer_list *timer) 90 struct timer_list *timer)
@@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
408 * This function cascades all vectors and executes all expired timer 408 * This function cascades all vectors and executes all expired timer
409 * vectors. 409 * vectors.
410 */ 410 */
411#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK 411#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
412 412
413static inline void __run_timers(tvec_base_t *base) 413static inline void __run_timers(tvec_base_t *base)
414{ 414{
@@ -1324,46 +1324,19 @@ asmlinkage long sys_getpid(void)
1324} 1324}
1325 1325
1326/* 1326/*
1327 * Accessing ->group_leader->real_parent is not SMP-safe, it could 1327 * Accessing ->real_parent is not SMP-safe, it could
1328 * change from under us. However, rather than getting any lock 1328 * change from under us. However, we can use a stale
1329 * we can use an optimistic algorithm: get the parent 1329 * value of ->real_parent under rcu_read_lock(), see
1330 * pid, and go back and check that the parent is still 1330 * release_task()->call_rcu(delayed_put_task_struct).
1331 * the same. If it has changed (which is extremely unlikely
1332 * indeed), we just try again..
1333 *
1334 * NOTE! This depends on the fact that even if we _do_
1335 * get an old value of "parent", we can happily dereference
1336 * the pointer (it was and remains a dereferencable kernel pointer
1337 * no matter what): we just can't necessarily trust the result
1338 * until we know that the parent pointer is valid.
1339 *
1340 * NOTE2: ->group_leader never changes from under us.
1341 */ 1331 */
1342asmlinkage long sys_getppid(void) 1332asmlinkage long sys_getppid(void)
1343{ 1333{
1344 int pid; 1334 int pid;
1345 struct task_struct *me = current;
1346 struct task_struct *parent;
1347 1335
1348 parent = me->group_leader->real_parent; 1336 rcu_read_lock();
1349 for (;;) { 1337 pid = rcu_dereference(current->real_parent)->tgid;
1350 pid = parent->tgid; 1338 rcu_read_unlock();
1351#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1352{
1353 struct task_struct *old = parent;
1354 1339
1355 /*
1356 * Make sure we read the pid before re-reading the
1357 * parent pointer:
1358 */
1359 smp_rmb();
1360 parent = me->group_leader->real_parent;
1361 if (old != parent)
1362 continue;
1363}
1364#endif
1365 break;
1366 }
1367 return pid; 1340 return pid;
1368} 1341}
1369 1342
@@ -1688,7 +1661,7 @@ static void __devinit migrate_timers(int cpu)
1688} 1661}
1689#endif /* CONFIG_HOTPLUG_CPU */ 1662#endif /* CONFIG_HOTPLUG_CPU */
1690 1663
1691static int __devinit timer_cpu_notify(struct notifier_block *self, 1664static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1692 unsigned long action, void *hcpu) 1665 unsigned long action, void *hcpu)
1693{ 1666{
1694 long cpu = (long)hcpu; 1667 long cpu = (long)hcpu;
@@ -1708,7 +1681,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1708 return NOTIFY_OK; 1681 return NOTIFY_OK;
1709} 1682}
1710 1683
1711static struct notifier_block __devinitdata timers_nb = { 1684static struct notifier_block __cpuinitdata timers_nb = {
1712 .notifier_call = timer_cpu_notify, 1685 .notifier_call = timer_cpu_notify,
1713}; 1686};
1714 1687
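
The INDEX() change is routine macro hygiene: the parameter and the full expansion both gain parentheses. Two concrete failure modes of the old form, under hypothetical uses of the macro:

#define INDEX_OLD(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK

	/* INDEX_OLD(i + 1): the shift count expands to
	 * TVR_BITS + i + 1 * TVN_BITS, so the '+ 1' escapes the
	 * multiplication by TVN_BITS. */

	/* INDEX_OLD(0) + 1: '+' binds tighter than '&', so this
	 * evaluates as (...) & (TVN_MASK + 1): the '+ 1' is absorbed
	 * into the mask instead of being added to the result. */
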
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eebb1d839235..835fe28b87a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,7 +68,7 @@ struct workqueue_struct {
68 68
69/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 69/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
70 threads to each one as cpus come/go. */ 70 threads to each one as cpus come/go. */
71static DEFINE_SPINLOCK(workqueue_lock); 71static DEFINE_MUTEX(workqueue_mutex);
72static LIST_HEAD(workqueues); 72static LIST_HEAD(workqueues);
73 73
74static int singlethread_cpu; 74static int singlethread_cpu;
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
93 spin_unlock_irqrestore(&cwq->lock, flags); 93 spin_unlock_irqrestore(&cwq->lock, flags);
94} 94}
95 95
96/* 96/**
97 * Queue work on a workqueue. Return non-zero if it was successfully 97 * queue_work - queue work on a workqueue
98 * added. 98 * @wq: workqueue to use
99 * @work: work to queue
100 *
101 * Returns non-zero if it was successfully added.
99 * 102 *
100 * We queue the work to the CPU it was submitted, but there is no 103 * We queue the work to the CPU it was submitted, but there is no
101 * guarantee that it will be processed by that CPU. 104 * guarantee that it will be processed by that CPU.
@@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
128 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 131 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
129} 132}
130 133
134/**
135 * queue_delayed_work - queue work on a workqueue after delay
136 * @wq: workqueue to use
137 * @work: work to queue
138 * @delay: number of jiffies to wait before queueing
139 *
140 * Returns non-zero if it was successfully added.
141 */
131int fastcall queue_delayed_work(struct workqueue_struct *wq, 142int fastcall queue_delayed_work(struct workqueue_struct *wq,
132 struct work_struct *work, unsigned long delay) 143 struct work_struct *work, unsigned long delay)
133{ 144{
@@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
150} 161}
151EXPORT_SYMBOL_GPL(queue_delayed_work); 162EXPORT_SYMBOL_GPL(queue_delayed_work);
152 163
164/**
165 * queue_delayed_work_on - queue work on specific CPU after delay
166 * @cpu: CPU number to execute work on
167 * @wq: workqueue to use
168 * @work: work to queue
169 * @delay: number of jiffies to wait before queueing
170 *
171 * Returns non-zero if it was successfully added.
172 */
153int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 173int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
154 struct work_struct *work, unsigned long delay) 174 struct work_struct *work, unsigned long delay)
155{ 175{
@@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
275 } 295 }
276} 296}
277 297
278/* 298/**
279 * flush_workqueue - ensure that any scheduled work has run to completion. 299 * flush_workqueue - ensure that any scheduled work has run to completion.
300 * @wq: workqueue to flush
280 * 301 *
281 * Forces execution of the workqueue and blocks until its completion. 302 * Forces execution of the workqueue and blocks until its completion.
282 * This is typically used in driver shutdown handlers. 303 * This is typically used in driver shutdown handlers.
@@ -299,10 +320,10 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
299 } else { 320 } else {
300 int cpu; 321 int cpu;
301 322
302 lock_cpu_hotplug(); 323 mutex_lock(&workqueue_mutex);
303 for_each_online_cpu(cpu) 324 for_each_online_cpu(cpu)
304 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 325 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
305 unlock_cpu_hotplug(); 326 mutex_unlock(&workqueue_mutex);
306 } 327 }
307} 328}
308EXPORT_SYMBOL_GPL(flush_workqueue); 329EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -350,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
350 } 371 }
351 372
352 wq->name = name; 373 wq->name = name;
353 /* We don't need the distraction of CPUs appearing and vanishing. */ 374 mutex_lock(&workqueue_mutex);
354 lock_cpu_hotplug();
355 if (singlethread) { 375 if (singlethread) {
356 INIT_LIST_HEAD(&wq->list); 376 INIT_LIST_HEAD(&wq->list);
357 p = create_workqueue_thread(wq, singlethread_cpu); 377 p = create_workqueue_thread(wq, singlethread_cpu);
@@ -360,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
360 else 380 else
361 wake_up_process(p); 381 wake_up_process(p);
362 } else { 382 } else {
363 spin_lock(&workqueue_lock);
364 list_add(&wq->list, &workqueues); 383 list_add(&wq->list, &workqueues);
365 spin_unlock(&workqueue_lock);
366 for_each_online_cpu(cpu) { 384 for_each_online_cpu(cpu) {
367 p = create_workqueue_thread(wq, cpu); 385 p = create_workqueue_thread(wq, cpu);
368 if (p) { 386 if (p) {
@@ -372,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
372 destroy = 1; 390 destroy = 1;
373 } 391 }
374 } 392 }
375 unlock_cpu_hotplug(); 393 mutex_unlock(&workqueue_mutex);
376 394
377 /* 395 /*
378 * Was there any error during startup? If yes then clean up: 396 * Was there any error during startup? If yes then clean up:
@@ -400,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
400 kthread_stop(p); 418 kthread_stop(p);
401} 419}
402 420
421/**
422 * destroy_workqueue - safely terminate a workqueue
423 * @wq: target workqueue
424 *
425 * Safely destroy a workqueue. All work currently pending will be done first.
426 */
403void destroy_workqueue(struct workqueue_struct *wq) 427void destroy_workqueue(struct workqueue_struct *wq)
404{ 428{
405 int cpu; 429 int cpu;
@@ -407,17 +431,15 @@ void destroy_workqueue(struct workqueue_struct *wq)
407 flush_workqueue(wq); 431 flush_workqueue(wq);
408 432
409 /* We don't need the distraction of CPUs appearing and vanishing. */ 433 /* We don't need the distraction of CPUs appearing and vanishing. */
410 lock_cpu_hotplug(); 434 mutex_lock(&workqueue_mutex);
411 if (is_single_threaded(wq)) 435 if (is_single_threaded(wq))
412 cleanup_workqueue_thread(wq, singlethread_cpu); 436 cleanup_workqueue_thread(wq, singlethread_cpu);
413 else { 437 else {
414 for_each_online_cpu(cpu) 438 for_each_online_cpu(cpu)
415 cleanup_workqueue_thread(wq, cpu); 439 cleanup_workqueue_thread(wq, cpu);
416 spin_lock(&workqueue_lock);
417 list_del(&wq->list); 440 list_del(&wq->list);
418 spin_unlock(&workqueue_lock);
419 } 441 }
420 unlock_cpu_hotplug(); 442 mutex_unlock(&workqueue_mutex);
421 free_percpu(wq->cpu_wq); 443 free_percpu(wq->cpu_wq);
422 kfree(wq); 444 kfree(wq);
423} 445}
@@ -425,18 +447,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
425 447
426static struct workqueue_struct *keventd_wq; 448static struct workqueue_struct *keventd_wq;
427 449
450/**
451 * schedule_work - put work task in global workqueue
452 * @work: job to be done
453 *
454 * This puts a job in the kernel-global workqueue.
455 */
428int fastcall schedule_work(struct work_struct *work) 456int fastcall schedule_work(struct work_struct *work)
429{ 457{
430 return queue_work(keventd_wq, work); 458 return queue_work(keventd_wq, work);
431} 459}
432EXPORT_SYMBOL(schedule_work); 460EXPORT_SYMBOL(schedule_work);
433 461
462/**
463 * schedule_delayed_work - put work task in global workqueue after delay
464 * @work: job to be done
465 * @delay: number of jiffies to wait
466 *
467 * After waiting for a given time this puts a job in the kernel-global
468 * workqueue.
469 */
434int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 470int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
435{ 471{
436 return queue_delayed_work(keventd_wq, work, delay); 472 return queue_delayed_work(keventd_wq, work, delay);
437} 473}
438EXPORT_SYMBOL(schedule_delayed_work); 474EXPORT_SYMBOL(schedule_delayed_work);
439 475
476/**
477 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
478 * @cpu: cpu to use
479 * @work: job to be done
480 * @delay: number of jiffies to wait
481 *
482 * After waiting for a given time this puts a job in the kernel-global
483 * workqueue on the specified CPU.
484 */
440int schedule_delayed_work_on(int cpu, 485int schedule_delayed_work_on(int cpu,
441 struct work_struct *work, unsigned long delay) 486 struct work_struct *work, unsigned long delay)
442{ 487{
@@ -465,11 +510,13 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
465 if (!works) 510 if (!works)
466 return -ENOMEM; 511 return -ENOMEM;
467 512
513 mutex_lock(&workqueue_mutex);
468 for_each_online_cpu(cpu) { 514 for_each_online_cpu(cpu) {
469 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 515 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
470 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 516 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
471 per_cpu_ptr(works, cpu)); 517 per_cpu_ptr(works, cpu));
472 } 518 }
519 mutex_unlock(&workqueue_mutex);
473 flush_workqueue(keventd_wq); 520 flush_workqueue(keventd_wq);
474 free_percpu(works); 521 free_percpu(works);
475 return 0; 522 return 0;
@@ -585,6 +632,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
585 632
586 switch (action) { 633 switch (action) {
587 case CPU_UP_PREPARE: 634 case CPU_UP_PREPARE:
635 mutex_lock(&workqueue_mutex);
588 /* Create a new workqueue thread for it. */ 636 /* Create a new workqueue thread for it. */
589 list_for_each_entry(wq, &workqueues, list) { 637 list_for_each_entry(wq, &workqueues, list) {
590 if (!create_workqueue_thread(wq, hotcpu)) { 638 if (!create_workqueue_thread(wq, hotcpu)) {
@@ -603,6 +651,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
603 kthread_bind(cwq->thread, hotcpu); 651 kthread_bind(cwq->thread, hotcpu);
604 wake_up_process(cwq->thread); 652 wake_up_process(cwq->thread);
605 } 653 }
654 mutex_unlock(&workqueue_mutex);
606 break; 655 break;
607 656
608 case CPU_UP_CANCELED: 657 case CPU_UP_CANCELED:
@@ -614,6 +663,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
614 any_online_cpu(cpu_online_map)); 663 any_online_cpu(cpu_online_map));
615 cleanup_workqueue_thread(wq, hotcpu); 664 cleanup_workqueue_thread(wq, hotcpu);
616 } 665 }
666 mutex_unlock(&workqueue_mutex);
667 break;
668
669 case CPU_DOWN_PREPARE:
670 mutex_lock(&workqueue_mutex);
671 break;
672
673 case CPU_DOWN_FAILED:
674 mutex_unlock(&workqueue_mutex);
617 break; 675 break;
618 676
619 case CPU_DEAD: 677 case CPU_DEAD:
@@ -621,6 +679,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
621 cleanup_workqueue_thread(wq, hotcpu); 679 cleanup_workqueue_thread(wq, hotcpu);
622 list_for_each_entry(wq, &workqueues, list) 680 list_for_each_entry(wq, &workqueues, list)
623 take_over_work(wq, hotcpu); 681 take_over_work(wq, hotcpu);
682 mutex_unlock(&workqueue_mutex);
624 break; 683 break;
625 } 684 }
626 685
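
Taken together, the workqueue changes replace lock_cpu_hotplug()/workqueue_lock with a single workqueue_mutex that the hotplug notifier itself holds across a transition: each *_PREPARE step takes the mutex and the matching completion or failure step releases it, so the workqueues list and the per-CPU threads cannot change while a CPU is coming or going. The resulting bracket, abridged from the hunks above to the locking skeleton (the case label for the bind/wake step is inferred to be CPU_ONLINE, which the hunks do not show):

	switch (action) {
	case CPU_UP_PREPARE:
		mutex_lock(&workqueue_mutex);
		/* create the new CPU's worker threads */
		break;
	case CPU_ONLINE:
		/* bind and wake the threads */
		mutex_unlock(&workqueue_mutex);
		break;
	case CPU_UP_CANCELED:
		/* clean up the half-created threads */
		mutex_unlock(&workqueue_mutex);
		break;
	case CPU_DOWN_PREPARE:
		mutex_lock(&workqueue_mutex);
		break;
	case CPU_DOWN_FAILED:
		mutex_unlock(&workqueue_mutex);
		break;
	case CPU_DEAD:
		/* take over the dead CPU's pending work */
		mutex_unlock(&workqueue_mutex);
		break;
	}
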