Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile             3
-rw-r--r--  kernel/acct.c              12
-rw-r--r--  kernel/audit.c            162
-rw-r--r--  kernel/audit.h             10
-rw-r--r--  kernel/auditfilter.c      289
-rw-r--r--  kernel/auditsc.c          269
-rw-r--r--  kernel/compat.c            23
-rw-r--r--  kernel/cpu.c               29
-rw-r--r--  kernel/cpuset.c            92
-rw-r--r--  kernel/exit.c             151
-rw-r--r--  kernel/extable.c            2
-rw-r--r--  kernel/fork.c             151
-rw-r--r--  kernel/futex.c            174
-rw-r--r--  kernel/futex_compat.c     144
-rw-r--r--  kernel/hrtimer.c           60
-rw-r--r--  kernel/irq/Makefile         3
-rw-r--r--  kernel/irq/manage.c         6
-rw-r--r--  kernel/irq/migration.c      5
-rw-r--r--  kernel/kmod.c               2
-rw-r--r--  kernel/kprobes.c            3
-rw-r--r--  kernel/module.c            35
-rw-r--r--  kernel/panic.c              5
-rw-r--r--  kernel/params.c             2
-rw-r--r--  kernel/pid.c              250
-rw-r--r--  kernel/power/Kconfig        2
-rw-r--r--  kernel/power/main.c         2
-rw-r--r--  kernel/power/pm.c          20
-rw-r--r--  kernel/power/process.c      3
-rw-r--r--  kernel/power/snapshot.c     9
-rw-r--r--  kernel/printk.c             6
-rw-r--r--  kernel/profile.c           55
-rw-r--r--  kernel/ptrace.c            75
-rw-r--r--  kernel/rcupdate.c          23
-rw-r--r--  kernel/rcutorture.c         4
-rw-r--r--  kernel/sched.c            240
-rw-r--r--  kernel/signal.c           356
-rw-r--r--  kernel/softirq.c            4
-rw-r--r--  kernel/softlockup.c         6
-rw-r--r--  kernel/sys.c              419
-rw-r--r--  kernel/sys_ni.c            16
-rw-r--r--  kernel/time.c               8
-rw-r--r--  kernel/timer.c            146
-rw-r--r--  kernel/uid16.c             59
-rw-r--r--  kernel/workqueue.c          2
44 files changed, 2121 insertions, 1216 deletions
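
Most of the audit.c and auditfilter.c hunks below repeat one pattern: when the netlink sender carries an SELinux SID, convert it to a context string with selinux_ctxid_to_string() and log it as subj=, falling back to the raw ssid= (or the plain auid=-only message) when no SID or no context string is available. The following is a minimal editorial sketch of that pattern only, assuming the 2.6.x interfaces used in these hunks (selinux_ctxid_to_string(), audit_log()); the helper name audit_log_config_change() is hypothetical, and the patch itself open-codes this logic at each call site rather than using such a helper.

	#include <linux/audit.h>
	#include <linux/selinux.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	/* Hypothetical helper illustrating the logging pattern added below. */
	static void audit_log_config_change(const char *what, int val, int old,
					    uid_t loginuid, u32 sid)
	{
		if (sid) {
			char *ctx = NULL;
			u32 len;

			/* Translate the sender's SID into a printable context. */
			if (selinux_ctxid_to_string(sid, &ctx, &len))
				/* Conversion failed: fall back to the raw SID. */
				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
					  "%s=%d old=%d by auid=%u ssid=%u",
					  what, val, old, loginuid, sid);
			else
				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
					  "%s=%d old=%d by auid=%u subj=%s",
					  what, val, old, loginuid, ctx);
			kfree(ctx);
		} else
			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
				  "%s=%d old=%d by auid=%u",
				  what, val, old, loginuid);
	}
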
diff --git a/kernel/Makefile b/kernel/Makefile
index ff1c11dc12..58908f9d15 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
12 12
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
14obj-$(CONFIG_FUTEX) += futex.o 14obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o
17endif
15obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
16obj-$(CONFIG_SMP) += cpu.o spinlock.o 19obj-$(CONFIG_SMP) += cpu.o spinlock.o
17obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51..b327f4d201 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
449 /* calculate run_time in nsec*/ 449 /* calculate run_time in nsec*/
450 do_posix_clock_monotonic_gettime(&uptime); 450 do_posix_clock_monotonic_gettime(&uptime);
451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
452 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 452 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
453 + current->start_time.tv_nsec; 453 + current->group_leader->start_time.tv_nsec;
454 /* convert nsec -> AHZ */ 454 /* convert nsec -> AHZ */
455 elapsed = nsec_to_AHZ(run_time); 455 elapsed = nsec_to_AHZ(run_time);
456#if ACCT_VERSION==3 456#if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 469#endif
470 do_div(elapsed, AHZ); 470 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 471 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, 472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime)); 473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); 474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, 475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime)); 476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); 477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 478 /* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt + 524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->group_leader->min_flt); 525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt + 526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->group_leader->maj_flt); 527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 528 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode; 529 ac.ac_exitcode = exitcode;
530 530
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b..df57b493e1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -55,6 +55,9 @@
55#include <net/netlink.h> 55#include <net/netlink.h>
56#include <linux/skbuff.h> 56#include <linux/skbuff.h>
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h>
59
60#include "audit.h"
58 61
59/* No auditing will take place until audit_initialized != 0. 62/* No auditing will take place until audit_initialized != 0.
60 * (Initialization happens after skb_init is called.) */ 63 * (Initialization happens after skb_init is called.) */
@@ -227,49 +230,103 @@ void audit_log_lost(const char *message)
227 } 230 }
228} 231}
229 232
230static int audit_set_rate_limit(int limit, uid_t loginuid) 233static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
231{ 234{
232 int old = audit_rate_limit; 235 int old = audit_rate_limit;
233 audit_rate_limit = limit; 236
234 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 237 if (sid) {
238 char *ctx = NULL;
239 u32 len;
240 int rc;
241 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
242 return rc;
243 else
244 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
245 "audit_rate_limit=%d old=%d by auid=%u subj=%s",
246 limit, old, loginuid, ctx);
247 kfree(ctx);
248 } else
249 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
235 "audit_rate_limit=%d old=%d by auid=%u", 250 "audit_rate_limit=%d old=%d by auid=%u",
236 audit_rate_limit, old, loginuid); 251 limit, old, loginuid);
252 audit_rate_limit = limit;
237 return old; 253 return old;
238} 254}
239 255
240static int audit_set_backlog_limit(int limit, uid_t loginuid) 256static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
241{ 257{
242 int old = audit_backlog_limit; 258 int old = audit_backlog_limit;
243 audit_backlog_limit = limit; 259
244 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 260 if (sid) {
261 char *ctx = NULL;
262 u32 len;
263 int rc;
264 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
265 return rc;
266 else
267 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
268 "audit_backlog_limit=%d old=%d by auid=%u subj=%s",
269 limit, old, loginuid, ctx);
270 kfree(ctx);
271 } else
272 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
245 "audit_backlog_limit=%d old=%d by auid=%u", 273 "audit_backlog_limit=%d old=%d by auid=%u",
246 audit_backlog_limit, old, loginuid); 274 limit, old, loginuid);
275 audit_backlog_limit = limit;
247 return old; 276 return old;
248} 277}
249 278
250static int audit_set_enabled(int state, uid_t loginuid) 279static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
251{ 280{
252 int old = audit_enabled; 281 int old = audit_enabled;
282
253 if (state != 0 && state != 1) 283 if (state != 0 && state != 1)
254 return -EINVAL; 284 return -EINVAL;
255 audit_enabled = state; 285
256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 286 if (sid) {
287 char *ctx = NULL;
288 u32 len;
289 int rc;
290 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
291 return rc;
292 else
293 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
294 "audit_enabled=%d old=%d by auid=%u subj=%s",
295 state, old, loginuid, ctx);
296 kfree(ctx);
297 } else
298 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
257 "audit_enabled=%d old=%d by auid=%u", 299 "audit_enabled=%d old=%d by auid=%u",
258 audit_enabled, old, loginuid); 300 state, old, loginuid);
301 audit_enabled = state;
259 return old; 302 return old;
260} 303}
261 304
262static int audit_set_failure(int state, uid_t loginuid) 305static int audit_set_failure(int state, uid_t loginuid, u32 sid)
263{ 306{
264 int old = audit_failure; 307 int old = audit_failure;
308
265 if (state != AUDIT_FAIL_SILENT 309 if (state != AUDIT_FAIL_SILENT
266 && state != AUDIT_FAIL_PRINTK 310 && state != AUDIT_FAIL_PRINTK
267 && state != AUDIT_FAIL_PANIC) 311 && state != AUDIT_FAIL_PANIC)
268 return -EINVAL; 312 return -EINVAL;
269 audit_failure = state; 313
270 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 314 if (sid) {
315 char *ctx = NULL;
316 u32 len;
317 int rc;
318 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
319 return rc;
320 else
321 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
322 "audit_failure=%d old=%d by auid=%u subj=%s",
323 state, old, loginuid, ctx);
324 kfree(ctx);
325 } else
326 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
271 "audit_failure=%d old=%d by auid=%u", 327 "audit_failure=%d old=%d by auid=%u",
272 audit_failure, old, loginuid); 328 state, old, loginuid);
329 audit_failure = state;
273 return old; 330 return old;
274} 331}
275 332
@@ -387,7 +444,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
387 444
388static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 445static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
389{ 446{
390 u32 uid, pid, seq; 447 u32 uid, pid, seq, sid;
391 void *data; 448 void *data;
392 struct audit_status *status_get, status_set; 449 struct audit_status *status_get, status_set;
393 int err; 450 int err;
@@ -413,6 +470,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
413 pid = NETLINK_CREDS(skb)->pid; 470 pid = NETLINK_CREDS(skb)->pid;
414 uid = NETLINK_CREDS(skb)->uid; 471 uid = NETLINK_CREDS(skb)->uid;
415 loginuid = NETLINK_CB(skb).loginuid; 472 loginuid = NETLINK_CB(skb).loginuid;
473 sid = NETLINK_CB(skb).sid;
416 seq = nlh->nlmsg_seq; 474 seq = nlh->nlmsg_seq;
417 data = NLMSG_DATA(nlh); 475 data = NLMSG_DATA(nlh);
418 476
@@ -433,25 +491,43 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
433 return -EINVAL; 491 return -EINVAL;
434 status_get = (struct audit_status *)data; 492 status_get = (struct audit_status *)data;
435 if (status_get->mask & AUDIT_STATUS_ENABLED) { 493 if (status_get->mask & AUDIT_STATUS_ENABLED) {
436 err = audit_set_enabled(status_get->enabled, loginuid); 494 err = audit_set_enabled(status_get->enabled,
495 loginuid, sid);
437 if (err < 0) return err; 496 if (err < 0) return err;
438 } 497 }
439 if (status_get->mask & AUDIT_STATUS_FAILURE) { 498 if (status_get->mask & AUDIT_STATUS_FAILURE) {
440 err = audit_set_failure(status_get->failure, loginuid); 499 err = audit_set_failure(status_get->failure,
500 loginuid, sid);
441 if (err < 0) return err; 501 if (err < 0) return err;
442 } 502 }
443 if (status_get->mask & AUDIT_STATUS_PID) { 503 if (status_get->mask & AUDIT_STATUS_PID) {
444 int old = audit_pid; 504 int old = audit_pid;
505 if (sid) {
506 char *ctx = NULL;
507 u32 len;
508 int rc;
509 if ((rc = selinux_ctxid_to_string(
510 sid, &ctx, &len)))
511 return rc;
512 else
513 audit_log(NULL, GFP_KERNEL,
514 AUDIT_CONFIG_CHANGE,
515 "audit_pid=%d old=%d by auid=%u subj=%s",
516 status_get->pid, old,
517 loginuid, ctx);
518 kfree(ctx);
519 } else
520 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
521 "audit_pid=%d old=%d by auid=%u",
522 status_get->pid, old, loginuid);
445 audit_pid = status_get->pid; 523 audit_pid = status_get->pid;
446 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
447 "audit_pid=%d old=%d by auid=%u",
448 audit_pid, old, loginuid);
449 } 524 }
450 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 525 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
451 audit_set_rate_limit(status_get->rate_limit, loginuid); 526 audit_set_rate_limit(status_get->rate_limit,
527 loginuid, sid);
452 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 528 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
453 audit_set_backlog_limit(status_get->backlog_limit, 529 audit_set_backlog_limit(status_get->backlog_limit,
454 loginuid); 530 loginuid, sid);
455 break; 531 break;
456 case AUDIT_USER: 532 case AUDIT_USER:
457 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 533 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
@@ -465,8 +541,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
465 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 541 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
466 if (ab) { 542 if (ab) {
467 audit_log_format(ab, 543 audit_log_format(ab,
468 "user pid=%d uid=%u auid=%u msg='%.1024s'", 544 "user pid=%d uid=%u auid=%u",
469 pid, uid, loginuid, (char *)data); 545 pid, uid, loginuid);
546 if (sid) {
547 char *ctx = NULL;
548 u32 len;
549 if (selinux_ctxid_to_string(
550 sid, &ctx, &len)) {
551 audit_log_format(ab,
552 " ssid=%u", sid);
553 /* Maybe call audit_panic? */
554 } else
555 audit_log_format(ab,
556 " subj=%s", ctx);
557 kfree(ctx);
558 }
559 audit_log_format(ab, " msg='%.1024s'",
560 (char *)data);
470 audit_set_pid(ab, pid); 561 audit_set_pid(ab, pid);
471 audit_log_end(ab); 562 audit_log_end(ab);
472 } 563 }
@@ -480,7 +571,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
480 case AUDIT_LIST: 571 case AUDIT_LIST:
481 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 572 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
482 uid, seq, data, nlmsg_len(nlh), 573 uid, seq, data, nlmsg_len(nlh),
483 loginuid); 574 loginuid, sid);
484 break; 575 break;
485 case AUDIT_ADD_RULE: 576 case AUDIT_ADD_RULE:
486 case AUDIT_DEL_RULE: 577 case AUDIT_DEL_RULE:
@@ -490,7 +581,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
490 case AUDIT_LIST_RULES: 581 case AUDIT_LIST_RULES:
491 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 582 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
492 uid, seq, data, nlmsg_len(nlh), 583 uid, seq, data, nlmsg_len(nlh),
493 loginuid); 584 loginuid, sid);
494 break; 585 break;
495 case AUDIT_SIGNAL_INFO: 586 case AUDIT_SIGNAL_INFO:
496 sig_data.uid = audit_sig_uid; 587 sig_data.uid = audit_sig_uid;
@@ -564,6 +655,11 @@ static int __init audit_init(void)
564 skb_queue_head_init(&audit_skb_queue); 655 skb_queue_head_init(&audit_skb_queue);
565 audit_initialized = 1; 656 audit_initialized = 1;
566 audit_enabled = audit_default; 657 audit_enabled = audit_default;
658
659 /* Register the callback with selinux. This callback will be invoked
660 * when a new policy is loaded. */
661 selinux_audit_set_callback(&selinux_audit_rule_update);
662
567 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 663 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
568 return 0; 664 return 0;
569} 665}
@@ -578,7 +674,7 @@ static int __init audit_enable(char *str)
578 audit_initialized ? "" : " (after initialization)"); 674 audit_initialized ? "" : " (after initialization)");
579 if (audit_initialized) 675 if (audit_initialized)
580 audit_enabled = audit_default; 676 audit_enabled = audit_default;
581 return 0; 677 return 1;
582} 678}
583 679
584__setup("audit=", audit_enable); 680__setup("audit=", audit_enable);
diff --git a/kernel/audit.h b/kernel/audit.h
index bc5392076e..6f733920fd 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -54,9 +54,11 @@ enum audit_state {
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_field { 56struct audit_field {
57 u32 type; 57 u32 type;
58 u32 val; 58 u32 val;
59 u32 op; 59 u32 op;
60 char *se_str;
61 struct selinux_audit_rule *se_rule;
60}; 62};
61 63
62struct audit_krule { 64struct audit_krule {
@@ -86,3 +88,5 @@ extern void audit_send_reply(int pid, int seq, int type,
86extern void audit_log_lost(const char *message); 88extern void audit_log_lost(const char *message);
87extern void audit_panic(const char *message); 89extern void audit_panic(const char *message);
88extern struct mutex audit_netlink_mutex; 90extern struct mutex audit_netlink_mutex;
91
92extern int selinux_audit_rule_update(void);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d3a8539f3a..7c134906d6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -23,6 +23,7 @@
23#include <linux/audit.h> 23#include <linux/audit.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/netlink.h> 25#include <linux/netlink.h>
26#include <linux/selinux.h>
26#include "audit.h" 27#include "audit.h"
27 28
28/* There are three lists of rules -- one to search at task creation 29/* There are three lists of rules -- one to search at task creation
@@ -42,6 +43,13 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
42 43
43static inline void audit_free_rule(struct audit_entry *e) 44static inline void audit_free_rule(struct audit_entry *e)
44{ 45{
46 int i;
47 if (e->rule.fields)
48 for (i = 0; i < e->rule.field_count; i++) {
49 struct audit_field *f = &e->rule.fields[i];
50 kfree(f->se_str);
51 selinux_audit_rule_free(f->se_rule);
52 }
45 kfree(e->rule.fields); 53 kfree(e->rule.fields);
46 kfree(e); 54 kfree(e);
47} 55}
@@ -52,9 +60,29 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
52 audit_free_rule(e); 60 audit_free_rule(e);
53} 61}
54 62
63/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count)
65{
66 struct audit_entry *entry;
67 struct audit_field *fields;
68
69 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
70 if (unlikely(!entry))
71 return NULL;
72
73 fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
74 if (unlikely(!fields)) {
75 kfree(entry);
76 return NULL;
77 }
78 entry->rule.fields = fields;
79
80 return entry;
81}
82
55/* Unpack a filter field's string representation from user-space 83/* Unpack a filter field's string representation from user-space
56 * buffer. */ 84 * buffer. */
57static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len) 85static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
58{ 86{
59 char *str; 87 char *str;
60 88
@@ -84,7 +112,6 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
84{ 112{
85 unsigned listnr; 113 unsigned listnr;
86 struct audit_entry *entry; 114 struct audit_entry *entry;
87 struct audit_field *fields;
88 int i, err; 115 int i, err;
89 116
90 err = -EINVAL; 117 err = -EINVAL;
@@ -108,23 +135,14 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
108 goto exit_err; 135 goto exit_err;
109 136
110 err = -ENOMEM; 137 err = -ENOMEM;
111 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 138 entry = audit_init_entry(rule->field_count);
112 if (unlikely(!entry)) 139 if (!entry)
113 goto exit_err;
114 fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL);
115 if (unlikely(!fields)) {
116 kfree(entry);
117 goto exit_err; 140 goto exit_err;
118 }
119
120 memset(&entry->rule, 0, sizeof(struct audit_krule));
121 memset(fields, 0, sizeof(struct audit_field));
122 141
123 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; 142 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
124 entry->rule.listnr = listnr; 143 entry->rule.listnr = listnr;
125 entry->rule.action = rule->action; 144 entry->rule.action = rule->action;
126 entry->rule.field_count = rule->field_count; 145 entry->rule.field_count = rule->field_count;
127 entry->rule.fields = fields;
128 146
129 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 147 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
130 entry->rule.mask[i] = rule->mask[i]; 148 entry->rule.mask[i] = rule->mask[i];
@@ -150,15 +168,20 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
150 for (i = 0; i < rule->field_count; i++) { 168 for (i = 0; i < rule->field_count; i++) {
151 struct audit_field *f = &entry->rule.fields[i]; 169 struct audit_field *f = &entry->rule.fields[i];
152 170
153 if (rule->fields[i] & AUDIT_UNUSED_BITS) {
154 err = -EINVAL;
155 goto exit_free;
156 }
157
158 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); 171 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
159 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
160 f->val = rule->values[i]; 173 f->val = rule->values[i];
161 174
175 if (f->type & AUDIT_UNUSED_BITS ||
176 f->type == AUDIT_SE_USER ||
177 f->type == AUDIT_SE_ROLE ||
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free;
183 }
184
162 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
163 186
164 /* Support for legacy operators where 187 /* Support for legacy operators where
@@ -188,8 +211,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
188 int err = 0; 211 int err = 0;
189 struct audit_entry *entry; 212 struct audit_entry *entry;
190 void *bufp; 213 void *bufp;
191 /* size_t remain = datasz - sizeof(struct audit_rule_data); */ 214 size_t remain = datasz - sizeof(struct audit_rule_data);
192 int i; 215 int i;
216 char *str;
193 217
194 entry = audit_to_entry_common((struct audit_rule *)data); 218 entry = audit_to_entry_common((struct audit_rule *)data);
195 if (IS_ERR(entry)) 219 if (IS_ERR(entry))
@@ -207,10 +231,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
207 231
208 f->op = data->fieldflags[i] & AUDIT_OPERATORS; 232 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
209 f->type = data->fields[i]; 233 f->type = data->fields[i];
234 f->val = data->values[i];
235 f->se_str = NULL;
236 f->se_rule = NULL;
210 switch(f->type) { 237 switch(f->type) {
211 /* call type-specific conversion routines here */ 238 case AUDIT_SE_USER:
212 default: 239 case AUDIT_SE_ROLE:
213 f->val = data->values[i]; 240 case AUDIT_SE_TYPE:
241 case AUDIT_SE_SEN:
242 case AUDIT_SE_CLR:
243 str = audit_unpack_string(&bufp, &remain, f->val);
244 if (IS_ERR(str))
245 goto exit_free;
246 entry->rule.buflen += f->val;
247
248 err = selinux_audit_rule_init(f->type, f->op, str,
249 &f->se_rule);
250 /* Keep currently invalid fields around in case they
251 * become valid after a policy reload. */
252 if (err == -EINVAL) {
253 printk(KERN_WARNING "audit rule for selinux "
254 "\'%s\' is invalid\n", str);
255 err = 0;
256 }
257 if (err) {
258 kfree(str);
259 goto exit_free;
260 } else
261 f->se_str = str;
262 break;
214 } 263 }
215 } 264 }
216 265
@@ -286,7 +335,14 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
286 data->fields[i] = f->type; 335 data->fields[i] = f->type;
287 data->fieldflags[i] = f->op; 336 data->fieldflags[i] = f->op;
288 switch(f->type) { 337 switch(f->type) {
289 /* call type-specific conversion routines here */ 338 case AUDIT_SE_USER:
339 case AUDIT_SE_ROLE:
340 case AUDIT_SE_TYPE:
341 case AUDIT_SE_SEN:
342 case AUDIT_SE_CLR:
343 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str);
345 break;
290 default: 346 default:
291 data->values[i] = f->val; 347 data->values[i] = f->val;
292 } 348 }
@@ -314,7 +370,14 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
314 return 1; 370 return 1;
315 371
316 switch(a->fields[i].type) { 372 switch(a->fields[i].type) {
317 /* call type-specific comparison routines here */ 373 case AUDIT_SE_USER:
374 case AUDIT_SE_ROLE:
375 case AUDIT_SE_TYPE:
376 case AUDIT_SE_SEN:
377 case AUDIT_SE_CLR:
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1;
380 break;
318 default: 381 default:
319 if (a->fields[i].val != b->fields[i].val) 382 if (a->fields[i].val != b->fields[i].val)
320 return 1; 383 return 1;
@@ -328,6 +391,81 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
328 return 0; 391 return 0;
329} 392}
330 393
394/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df,
397 struct audit_field *sf)
398{
399 int ret = 0;
400 char *se_str;
401
402 /* our own copy of se_str */
403 se_str = kstrdup(sf->se_str, GFP_KERNEL);
404 if (unlikely(IS_ERR(se_str)))
405 return -ENOMEM;
406 df->se_str = se_str;
407
408 /* our own (refreshed) copy of se_rule */
409 ret = selinux_audit_rule_init(df->type, df->op, df->se_str,
410 &df->se_rule);
411 /* Keep currently invalid fields around in case they
412 * become valid after a policy reload. */
413 if (ret == -EINVAL) {
414 printk(KERN_WARNING "audit rule for selinux \'%s\' is "
415 "invalid\n", df->se_str);
416 ret = 0;
417 }
418
419 return ret;
420}
421
422/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
427{
428 u32 fcount = old->field_count;
429 struct audit_entry *entry;
430 struct audit_krule *new;
431 int i, err = 0;
432
433 entry = audit_init_entry(fcount);
434 if (unlikely(!entry))
435 return ERR_PTR(-ENOMEM);
436
437 new = &entry->rule;
438 new->vers_ops = old->vers_ops;
439 new->flags = old->flags;
440 new->listnr = old->listnr;
441 new->action = old->action;
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen;
445 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447
448 /* deep copy this information, updating the se_rule fields, because
449 * the originals will all be freed when the old rule is freed. */
450 for (i = 0; i < fcount; i++) {
451 switch (new->fields[i].type) {
452 case AUDIT_SE_USER:
453 case AUDIT_SE_ROLE:
454 case AUDIT_SE_TYPE:
455 case AUDIT_SE_SEN:
456 case AUDIT_SE_CLR:
457 err = audit_dupe_selinux_field(&new->fields[i],
458 &old->fields[i]);
459 }
460 if (err) {
461 audit_free_rule(entry);
462 return ERR_PTR(err);
463 }
464 }
465
466 return entry;
467}
468
331/* Add rule to given filterlist if not a duplicate. Protected by 469/* Add rule to given filterlist if not a duplicate. Protected by
332 * audit_netlink_mutex. */ 470 * audit_netlink_mutex. */
333static inline int audit_add_rule(struct audit_entry *entry, 471static inline int audit_add_rule(struct audit_entry *entry,
@@ -448,9 +586,10 @@ static int audit_list_rules(void *_dest)
448 * @data: payload data 586 * @data: payload data
449 * @datasz: size of payload data 587 * @datasz: size of payload data
450 * @loginuid: loginuid of sender 588 * @loginuid: loginuid of sender
589 * @sid: SE Linux Security ID of sender
451 */ 590 */
452int audit_receive_filter(int type, int pid, int uid, int seq, void *data, 591int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
453 size_t datasz, uid_t loginuid) 592 size_t datasz, uid_t loginuid, u32 sid)
454{ 593{
455 struct task_struct *tsk; 594 struct task_struct *tsk;
456 int *dest; 595 int *dest;
@@ -493,9 +632,23 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
493 632
494 err = audit_add_rule(entry, 633 err = audit_add_rule(entry,
495 &audit_filter_list[entry->rule.listnr]); 634 &audit_filter_list[entry->rule.listnr]);
496 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 635 if (sid) {
497 "auid=%u add rule to list=%d res=%d\n", 636 char *ctx = NULL;
498 loginuid, entry->rule.listnr, !err); 637 u32 len;
638 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
639 /* Maybe call audit_panic? */
640 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
641 "auid=%u ssid=%u add rule to list=%d res=%d",
642 loginuid, sid, entry->rule.listnr, !err);
643 } else
644 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
645 "auid=%u subj=%s add rule to list=%d res=%d",
646 loginuid, ctx, entry->rule.listnr, !err);
647 kfree(ctx);
648 } else
649 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
650 "auid=%u add rule to list=%d res=%d",
651 loginuid, entry->rule.listnr, !err);
499 652
500 if (err) 653 if (err)
501 audit_free_rule(entry); 654 audit_free_rule(entry);
@@ -511,9 +664,24 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
511 664
512 err = audit_del_rule(entry, 665 err = audit_del_rule(entry,
513 &audit_filter_list[entry->rule.listnr]); 666 &audit_filter_list[entry->rule.listnr]);
514 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 667
515 "auid=%u remove rule from list=%d res=%d\n", 668 if (sid) {
516 loginuid, entry->rule.listnr, !err); 669 char *ctx = NULL;
670 u32 len;
671 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
672 /* Maybe call audit_panic? */
673 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
674 "auid=%u ssid=%u remove rule from list=%d res=%d",
675 loginuid, sid, entry->rule.listnr, !err);
676 } else
677 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
678 "auid=%u subj=%s remove rule from list=%d res=%d",
679 loginuid, ctx, entry->rule.listnr, !err);
680 kfree(ctx);
681 } else
682 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
683 "auid=%u remove rule from list=%d res=%d",
684 loginuid, entry->rule.listnr, !err);
517 685
518 audit_free_rule(entry); 686 audit_free_rule(entry);
519 break; 687 break;
@@ -628,3 +796,62 @@ unlock_and_return:
628 rcu_read_unlock(); 796 rcu_read_unlock();
629 return result; 797 return result;
630} 798}
799
800/* Check to see if the rule contains any selinux fields. Returns 1 if there
801 are selinux fields specified in the rule, 0 otherwise. */
802static inline int audit_rule_has_selinux(struct audit_krule *rule)
803{
804 int i;
805
806 for (i = 0; i < rule->field_count; i++) {
807 struct audit_field *f = &rule->fields[i];
808 switch (f->type) {
809 case AUDIT_SE_USER:
810 case AUDIT_SE_ROLE:
811 case AUDIT_SE_TYPE:
812 case AUDIT_SE_SEN:
813 case AUDIT_SE_CLR:
814 return 1;
815 }
816 }
817
818 return 0;
819}
820
821/* This function will re-initialize the se_rule field of all applicable rules.
822 * It will traverse the filter lists serarching for rules that contain selinux
823 * specific filter fields. When such a rule is found, it is copied, the
824 * selinux field is re-initialized, and the old rule is replaced with the
825 * updated rule. */
826int selinux_audit_rule_update(void)
827{
828 struct audit_entry *entry, *n, *nentry;
829 int i, err = 0;
830
831 /* audit_netlink_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex);
833
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule))
837 continue;
838
839 nentry = audit_dupe_rule(&entry->rule);
840 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the
842 * return value */
843 if (!err)
844 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters");
846 list_del_rcu(&entry->list);
847 } else {
848 list_replace_rcu(&entry->list, &nentry->list);
849 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 }
852 }
853
854 mutex_unlock(&audit_netlink_mutex);
855
856 return err;
857}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7f160df21a..1c03a4ed1b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -58,6 +58,7 @@
58#include <linux/security.h> 58#include <linux/security.h>
59#include <linux/list.h> 59#include <linux/list.h>
60#include <linux/tty.h> 60#include <linux/tty.h>
61#include <linux/selinux.h>
61 62
62#include "audit.h" 63#include "audit.h"
63 64
@@ -89,7 +90,7 @@ struct audit_names {
89 uid_t uid; 90 uid_t uid;
90 gid_t gid; 91 gid_t gid;
91 dev_t rdev; 92 dev_t rdev;
92 char *ctx; 93 u32 osid;
93}; 94};
94 95
95struct audit_aux_data { 96struct audit_aux_data {
@@ -106,7 +107,7 @@ struct audit_aux_data_ipcctl {
106 uid_t uid; 107 uid_t uid;
107 gid_t gid; 108 gid_t gid;
108 mode_t mode; 109 mode_t mode;
109 char *ctx; 110 u32 osid;
110}; 111};
111 112
112struct audit_aux_data_socketcall { 113struct audit_aux_data_socketcall {
@@ -167,7 +168,8 @@ static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_context *ctx, 168 struct audit_context *ctx,
168 enum audit_state *state) 169 enum audit_state *state)
169{ 170{
170 int i, j; 171 int i, j, need_sid = 1;
172 u32 sid;
171 173
172 for (i = 0; i < rule->field_count; i++) { 174 for (i = 0; i < rule->field_count; i++) {
173 struct audit_field *f = &rule->fields[i]; 175 struct audit_field *f = &rule->fields[i];
@@ -257,6 +259,27 @@ static int audit_filter_rules(struct task_struct *tsk,
257 if (ctx) 259 if (ctx)
258 result = audit_comparator(ctx->loginuid, f->op, f->val); 260 result = audit_comparator(ctx->loginuid, f->op, f->val);
259 break; 261 break;
262 case AUDIT_SE_USER:
263 case AUDIT_SE_ROLE:
264 case AUDIT_SE_TYPE:
265 case AUDIT_SE_SEN:
266 case AUDIT_SE_CLR:
267 /* NOTE: this may return negative values indicating
268 a temporary error. We simply treat this as a
269 match for now to avoid losing information that
270 may be wanted. An error message will also be
271 logged upon error */
272 if (f->se_rule) {
273 if (need_sid) {
274 selinux_task_ctxid(tsk, &sid);
275 need_sid = 0;
276 }
277 result = selinux_audit_rule_match(sid, f->type,
278 f->op,
279 f->se_rule,
280 ctx);
281 }
282 break;
260 case AUDIT_ARG0: 283 case AUDIT_ARG0:
261 case AUDIT_ARG1: 284 case AUDIT_ARG1:
262 case AUDIT_ARG2: 285 case AUDIT_ARG2:
@@ -329,7 +352,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
329 return AUDIT_BUILD_CONTEXT; 352 return AUDIT_BUILD_CONTEXT;
330} 353}
331 354
332/* This should be called with task_lock() held. */
333static inline struct audit_context *audit_get_context(struct task_struct *tsk, 355static inline struct audit_context *audit_get_context(struct task_struct *tsk,
334 int return_valid, 356 int return_valid,
335 int return_code) 357 int return_code)
@@ -391,9 +413,6 @@ static inline void audit_free_names(struct audit_context *context)
391#endif 413#endif
392 414
393 for (i = 0; i < context->name_count; i++) { 415 for (i = 0; i < context->name_count; i++) {
394 char *p = context->names[i].ctx;
395 context->names[i].ctx = NULL;
396 kfree(p);
397 if (context->names[i].name) 416 if (context->names[i].name)
398 __putname(context->names[i].name); 417 __putname(context->names[i].name);
399 } 418 }
@@ -416,11 +435,6 @@ static inline void audit_free_aux(struct audit_context *context)
416 dput(axi->dentry); 435 dput(axi->dentry);
417 mntput(axi->mnt); 436 mntput(axi->mnt);
418 } 437 }
419 if ( aux->type == AUDIT_IPC ) {
420 struct audit_aux_data_ipcctl *axi = (void *)aux;
421 if (axi->ctx)
422 kfree(axi->ctx);
423 }
424 438
425 context->aux = aux->next; 439 context->aux = aux->next;
426 kfree(aux); 440 kfree(aux);
@@ -506,7 +520,7 @@ static inline void audit_free_context(struct audit_context *context)
506 printk(KERN_ERR "audit: freed %d contexts\n", count); 520 printk(KERN_ERR "audit: freed %d contexts\n", count);
507} 521}
508 522
509static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) 523static void audit_log_task_context(struct audit_buffer *ab)
510{ 524{
511 char *ctx = NULL; 525 char *ctx = NULL;
512 ssize_t len = 0; 526 ssize_t len = 0;
@@ -518,7 +532,7 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
518 return; 532 return;
519 } 533 }
520 534
521 ctx = kmalloc(len, gfp_mask); 535 ctx = kmalloc(len, GFP_KERNEL);
522 if (!ctx) 536 if (!ctx)
523 goto error_path; 537 goto error_path;
524 538
@@ -536,47 +550,46 @@ error_path:
536 return; 550 return;
537} 551}
538 552
539static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) 553static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
540{ 554{
541 char name[sizeof(current->comm)]; 555 char name[sizeof(tsk->comm)];
542 struct mm_struct *mm = current->mm; 556 struct mm_struct *mm = tsk->mm;
543 struct vm_area_struct *vma; 557 struct vm_area_struct *vma;
544 558
545 get_task_comm(name, current); 559 /* tsk == current */
560
561 get_task_comm(name, tsk);
546 audit_log_format(ab, " comm="); 562 audit_log_format(ab, " comm=");
547 audit_log_untrustedstring(ab, name); 563 audit_log_untrustedstring(ab, name);
548 564
549 if (!mm) 565 if (mm) {
550 return; 566 down_read(&mm->mmap_sem);
551 567 vma = mm->mmap;
552 /* 568 while (vma) {
553 * this is brittle; all callers that pass GFP_ATOMIC will have 569 if ((vma->vm_flags & VM_EXECUTABLE) &&
554 * NULL current->mm and we won't get here. 570 vma->vm_file) {
555 */ 571 audit_log_d_path(ab, "exe=",
556 down_read(&mm->mmap_sem); 572 vma->vm_file->f_dentry,
557 vma = mm->mmap; 573 vma->vm_file->f_vfsmnt);
558 while (vma) { 574 break;
559 if ((vma->vm_flags & VM_EXECUTABLE) && 575 }
560 vma->vm_file) { 576 vma = vma->vm_next;
561 audit_log_d_path(ab, "exe=",
562 vma->vm_file->f_dentry,
563 vma->vm_file->f_vfsmnt);
564 break;
565 } 577 }
566 vma = vma->vm_next; 578 up_read(&mm->mmap_sem);
567 } 579 }
568 up_read(&mm->mmap_sem); 580 audit_log_task_context(ab);
569 audit_log_task_context(ab, gfp_mask);
570} 581}
571 582
572static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) 583static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
573{ 584{
574 int i; 585 int i, call_panic = 0;
575 struct audit_buffer *ab; 586 struct audit_buffer *ab;
576 struct audit_aux_data *aux; 587 struct audit_aux_data *aux;
577 const char *tty; 588 const char *tty;
578 589
579 ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); 590 /* tsk == current */
591
592 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
580 if (!ab) 593 if (!ab)
581 return; /* audit_panic has been called */ 594 return; /* audit_panic has been called */
582 audit_log_format(ab, "arch=%x syscall=%d", 595 audit_log_format(ab, "arch=%x syscall=%d",
@@ -587,8 +600,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
587 audit_log_format(ab, " success=%s exit=%ld", 600 audit_log_format(ab, " success=%s exit=%ld",
588 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 601 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
589 context->return_code); 602 context->return_code);
590 if (current->signal->tty && current->signal->tty->name) 603 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
591 tty = current->signal->tty->name; 604 tty = tsk->signal->tty->name;
592 else 605 else
593 tty = "(none)"; 606 tty = "(none)";
594 audit_log_format(ab, 607 audit_log_format(ab,
@@ -607,12 +620,12 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
607 context->gid, 620 context->gid,
608 context->euid, context->suid, context->fsuid, 621 context->euid, context->suid, context->fsuid,
609 context->egid, context->sgid, context->fsgid, tty); 622 context->egid, context->sgid, context->fsgid, tty);
610 audit_log_task_info(ab, gfp_mask); 623 audit_log_task_info(ab, tsk);
611 audit_log_end(ab); 624 audit_log_end(ab);
612 625
613 for (aux = context->aux; aux; aux = aux->next) { 626 for (aux = context->aux; aux; aux = aux->next) {
614 627
615 ab = audit_log_start(context, gfp_mask, aux->type); 628 ab = audit_log_start(context, GFP_KERNEL, aux->type);
616 if (!ab) 629 if (!ab)
617 continue; /* audit_panic has been called */ 630 continue; /* audit_panic has been called */
618 631
@@ -620,8 +633,39 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
620 case AUDIT_IPC: { 633 case AUDIT_IPC: {
621 struct audit_aux_data_ipcctl *axi = (void *)aux; 634 struct audit_aux_data_ipcctl *axi = (void *)aux;
622 audit_log_format(ab, 635 audit_log_format(ab,
623 " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s", 636 " qbytes=%lx iuid=%u igid=%u mode=%x",
624 axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx); 637 axi->qbytes, axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) {
639 char *ctx = NULL;
640 u32 len;
641 if (selinux_ctxid_to_string(
642 axi->osid, &ctx, &len)) {
643 audit_log_format(ab, " osid=%u",
644 axi->osid);
645 call_panic = 1;
646 } else
647 audit_log_format(ab, " obj=%s", ctx);
648 kfree(ctx);
649 }
650 break; }
651
652 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) {
658 char *ctx = NULL;
659 u32 len;
660 if (selinux_ctxid_to_string(
661 axi->osid, &ctx, &len)) {
662 audit_log_format(ab, " osid=%u",
663 axi->osid);
664 call_panic = 1;
665 } else
666 audit_log_format(ab, " obj=%s", ctx);
667 kfree(ctx);
668 }
625 break; } 669 break; }
626 670
627 case AUDIT_SOCKETCALL: { 671 case AUDIT_SOCKETCALL: {
@@ -649,7 +693,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
649 } 693 }
650 694
651 if (context->pwd && context->pwdmnt) { 695 if (context->pwd && context->pwdmnt) {
652 ab = audit_log_start(context, gfp_mask, AUDIT_CWD); 696 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
653 if (ab) { 697 if (ab) {
654 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); 698 audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
655 audit_log_end(ab); 699 audit_log_end(ab);
@@ -659,7 +703,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
659 unsigned long ino = context->names[i].ino; 703 unsigned long ino = context->names[i].ino;
660 unsigned long pino = context->names[i].pino; 704 unsigned long pino = context->names[i].pino;
661 705
662 ab = audit_log_start(context, gfp_mask, AUDIT_PATH); 706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
663 if (!ab) 707 if (!ab)
664 continue; /* audit_panic has been called */ 708 continue; /* audit_panic has been called */
665 709
@@ -685,32 +729,35 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
685 context->names[i].gid, 729 context->names[i].gid,
686 MAJOR(context->names[i].rdev), 730 MAJOR(context->names[i].rdev),
687 MINOR(context->names[i].rdev)); 731 MINOR(context->names[i].rdev));
688 if (context->names[i].ctx) { 732 if (context->names[i].osid != 0) {
689 audit_log_format(ab, " obj=%s", 733 char *ctx = NULL;
690 context->names[i].ctx); 734 u32 len;
735 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u",
738 context->names[i].osid);
739 call_panic = 2;
740 } else
741 audit_log_format(ab, " obj=%s", ctx);
742 kfree(ctx);
691 } 743 }
692 744
693 audit_log_end(ab); 745 audit_log_end(ab);
694 } 746 }
747 if (call_panic)
748 audit_panic("error converting sid to string");
695} 749}
696 750
697/** 751/**
698 * audit_free - free a per-task audit context 752 * audit_free - free a per-task audit context
699 * @tsk: task whose audit context block to free 753 * @tsk: task whose audit context block to free
700 * 754 *
701 * Called from copy_process and __put_task_struct. 755 * Called from copy_process and do_exit
702 */ 756 */
703void audit_free(struct task_struct *tsk) 757void audit_free(struct task_struct *tsk)
704{ 758{
705 struct audit_context *context; 759 struct audit_context *context;
706 760
707 /*
708 * No need to lock the task - when we execute audit_free()
709 * then the task has no external references anymore, and
710 * we are tearing it down. (The locking also confuses
711 * DEBUG_LOCKDEP - this freeing may occur in softirq
712 * contexts as well, via RCU.)
713 */
714 context = audit_get_context(tsk, 0, 0); 761 context = audit_get_context(tsk, 0, 0);
715 if (likely(!context)) 762 if (likely(!context))
716 return; 763 return;
@@ -719,8 +766,9 @@ void audit_free(struct task_struct *tsk)
719 * function (e.g., exit_group), then free context block. 766 * function (e.g., exit_group), then free context block.
720 * We use GFP_ATOMIC here because we might be doing this 767 * We use GFP_ATOMIC here because we might be doing this
721 * in the context of the idle thread */ 768 * in the context of the idle thread */
769 /* that can happen only if we are called from do_exit() */
722 if (context->in_syscall && context->auditable) 770 if (context->in_syscall && context->auditable)
723 audit_log_exit(context, GFP_ATOMIC); 771 audit_log_exit(context, tsk);
724 772
725 audit_free_context(context); 773 audit_free_context(context);
726} 774}
@@ -743,10 +791,11 @@ void audit_free(struct task_struct *tsk)
743 * will only be written if another part of the kernel requests that it 791 * will only be written if another part of the kernel requests that it
744 * be written). 792 * be written).
745 */ 793 */
746void audit_syscall_entry(struct task_struct *tsk, int arch, int major, 794void audit_syscall_entry(int arch, int major,
747 unsigned long a1, unsigned long a2, 795 unsigned long a1, unsigned long a2,
748 unsigned long a3, unsigned long a4) 796 unsigned long a3, unsigned long a4)
749{ 797{
798 struct task_struct *tsk = current;
750 struct audit_context *context = tsk->audit_context; 799 struct audit_context *context = tsk->audit_context;
751 enum audit_state state; 800 enum audit_state state;
752 801
@@ -824,22 +873,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
824 * message), then write out the syscall information. In call cases, 873 * message), then write out the syscall information. In call cases,
825 * free the names stored from getname(). 874 * free the names stored from getname().
826 */ 875 */
827void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) 876void audit_syscall_exit(int valid, long return_code)
828{ 877{
878 struct task_struct *tsk = current;
829 struct audit_context *context; 879 struct audit_context *context;
830 880
831 get_task_struct(tsk);
832 task_lock(tsk);
833 context = audit_get_context(tsk, valid, return_code); 881 context = audit_get_context(tsk, valid, return_code);
834 task_unlock(tsk);
835 882
836 /* Not having a context here is ok, since the parent may have
837 * called __put_task_struct. */
838 if (likely(!context)) 883 if (likely(!context))
839 goto out; 884 return;
840 885
841 if (context->in_syscall && context->auditable) 886 if (context->in_syscall && context->auditable)
842 audit_log_exit(context, GFP_KERNEL); 887 audit_log_exit(context, tsk);
843 888
844 context->in_syscall = 0; 889 context->in_syscall = 0;
845 context->auditable = 0; 890 context->auditable = 0;
@@ -854,8 +899,6 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
854 audit_free_aux(context); 899 audit_free_aux(context);
855 tsk->audit_context = context; 900 tsk->audit_context = context;
856 } 901 }
857 out:
858 put_task_struct(tsk);
859} 902}
860 903
861/** 904/**
@@ -936,40 +979,11 @@ void audit_putname(const char *name)
936#endif 979#endif
937} 980}
938 981
939void audit_inode_context(int idx, const struct inode *inode) 982static void audit_inode_context(int idx, const struct inode *inode)
940{ 983{
941 struct audit_context *context = current->audit_context; 984 struct audit_context *context = current->audit_context;
942 const char *suffix = security_inode_xattr_getsuffix();
943 char *ctx = NULL;
944 int len = 0;
945
946 if (!suffix)
947 goto ret;
948
949 len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
950 if (len == -EOPNOTSUPP)
951 goto ret;
952 if (len < 0)
953 goto error_path;
954
955 ctx = kmalloc(len, GFP_KERNEL);
956 if (!ctx)
957 goto error_path;
958 985
959 len = security_inode_getsecurity(inode, suffix, ctx, len, 0); 986 selinux_get_inode_sid(inode, &context->names[idx].osid);
960 if (len < 0)
961 goto error_path;
962
963 kfree(context->names[idx].ctx);
964 context->names[idx].ctx = ctx;
965 goto ret;
966
967error_path:
968 if (ctx)
969 kfree(ctx);
970 audit_panic("error in audit_inode_context");
971ret:
972 return;
973} 987}
974 988
975 989
@@ -1155,40 +1169,37 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1155 return ctx ? ctx->loginuid : -1; 1169 return ctx ? ctx->loginuid : -1;
1156} 1170}
1157 1171
1158static char *audit_ipc_context(struct kern_ipc_perm *ipcp) 1172/**
1173 * audit_ipc_obj - record audit data for ipc object
1174 * @ipcp: ipc permissions
1175 *
1176 * Returns 0 for success or NULL context or < 0 on error.
1177 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1159{ 1179{
1180 struct audit_aux_data_ipcctl *ax;
1160 struct audit_context *context = current->audit_context; 1181 struct audit_context *context = current->audit_context;
1161 char *ctx = NULL;
1162 int len = 0;
1163 1182
1164 if (likely(!context)) 1183 if (likely(!context))
1165 return NULL; 1184 return 0;
1166
1167 len = security_ipc_getsecurity(ipcp, NULL, 0);
1168 if (len == -EOPNOTSUPP)
1169 goto ret;
1170 if (len < 0)
1171 goto error_path;
1172
1173 ctx = kmalloc(len, GFP_ATOMIC);
1174 if (!ctx)
1175 goto error_path;
1176 1185
1177 len = security_ipc_getsecurity(ipcp, ctx, len); 1186 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1178 if (len < 0) 1187 if (!ax)
1179 goto error_path; 1188 return -ENOMEM;
1180 1189
1181 return ctx; 1190 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode;
1193 selinux_get_ipc_sid(ipcp, &ax->osid);
1182 1194
1183error_path: 1195 ax->d.type = AUDIT_IPC;
1184 kfree(ctx); 1196 ax->d.next = context->aux;
1185 audit_panic("error in audit_ipc_context"); 1197 context->aux = (void *)ax;
1186ret: 1198 return 0;
1187 return NULL;
1188} 1199}
1189 1200
1190/** 1201/**
1191 * audit_ipc_perms - record audit data for ipc 1202 * audit_ipc_set_perm - record audit data for new ipc permissions
1192 * @qbytes: msgq bytes 1203 * @qbytes: msgq bytes
1193 * @uid: msgq user id 1204 * @uid: msgq user id
1194 * @gid: msgq group id 1205 * @gid: msgq group id
@@ -1196,7 +1207,7 @@ ret:
1196 * 1207 *
1197 * Returns 0 for success or NULL context or < 0 on error. 1208 * Returns 0 for success or NULL context or < 0 on error.
1198 */ 1209 */
1199int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
1200{ 1211{
1201 struct audit_aux_data_ipcctl *ax; 1212 struct audit_aux_data_ipcctl *ax;
1202 struct audit_context *context = current->audit_context; 1213 struct audit_context *context = current->audit_context;
@@ -1212,9 +1223,9 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str
1212 ax->uid = uid; 1223 ax->uid = uid;
1213 ax->gid = gid; 1224 ax->gid = gid;
1214 ax->mode = mode; 1225 ax->mode = mode;
1215 ax->ctx = audit_ipc_context(ipcp); 1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1216 1227
1217 ax->d.type = AUDIT_IPC; 1228 ax->d.type = AUDIT_IPC_SET_PERM;
1218 ax->d.next = context->aux; 1229 ax->d.next = context->aux;
1219 context->aux = (void *)ax; 1230 context->aux = (void *)ax;
1220 return 0; 1231 return 0;
diff --git a/kernel/compat.c b/kernel/compat.c
index b9bdd1271f..c1601a84f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,7 +17,6 @@
17#include <linux/time.h> 17#include <linux/time.h>
18#include <linux/signal.h> 18#include <linux/signal.h>
19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ 19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
20#include <linux/futex.h> /* for FUTEX_WAIT */
21#include <linux/syscalls.h> 20#include <linux/syscalls.h>
22#include <linux/unistd.h> 21#include <linux/unistd.h>
23#include <linux/security.h> 22#include <linux/security.h>
@@ -239,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
239 return ret; 238 return ret;
240} 239}
241 240
242#ifdef CONFIG_FUTEX
243asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
244 struct compat_timespec __user *utime, u32 __user *uaddr2,
245 int val3)
246{
247 struct timespec t;
248 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
249 int val2 = 0;
250
251 if ((op == FUTEX_WAIT) && utime) {
252 if (get_compat_timespec(&t, utime))
253 return -EFAULT;
254 timeout = timespec_to_jiffies(&t) + 1;
255 }
256 if (op >= FUTEX_REQUEUE)
257 val2 = (int) (unsigned long) utime;
258
259 return do_futex((unsigned long)uaddr, op, val, timeout,
260 (unsigned long)uaddr2, val2, val3);
261}
262#endif
263
264asmlinkage long compat_sys_setrlimit(unsigned int resource, 241asmlinkage long compat_sys_setrlimit(unsigned int resource,
265 struct compat_rlimit __user *rlim) 242 struct compat_rlimit __user *rlim)
266{ 243{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8be22bd809..fe2b8d0bfe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DECLARE_MUTEX(cpucontrol);
20 20
21static struct notifier_block *cpu_chain; 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 int ret; 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75
76 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
77 return ret;
78 ret = notifier_chain_register(&cpu_chain, nb);
79 unlock_cpu_hotplug();
80 return ret;
81} 75}
82EXPORT_SYMBOL(register_cpu_notifier); 76EXPORT_SYMBOL(register_cpu_notifier);
83 77
84void unregister_cpu_notifier(struct notifier_block *nb) 78void unregister_cpu_notifier(struct notifier_block *nb)
85{ 79{
86 lock_cpu_hotplug(); 80 blocking_notifier_chain_unregister(&cpu_chain, nb);
87 notifier_chain_unregister(&cpu_chain, nb);
88 unlock_cpu_hotplug();
89} 81}
90EXPORT_SYMBOL(unregister_cpu_notifier); 82EXPORT_SYMBOL(unregister_cpu_notifier);
91 83
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
141 goto out; 133 goto out;
142 } 134 }
143 135
144 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 136 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
145 (void *)(long)cpu); 137 (void *)(long)cpu);
146 if (err == NOTIFY_BAD) { 138 if (err == NOTIFY_BAD) {
147 printk("%s: attempt to take down CPU %u failed\n", 139 printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
159 p = __stop_machine_run(take_cpu_down, NULL, cpu); 151 p = __stop_machine_run(take_cpu_down, NULL, cpu);
160 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
161 /* CPU didn't die: tell everyone. Can't complain. */ 153 /* CPU didn't die: tell everyone. Can't complain. */
162 if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 154 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
163 (void *)(long)cpu) == NOTIFY_BAD) 155 (void *)(long)cpu) == NOTIFY_BAD)
164 BUG(); 156 BUG();
165 157
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
182 put_cpu(); 174 put_cpu();
183 175
184 /* CPU is completely dead: tell everyone. Too late to complain. */ 176 /* CPU is completely dead: tell everyone. Too late to complain. */
185 if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) 177 if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
186 == NOTIFY_BAD) 178 (void *)(long)cpu) == NOTIFY_BAD)
187 BUG(); 179 BUG();
188 180
189 check_for_tasks(cpu); 181 check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
211 goto out; 203 goto out;
212 } 204 }
213 205
214 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 206 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
215 if (ret == NOTIFY_BAD) { 207 if (ret == NOTIFY_BAD) {
216 printk("%s: attempt to bring up CPU %u failed\n", 208 printk("%s: attempt to bring up CPU %u failed\n",
217 __FUNCTION__, cpu); 209 __FUNCTION__, cpu);
@@ -226,11 +218,12 @@ int __devinit cpu_up(unsigned int cpu)
226 BUG_ON(!cpu_online(cpu)); 218 BUG_ON(!cpu_online(cpu));
227 219
228 /* Now call notifier in preparation. */ 220 /* Now call notifier in preparation. */
229 notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 221 blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
230 222
231out_notify: 223out_notify:
232 if (ret != 0) 224 if (ret != 0)
233 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 225 blocking_notifier_call_chain(&cpu_chain,
226 CPU_UP_CANCELED, hcpu);
234out: 227out:
235 unlock_cpu_hotplug(); 228 unlock_cpu_hotplug();
236 return ret; 229 return ret;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd12..ab81fdd457 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
616 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
617 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
618 * 618 *
619 * Call without callback_mutex or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
620 * with or without manage_mutex held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
621 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
622 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
623 * is here, it is not simultaneously in the exit code NULL'ing its
624 * cpuset pointer. This routine also might acquire callback_mutex and
625 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
626 * 624 *
627 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
836} 834}
837 835
838/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarily set tasks mems_allowed to target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
839 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
840 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
841 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
947 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
948 995
949 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
950 if (migrate) { 997 if (migrate)
951 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
952 MPOL_MF_MOVE_ALL);
953 }
954 mmput(mm); 999 mmput(mm);
955 } 1000 }
956 1001
@@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1185 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1186 if (mm) { 1231 if (mm) {
1187 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1188 mmput(mm); 1235 mmput(mm);
1189 } 1236 }
1190 1237
1191 if (is_memory_migrate(cs))
1192 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1193 put_task_struct(tsk); 1238 put_task_struct(tsk);
1194 synchronize_rcu(); 1239 synchronize_rcu();
1195 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
@@ -2186,19 +2231,25 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2186 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2231 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2187 * short of memory, might require taking the callback_mutex mutex. 2232 * short of memory, might require taking the callback_mutex mutex.
2188 * 2233 *
2189 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2234 * The first call here from mm/page_alloc:get_page_from_freelist()
2190 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2235 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
2191 * hardwall cpusets - no allocation on a node outside the cpuset is 2236 * no allocation on a node outside the cpuset is allowed (unless in
2192 * allowed (unless in interrupt, of course). 2237 * interrupt, of course).
2193 * 2238 *
2194 * The second loop doesn't even call here for GFP_ATOMIC requests 2239 * The second pass through get_page_from_freelist() doesn't even call
2195 * (if the __alloc_pages() local variable 'wait' is set). That check 2240 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
2196 * and the checks below have the combined affect in the second loop of 2241 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2197 * the __alloc_pages() routine that: 2242 * in alloc_flags. That logic and the checks below have the combined
2243 * affect that:
2198 * in_interrupt - any node ok (current task context irrelevant) 2244 * in_interrupt - any node ok (current task context irrelevant)
2199 * GFP_ATOMIC - any node ok 2245 * GFP_ATOMIC - any node ok
2200 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2246 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
2201 * GFP_USER - only nodes in current tasks mems allowed ok. 2247 * GFP_USER - only nodes in current tasks mems allowed ok.
2248 *
2249 * Rule:
2250 * Don't call cpuset_zone_allowed() if you can't sleep, unless you
2251 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2252 * the code that might scan up ancestor cpusets and sleep.
2202 **/ 2253 **/
2203 2254
2204int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2255int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
@@ -2210,6 +2261,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2210 if (in_interrupt()) 2261 if (in_interrupt())
2211 return 1; 2262 return 1;
2212 node = z->zone_pgdat->node_id; 2263 node = z->zone_pgdat->node_id;
2264 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2213 if (node_isset(node, current->mems_allowed)) 2265 if (node_isset(node, current->mems_allowed))
2214 return 1; 2266 return 1;
2215 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2267 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
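The rule in the comment above is easiest to see from a caller's side. A hedged sketch, assuming the usual cpuset_zone_allowed() wrapper from <linux/cpuset.h> that funnels into __cpuset_zone_allowed(); try_this_zone() is a made-up placeholder for whatever the caller does with an allowed zone.

/* Hedged sketch of the calling rule stated above. */
static struct page *pick_zone_example(struct zone *z, gfp_t gfp_mask)
{
        /*
         * A caller that cannot sleep must OR in __GFP_HARDWALL, so the
         * check stops at the hardwall test instead of scanning (and
         * possibly sleeping on) ancestor cpusets.
         */
        if (!(gfp_mask & __GFP_WAIT))
                gfp_mask |= __GFP_HARDWALL;

        if (!cpuset_zone_allowed(z, gfp_mask))
                return NULL;

        return try_this_zone(z, gfp_mask);      /* hypothetical next step */
}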
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e13..e95b932822 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,13 @@
29#include <linux/cpuset.h> 29#include <linux/cpuset.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/posix-timers.h>
32#include <linux/cn_proc.h> 33#include <linux/cn_proc.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/futex.h>
36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */
34 39
35#include <asm/uaccess.h> 40#include <asm/uaccess.h>
36#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -48,15 +53,85 @@ static void __unhash_process(struct task_struct *p)
48{ 53{
49 nr_threads--; 54 nr_threads--;
50 detach_pid(p, PIDTYPE_PID); 55 detach_pid(p, PIDTYPE_PID);
51 detach_pid(p, PIDTYPE_TGID);
52 if (thread_group_leader(p)) { 56 if (thread_group_leader(p)) {
53 detach_pid(p, PIDTYPE_PGID); 57 detach_pid(p, PIDTYPE_PGID);
54 detach_pid(p, PIDTYPE_SID); 58 detach_pid(p, PIDTYPE_SID);
55 if (p->pid) 59
56 __get_cpu_var(process_counts)--; 60 list_del_rcu(&p->tasks);
61 __get_cpu_var(process_counts)--;
57 } 62 }
63 list_del_rcu(&p->thread_group);
64 remove_parent(p);
65}
66
67/*
68 * This function expects the tasklist_lock write-locked.
69 */
70static void __exit_signal(struct task_struct *tsk)
71{
72 struct signal_struct *sig = tsk->signal;
73 struct sighand_struct *sighand;
74
75 BUG_ON(!sig);
76 BUG_ON(!atomic_read(&sig->count));
77
78 rcu_read_lock();
79 sighand = rcu_dereference(tsk->sighand);
80 spin_lock(&sighand->siglock);
58 81
59 REMOVE_LINKS(p); 82 posix_cpu_timers_exit(tsk);
83 if (atomic_dec_and_test(&sig->count))
84 posix_cpu_timers_exit_group(tsk);
85 else {
86 /*
87 * If there is any task waiting for the group exit
88 * then notify it:
89 */
90 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
91 wake_up_process(sig->group_exit_task);
92 sig->group_exit_task = NULL;
93 }
94 if (tsk == sig->curr_target)
95 sig->curr_target = next_thread(tsk);
96 /*
97 * Accumulate here the counters for all threads but the
98 * group leader as they die, so they can be added into
99 * the process-wide totals when those are taken.
100 * The group leader stays around as a zombie as long
101 * as there are other threads. When it gets reaped,
102 * the exit.c code will add its counts into these totals.
103 * We won't ever get here for the group leader, since it
104 * will have been the last reference on the signal_struct.
105 */
106 sig->utime = cputime_add(sig->utime, tsk->utime);
107 sig->stime = cputime_add(sig->stime, tsk->stime);
108 sig->min_flt += tsk->min_flt;
109 sig->maj_flt += tsk->maj_flt;
110 sig->nvcsw += tsk->nvcsw;
111 sig->nivcsw += tsk->nivcsw;
112 sig->sched_time += tsk->sched_time;
113 sig = NULL; /* Marker for below. */
114 }
115
116 __unhash_process(tsk);
117
118 tsk->signal = NULL;
119 tsk->sighand = NULL;
120 spin_unlock(&sighand->siglock);
121 rcu_read_unlock();
122
123 __cleanup_sighand(sighand);
124 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
125 flush_sigqueue(&tsk->pending);
126 if (sig) {
127 flush_sigqueue(&sig->shared_pending);
128 __cleanup_signal(sig);
129 }
130}
131
132static void delayed_put_task_struct(struct rcu_head *rhp)
133{
134 put_task_struct(container_of(rhp, struct task_struct, rcu));
60} 135}
61 136
62void release_task(struct task_struct * p) 137void release_task(struct task_struct * p)
@@ -65,21 +140,14 @@ void release_task(struct task_struct * p)
65 task_t *leader; 140 task_t *leader;
66 struct dentry *proc_dentry; 141 struct dentry *proc_dentry;
67 142
68repeat: 143repeat:
69 atomic_dec(&p->user->processes); 144 atomic_dec(&p->user->processes);
70 spin_lock(&p->proc_lock); 145 spin_lock(&p->proc_lock);
71 proc_dentry = proc_pid_unhash(p); 146 proc_dentry = proc_pid_unhash(p);
72 write_lock_irq(&tasklist_lock); 147 write_lock_irq(&tasklist_lock);
73 if (unlikely(p->ptrace)) 148 ptrace_unlink(p);
74 __ptrace_unlink(p);
75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
76 __exit_signal(p); 150 __exit_signal(p);
77 /*
78 * Note that the fastpath in sys_times depends on __exit_signal having
79 * updated the counters before a task is removed from the tasklist of
80 * the process by __unhash_process.
81 */
82 __unhash_process(p);
83 151
84 /* 152 /*
85 * If we are the last non-leader member of the thread 153 * If we are the last non-leader member of the thread
@@ -107,28 +175,13 @@ repeat:
107 spin_unlock(&p->proc_lock); 175 spin_unlock(&p->proc_lock);
108 proc_pid_flush(proc_dentry); 176 proc_pid_flush(proc_dentry);
109 release_thread(p); 177 release_thread(p);
110 put_task_struct(p); 178 call_rcu(&p->rcu, delayed_put_task_struct);
111 179
112 p = leader; 180 p = leader;
113 if (unlikely(zap_leader)) 181 if (unlikely(zap_leader))
114 goto repeat; 182 goto repeat;
115} 183}
116 184
117/* we are using it only for SMP init */
118
119void unhash_process(struct task_struct *p)
120{
121 struct dentry *proc_dentry;
122
123 spin_lock(&p->proc_lock);
124 proc_dentry = proc_pid_unhash(p);
125 write_lock_irq(&tasklist_lock);
126 __unhash_process(p);
127 write_unlock_irq(&tasklist_lock);
128 spin_unlock(&p->proc_lock);
129 proc_pid_flush(proc_dentry);
130}
131
132/* 185/*
133 * This checks not only the pgrp, but falls back on the pid if no 186 * This checks not only the pgrp, but falls back on the pid if no
134 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 187 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +289,10 @@ static void reparent_to_init(void)
236 289
237 ptrace_unlink(current); 290 ptrace_unlink(current);
238 /* Reparent to init */ 291 /* Reparent to init */
239 REMOVE_LINKS(current); 292 remove_parent(current);
240 current->parent = child_reaper; 293 current->parent = child_reaper;
241 current->real_parent = child_reaper; 294 current->real_parent = child_reaper;
242 SET_LINKS(current); 295 add_parent(current);
243 296
244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 297 /* Set the exit signal to SIGCHLD so we signal init on exit */
245 current->exit_signal = SIGCHLD; 298 current->exit_signal = SIGCHLD;
@@ -536,13 +589,13 @@ static void exit_mm(struct task_struct * tsk)
536 mmput(mm); 589 mmput(mm);
537} 590}
538 591
539static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) 592static inline void choose_new_parent(task_t *p, task_t *reaper)
540{ 593{
541 /* 594 /*
542 * Make sure we're not reparenting to ourselves and that 595 * Make sure we're not reparenting to ourselves and that
543 * the parent is not a zombie. 596 * the parent is not a zombie.
544 */ 597 */
545 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); 598 BUG_ON(p == reaper || reaper->exit_state);
546 p->real_parent = reaper; 599 p->real_parent = reaper;
547} 600}
548 601
@@ -567,9 +620,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
567 * anyway, so let go of it. 620 * anyway, so let go of it.
568 */ 621 */
569 p->ptrace = 0; 622 p->ptrace = 0;
570 list_del_init(&p->sibling); 623 remove_parent(p);
571 p->parent = p->real_parent; 624 p->parent = p->real_parent;
572 list_add_tail(&p->sibling, &p->parent->children); 625 add_parent(p);
573 626
574 /* If we'd notified the old parent about this child's death, 627 /* If we'd notified the old parent about this child's death,
575 * also notify the new parent. 628 * also notify the new parent.
@@ -643,7 +696,7 @@ static void forget_original_parent(struct task_struct * father,
643 696
644 if (father == p->real_parent) { 697 if (father == p->real_parent) {
645 /* reparent with a reaper, real father it's us */ 698 /* reparent with a reaper, real father it's us */
646 choose_new_parent(p, reaper, child_reaper); 699 choose_new_parent(p, reaper);
647 reparent_thread(p, father, 0); 700 reparent_thread(p, father, 0);
648 } else { 701 } else {
649 /* reparent ptraced task to its real parent */ 702 /* reparent ptraced task to its real parent */
@@ -664,7 +717,7 @@ static void forget_original_parent(struct task_struct * father,
664 } 717 }
665 list_for_each_safe(_p, _n, &father->ptrace_children) { 718 list_for_each_safe(_p, _n, &father->ptrace_children) {
666 p = list_entry(_p,struct task_struct,ptrace_list); 719 p = list_entry(_p,struct task_struct,ptrace_list);
667 choose_new_parent(p, reaper, child_reaper); 720 choose_new_parent(p, reaper);
668 reparent_thread(p, father, 1); 721 reparent_thread(p, father, 1);
669 } 722 }
670} 723}
@@ -805,7 +858,7 @@ fastcall NORET_TYPE void do_exit(long code)
805 panic("Aiee, killing interrupt handler!"); 858 panic("Aiee, killing interrupt handler!");
806 if (unlikely(!tsk->pid)) 859 if (unlikely(!tsk->pid))
807 panic("Attempted to kill the idle task!"); 860 panic("Attempted to kill the idle task!");
808 if (unlikely(tsk->pid == 1)) 861 if (unlikely(tsk == child_reaper))
809 panic("Attempted to kill init!"); 862 panic("Attempted to kill init!");
810 863
811 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 864 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
@@ -852,6 +905,14 @@ fastcall NORET_TYPE void do_exit(long code)
852 exit_itimers(tsk->signal); 905 exit_itimers(tsk->signal);
853 acct_process(code); 906 acct_process(code);
854 } 907 }
908 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT
911 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk);
913#endif
914 if (unlikely(tsk->audit_context))
915 audit_free(tsk);
855 exit_mm(tsk); 916 exit_mm(tsk);
856 917
857 exit_sem(tsk); 918 exit_sem(tsk);
@@ -884,6 +945,9 @@ fastcall NORET_TYPE void do_exit(long code)
884 if (tsk->io_context) 945 if (tsk->io_context)
885 exit_io_context(); 946 exit_io_context();
886 947
948 if (tsk->splice_pipe)
949 __free_pipe_info(tsk->splice_pipe);
950
887 /* PF_DEAD causes final put_task_struct after we schedule. */ 951 /* PF_DEAD causes final put_task_struct after we schedule. */
888 preempt_disable(); 952 preempt_disable();
889 BUG_ON(tsk->flags & PF_DEAD); 953 BUG_ON(tsk->flags & PF_DEAD);
@@ -912,13 +976,6 @@ asmlinkage long sys_exit(int error_code)
912 do_exit((error_code&0xff)<<8); 976 do_exit((error_code&0xff)<<8);
913} 977}
914 978
915task_t fastcall *next_thread(const task_t *p)
916{
917 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
918}
919
920EXPORT_SYMBOL(next_thread);
921
922/* 979/*
923 * Take down every thread in the group. This is called by fatal signals 980 * Take down every thread in the group. This is called by fatal signals
924 * as well as by sys_exit_group (below). 981 * as well as by sys_exit_group (below).
@@ -933,7 +990,6 @@ do_group_exit(int exit_code)
933 else if (!thread_group_empty(current)) { 990 else if (!thread_group_empty(current)) {
934 struct signal_struct *const sig = current->signal; 991 struct signal_struct *const sig = current->signal;
935 struct sighand_struct *const sighand = current->sighand; 992 struct sighand_struct *const sighand = current->sighand;
936 read_lock(&tasklist_lock);
937 spin_lock_irq(&sighand->siglock); 993 spin_lock_irq(&sighand->siglock);
938 if (sig->flags & SIGNAL_GROUP_EXIT) 994 if (sig->flags & SIGNAL_GROUP_EXIT)
939 /* Another thread got here before we took the lock. */ 995 /* Another thread got here before we took the lock. */
@@ -943,7 +999,6 @@ do_group_exit(int exit_code)
943 zap_other_threads(current); 999 zap_other_threads(current);
944 } 1000 }
945 spin_unlock_irq(&sighand->siglock); 1001 spin_unlock_irq(&sighand->siglock);
946 read_unlock(&tasklist_lock);
947 } 1002 }
948 1003
949 do_exit(exit_code); 1004 do_exit(exit_code);
@@ -1273,7 +1328,7 @@ bail_ref:
1273 1328
1274 /* move to end of parent's list to avoid starvation */ 1329 /* move to end of parent's list to avoid starvation */
1275 remove_parent(p); 1330 remove_parent(p);
1276 add_parent(p, p->parent); 1331 add_parent(p);
1277 1332
1278 write_unlock_irq(&tasklist_lock); 1333 write_unlock_irq(&tasklist_lock);
1279 1334
diff --git a/kernel/extable.c b/kernel/extable.c
index 7501b531ce..7fe2628553 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 40 return e;
41} 41}
42 42
43static int core_kernel_text(unsigned long addr) 43int core_kernel_text(unsigned long addr)
44{ 44{
45 if (addr >= (unsigned long)_stext && 45 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index e0a2b449de..ac8100e308 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
84#endif 84#endif
85 85
86/* SLAB cache for signal_struct structures (tsk->signal) */ 86/* SLAB cache for signal_struct structures (tsk->signal) */
87kmem_cache_t *signal_cachep; 87static kmem_cache_t *signal_cachep;
88 88
89/* SLAB cache for sighand_struct structures (tsk->sighand) */ 89/* SLAB cache for sighand_struct structures (tsk->sighand) */
90kmem_cache_t *sighand_cachep; 90kmem_cache_t *sighand_cachep;
@@ -108,16 +108,12 @@ void free_task(struct task_struct *tsk)
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
110 110
111void __put_task_struct_cb(struct rcu_head *rhp) 111void __put_task_struct(struct task_struct *tsk)
112{ 112{
113 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
114
115 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 113 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
116 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
117 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
118 116
119 if (unlikely(tsk->audit_context))
120 audit_free(tsk);
121 security_task_free(tsk); 117 security_task_free(tsk);
122 free_uid(tsk->user); 118 free_uid(tsk->user);
123 put_group_info(tsk->group_info); 119 put_group_info(tsk->group_info);
@@ -182,6 +178,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
182 atomic_set(&tsk->usage,2); 178 atomic_set(&tsk->usage,2);
183 atomic_set(&tsk->fs_excl, 0); 179 atomic_set(&tsk->fs_excl, 0);
184 tsk->btrace_seq = 0; 180 tsk->btrace_seq = 0;
181 tsk->splice_pipe = NULL;
185 return tsk; 182 return tsk;
186} 183}
187 184
@@ -721,7 +718,7 @@ out_release:
721 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 718 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
722 free_fd_array(new_fdt->fd, new_fdt->max_fds); 719 free_fd_array(new_fdt->fd, new_fdt->max_fds);
723 kmem_cache_free(files_cachep, newf); 720 kmem_cache_free(files_cachep, newf);
724 goto out; 721 return NULL;
725} 722}
726 723
727static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 724static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -786,14 +783,6 @@ int unshare_files(void)
786 783
787EXPORT_SYMBOL(unshare_files); 784EXPORT_SYMBOL(unshare_files);
788 785
789void sighand_free_cb(struct rcu_head *rhp)
790{
791 struct sighand_struct *sp;
792
793 sp = container_of(rhp, struct sighand_struct, rcu);
794 kmem_cache_free(sighand_cachep, sp);
795}
796
797static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 786static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
798{ 787{
799 struct sighand_struct *sig; 788 struct sighand_struct *sig;
@@ -806,12 +795,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
806 rcu_assign_pointer(tsk->sighand, sig); 795 rcu_assign_pointer(tsk->sighand, sig);
807 if (!sig) 796 if (!sig)
808 return -ENOMEM; 797 return -ENOMEM;
809 spin_lock_init(&sig->siglock);
810 atomic_set(&sig->count, 1); 798 atomic_set(&sig->count, 1);
811 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 799 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
812 return 0; 800 return 0;
813} 801}
814 802
803void __cleanup_sighand(struct sighand_struct *sighand)
804{
805 if (atomic_dec_and_test(&sighand->count))
806 kmem_cache_free(sighand_cachep, sighand);
807}
808
815static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 809static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
816{ 810{
817 struct signal_struct *sig; 811 struct signal_struct *sig;
@@ -881,6 +875,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
881 return 0; 875 return 0;
882} 876}
883 877
878void __cleanup_signal(struct signal_struct *sig)
879{
880 exit_thread_group_keys(sig);
881 kmem_cache_free(signal_cachep, sig);
882}
883
884static inline void cleanup_signal(struct task_struct *tsk)
885{
886 struct signal_struct *sig = tsk->signal;
887
888 atomic_dec(&sig->live);
889
890 if (atomic_dec_and_test(&sig->count))
891 __cleanup_signal(sig);
892}
893
884static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 894static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
885{ 895{
886 unsigned long new_flags = p->flags; 896 unsigned long new_flags = p->flags;
@@ -1061,7 +1071,10 @@ static task_t *copy_process(unsigned long clone_flags,
1061 * Clear TID on mm_release()? 1071 * Clear TID on mm_release()?
1062 */ 1072 */
1063 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1073 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1064 1074 p->robust_list = NULL;
1075#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL;
1077#endif
1065 /* 1078 /*
1066 * sigaltstack should be cleared when sharing the same VM 1079 * sigaltstack should be cleared when sharing the same VM
1067 */ 1080 */
@@ -1092,6 +1105,7 @@ static task_t *copy_process(unsigned long clone_flags,
1092 * We dont wake it up yet. 1105 * We dont wake it up yet.
1093 */ 1106 */
1094 p->group_leader = p; 1107 p->group_leader = p;
1108 INIT_LIST_HEAD(&p->thread_group);
1095 INIT_LIST_HEAD(&p->ptrace_children); 1109 INIT_LIST_HEAD(&p->ptrace_children);
1096 INIT_LIST_HEAD(&p->ptrace_list); 1110 INIT_LIST_HEAD(&p->ptrace_list);
1097 1111
@@ -1115,16 +1129,6 @@ static task_t *copy_process(unsigned long clone_flags,
1115 !cpu_online(task_cpu(p)))) 1129 !cpu_online(task_cpu(p))))
1116 set_task_cpu(p, smp_processor_id()); 1130 set_task_cpu(p, smp_processor_id());
1117 1131
1118 /*
1119 * Check for pending SIGKILL! The new thread should not be allowed
1120 * to slip out of an OOM kill. (or normal SIGKILL.)
1121 */
1122 if (sigismember(&current->pending.signal, SIGKILL)) {
1123 write_unlock_irq(&tasklist_lock);
1124 retval = -EINTR;
1125 goto bad_fork_cleanup_namespace;
1126 }
1127
1128 /* CLONE_PARENT re-uses the old parent */ 1132 /* CLONE_PARENT re-uses the old parent */
1129 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1133 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1130 p->real_parent = current->real_parent; 1134 p->real_parent = current->real_parent;
@@ -1133,6 +1137,23 @@ static task_t *copy_process(unsigned long clone_flags,
1133 p->parent = p->real_parent; 1137 p->parent = p->real_parent;
1134 1138
1135 spin_lock(&current->sighand->siglock); 1139 spin_lock(&current->sighand->siglock);
1140
1141 /*
1142 * Process group and session signals need to be delivered to just the
1143 * parent before the fork or both the parent and the child after the
1144 * fork. Restart if a signal comes in before we add the new process to
1145 * it's process group.
1146 * A fatal signal pending means that current will exit, so the new
1147 * thread can't slip out of an OOM kill (or normal SIGKILL).
1148 */
1149 recalc_sigpending();
1150 if (signal_pending(current)) {
1151 spin_unlock(&current->sighand->siglock);
1152 write_unlock_irq(&tasklist_lock);
1153 retval = -ERESTARTNOINTR;
1154 goto bad_fork_cleanup_namespace;
1155 }
1156
1136 if (clone_flags & CLONE_THREAD) { 1157 if (clone_flags & CLONE_THREAD) {
1137 /* 1158 /*
1138 * Important: if an exit-all has been started then 1159 * Important: if an exit-all has been started then
@@ -1145,17 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
1145 retval = -EAGAIN; 1166 retval = -EAGAIN;
1146 goto bad_fork_cleanup_namespace; 1167 goto bad_fork_cleanup_namespace;
1147 } 1168 }
1148 p->group_leader = current->group_leader;
1149 1169
1150 if (current->signal->group_stop_count > 0) { 1170 p->group_leader = current->group_leader;
1151 /* 1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1152 * There is an all-stop in progress for the group.
1153 * We ourselves will stop as soon as we check signals.
1154 * Make the new thread part of that group stop too.
1155 */
1156 current->signal->group_stop_count++;
1157 set_tsk_thread_flag(p, TIF_SIGPENDING);
1158 }
1159 1172
1160 if (!cputime_eq(current->signal->it_virt_expires, 1173 if (!cputime_eq(current->signal->it_virt_expires,
1161 cputime_zero) || 1174 cputime_zero) ||
@@ -1178,23 +1191,25 @@ static task_t *copy_process(unsigned long clone_flags,
1178 */ 1191 */
1179 p->ioprio = current->ioprio; 1192 p->ioprio = current->ioprio;
1180 1193
1181 SET_LINKS(p); 1194 if (likely(p->pid)) {
1182 if (unlikely(p->ptrace & PT_PTRACED)) 1195 add_parent(p);
1183 __ptrace_link(p, current->parent); 1196 if (unlikely(p->ptrace & PT_PTRACED))
1184 1197 __ptrace_link(p, current->parent);
1185 if (thread_group_leader(p)) { 1198
1186 p->signal->tty = current->signal->tty; 1199 if (thread_group_leader(p)) {
1187 p->signal->pgrp = process_group(current); 1200 p->signal->tty = current->signal->tty;
1188 p->signal->session = current->signal->session; 1201 p->signal->pgrp = process_group(current);
1189 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1202 p->signal->session = current->signal->session;
1190 attach_pid(p, PIDTYPE_SID, p->signal->session); 1203 attach_pid(p, PIDTYPE_PGID, process_group(p));
1191 if (p->pid) 1204 attach_pid(p, PIDTYPE_SID, p->signal->session);
1205
1206 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1192 __get_cpu_var(process_counts)++; 1207 __get_cpu_var(process_counts)++;
1208 }
1209 attach_pid(p, PIDTYPE_PID, p->pid);
1210 nr_threads++;
1193 } 1211 }
1194 attach_pid(p, PIDTYPE_TGID, p->tgid);
1195 attach_pid(p, PIDTYPE_PID, p->pid);
1196 1212
1197 nr_threads++;
1198 total_forks++; 1213 total_forks++;
1199 spin_unlock(&current->sighand->siglock); 1214 spin_unlock(&current->sighand->siglock);
1200 write_unlock_irq(&tasklist_lock); 1215 write_unlock_irq(&tasklist_lock);
@@ -1209,9 +1224,9 @@ bad_fork_cleanup_mm:
1209 if (p->mm) 1224 if (p->mm)
1210 mmput(p->mm); 1225 mmput(p->mm);
1211bad_fork_cleanup_signal: 1226bad_fork_cleanup_signal:
1212 exit_signal(p); 1227 cleanup_signal(p);
1213bad_fork_cleanup_sighand: 1228bad_fork_cleanup_sighand:
1214 exit_sighand(p); 1229 __cleanup_sighand(p->sighand);
1215bad_fork_cleanup_fs: 1230bad_fork_cleanup_fs:
1216 exit_fs(p); /* blocking */ 1231 exit_fs(p); /* blocking */
1217bad_fork_cleanup_files: 1232bad_fork_cleanup_files:
@@ -1258,7 +1273,7 @@ task_t * __devinit fork_idle(int cpu)
1258 if (!task) 1273 if (!task)
1259 return ERR_PTR(-ENOMEM); 1274 return ERR_PTR(-ENOMEM);
1260 init_idle(task, cpu); 1275 init_idle(task, cpu);
1261 unhash_process(task); 1276
1262 return task; 1277 return task;
1263} 1278}
1264 1279
@@ -1293,17 +1308,19 @@ long do_fork(unsigned long clone_flags,
1293{ 1308{
1294 struct task_struct *p; 1309 struct task_struct *p;
1295 int trace = 0; 1310 int trace = 0;
1296 long pid = alloc_pidmap(); 1311 struct pid *pid = alloc_pid();
1312 long nr;
1297 1313
1298 if (pid < 0) 1314 if (!pid)
1299 return -EAGAIN; 1315 return -EAGAIN;
1316 nr = pid->nr;
1300 if (unlikely(current->ptrace)) { 1317 if (unlikely(current->ptrace)) {
1301 trace = fork_traceflag (clone_flags); 1318 trace = fork_traceflag (clone_flags);
1302 if (trace) 1319 if (trace)
1303 clone_flags |= CLONE_PTRACE; 1320 clone_flags |= CLONE_PTRACE;
1304 } 1321 }
1305 1322
1306 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1323 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1307 /* 1324 /*
1308 * Do this prior waking up the new thread - the thread pointer 1325 * Do this prior waking up the new thread - the thread pointer
1309 * might get invalid after that point, if the thread exits quickly. 1326 * might get invalid after that point, if the thread exits quickly.
@@ -1330,7 +1347,7 @@ long do_fork(unsigned long clone_flags,
1330 p->state = TASK_STOPPED; 1347 p->state = TASK_STOPPED;
1331 1348
1332 if (unlikely (trace)) { 1349 if (unlikely (trace)) {
1333 current->ptrace_message = pid; 1350 current->ptrace_message = nr;
1334 ptrace_notify ((trace << 8) | SIGTRAP); 1351 ptrace_notify ((trace << 8) | SIGTRAP);
1335 } 1352 }
1336 1353
@@ -1340,21 +1357,31 @@ long do_fork(unsigned long clone_flags,
1340 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1357 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1341 } 1358 }
1342 } else { 1359 } else {
1343 free_pidmap(pid); 1360 free_pid(pid);
1344 pid = PTR_ERR(p); 1361 nr = PTR_ERR(p);
1345 } 1362 }
1346 return pid; 1363 return nr;
1347} 1364}
1348 1365
1349#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1366#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1350#define ARCH_MIN_MMSTRUCT_ALIGN 0 1367#define ARCH_MIN_MMSTRUCT_ALIGN 0
1351#endif 1368#endif
1352 1369
1370static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1371{
1372 struct sighand_struct *sighand = data;
1373
1374 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1375 SLAB_CTOR_CONSTRUCTOR)
1376 spin_lock_init(&sighand->siglock);
1377}
1378
1353void __init proc_caches_init(void) 1379void __init proc_caches_init(void)
1354{ 1380{
1355 sighand_cachep = kmem_cache_create("sighand_cache", 1381 sighand_cachep = kmem_cache_create("sighand_cache",
1356 sizeof(struct sighand_struct), 0, 1382 sizeof(struct sighand_struct), 0,
1357 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1383 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1384 sighand_ctor, NULL);
1358 signal_cachep = kmem_cache_create("signal_cache", 1385 signal_cachep = kmem_cache_create("signal_cache",
1359 sizeof(struct signal_struct), 0, 1386 sizeof(struct signal_struct), 0,
1360 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1387 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
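The combination used here — SLAB_DESTROY_BY_RCU plus a constructor that initializes siglock exactly once per slab object — is what makes lockless dereferences of tsk->sighand safe: the object may be recycled for another sighand_struct, but its memory stays type-stable until an RCU grace period has passed, so a reader can take the lock and then re-check the pointer. A hedged sketch of that reader-side pattern (the function name is illustrative, not something this patch adds):

#include <linux/sched.h>
#include <linux/rcupdate.h>

static struct sighand_struct *lock_sighand_sketch(struct task_struct *tsk,
                                                  unsigned long *flags)
{
        struct sighand_struct *sighand;

        for (;;) {
                rcu_read_lock();
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(sighand == NULL)) {
                        rcu_read_unlock();
                        break;          /* task is past __exit_signal() */
                }

                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == tsk->sighand)) {
                        rcu_read_unlock();
                        break;          /* still attached: keep the lock */
                }
                /* the object was recycled under us: drop it and retry */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
                rcu_read_unlock();
        }
        return sighand;
}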
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f9780..5699c51205 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
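The list described above lives entirely in userspace; the kernel only records its address per thread. A hedged userspace sketch of the registration step follows — in practice glibc/NPTL does this for every thread, and the struct layout and SYS_set_robust_list constant are taken from the headers this series introduces (older userspace headers would need the raw __NR_set_robust_list number instead).

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

/* Hedged sketch: register an initially empty per-thread robust list.
 * An empty list is circular: head.list.next points back at the head. */
static struct robust_list_head robust_head = {
        .list            = { .next = &robust_head.list },
        .futex_offset    = 0,   /* offset from a list entry to its futex word */
        .list_op_pending = NULL,
};

static long register_robust_list(void)
{
        return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}

At exit time the kernel walks whatever the registered head points to (exit_robust_list() below), marks each futex word still owned by the dying thread with FUTEX_OWNER_DIED, and wakes one waiter if FUTEX_WAITERS is set.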
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{
918 u32 uval;
919
920retry:
921 if (get_user(uval, uaddr))
922 return -1;
923
924 if ((uval & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval)
937 goto retry;
938
939 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1);
941 }
942 return 0;
943}
944
945/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters.
948 *
949 * We silently return on any sign of list-walking problem.
950 */
951void exit_robust_list(struct task_struct *curr)
952{
953 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT;
956 unsigned long futex_offset;
957
958 /*
959 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()):
961 */
962 if (get_user(entry, &head->list.next))
963 return;
964 /*
965 * Fetch the relative futex offset:
966 */
967 if (get_user(futex_offset, &head->futex_offset))
968 return;
969 /*
970 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists:
972 */
973 if (get_user(pending, &head->list_op_pending))
974 return;
975 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr);
977
978 while (entry != &head->list) {
979 /*
980 * A pending lock might already be on the list, so
981 * don't process it twice:
982 */
983 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset,
985 curr))
986 return;
987 /*
988 * Fetch the next entry in the list:
989 */
990 if (get_user(entry, &entry->next))
991 return;
992 /*
993 * Avoid excessively long or circular lists:
994 */
995 if (!--limit)
996 break;
997
998 cond_resched();
999 }
1000}
1001
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1003 unsigned long uaddr2, int val2, int val3)
834{ 1004{
@@ -869,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
869 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
870 int val2 = 0; 1040 int val2 = 0;
871 1041
872 if ((op == FUTEX_WAIT) && utime) { 1042 if (utime && (op == FUTEX_WAIT)) {
873 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1043 if (copy_from_user(&t, utime, sizeof(t)) != 0)
874 return -EFAULT; 1044 return -EFAULT;
1045 if (!timespec_valid(&t))
1046 return -EINVAL;
875 timeout = timespec_to_jiffies(&t) + 1; 1047 timeout = timespec_to_jiffies(&t) + 1;
876 } 1048 }
877 /* 1049 /*
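Seen from userspace, the new timespec_valid() check turns a malformed relative timeout into an explicit EINVAL instead of silently converting it to jiffies. A hedged sketch (raw syscall shown only for illustration; uaddr and expected are whatever futex word and value the caller is waiting on):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

/* Hedged sketch: FUTEX_WAIT with a relative timeout. With the change
 * above, tv_nsec >= 1000000000 or a negative tv_sec now fails with
 * EINVAL before the value is turned into a jiffies timeout. */
static long futex_wait(int *uaddr, int expected, const struct timespec *rel)
{
        return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, rel, NULL, 0);
}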
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 0000000000..1ab6a0ea3d
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,144 @@
1/*
2 * linux/kernel/futex_compat.c
3 *
4 * Futex compatibility routines.
5 *
6 * Copyright 2006, Red Hat, Inc., Ingo Molnar
7 */
8
9#include <linux/linkage.h>
10#include <linux/compat.h>
11#include <linux/futex.h>
12
13#include <asm/uaccess.h>
14
15/*
16 * Walk curr->compat_robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters.
18 *
19 * We silently return on any sign of list-walking problem.
20 */
21void compat_exit_robust_list(struct task_struct *curr)
22{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending;
25 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset;
28
29 /*
30 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()):
32 */
33 if (get_user(uentry, &head->list.next))
34 return;
35 entry = compat_ptr(uentry);
36 /*
37 * Fetch the relative futex offset:
38 */
39 if (get_user(futex_offset, &head->futex_offset))
40 return;
41 /*
42 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists:
44 */
45 if (get_user(upending, &head->list_op_pending))
46 return;
47 pending = compat_ptr(upending);
48 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr);
50
51 while (compat_ptr(uentry) != &head->list) {
52 /*
53 * A pending lock might already be on the list, so
54 * don't process it twice:
55 */
56 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset,
58 curr))
59 return;
60
61 /*
62 * Fetch the next entry in the list:
63 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next))
65 return;
66 entry = compat_ptr(uentry);
67 /*
68 * Avoid excessively long or circular lists:
69 */
70 if (!--limit)
71 break;
72
73 cond_resched();
74 }
75}
76
77asmlinkage long
78compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
79 compat_size_t len)
80{
81 if (unlikely(len != sizeof(*head)))
82 return -EINVAL;
83
84 current->compat_robust_list = head;
85
86 return 0;
87}
88
89asmlinkage long
90compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
91 compat_size_t __user *len_ptr)
92{
93 struct compat_robust_list_head *head;
94 unsigned long ret;
95
96 if (!pid)
97 head = current->compat_robust_list;
98 else {
99 struct task_struct *p;
100
101 ret = -ESRCH;
102 read_lock(&tasklist_lock);
103 p = find_task_by_pid(pid);
104 if (!p)
105 goto err_unlock;
106 ret = -EPERM;
107 if ((current->euid != p->euid) && (current->euid != p->uid) &&
108 !capable(CAP_SYS_PTRACE))
109 goto err_unlock;
110 head = p->compat_robust_list;
111 read_unlock(&tasklist_lock);
112 }
113
114 if (put_user(sizeof(*head), len_ptr))
115 return -EFAULT;
116 return put_user(ptr_to_compat(head), head_ptr);
117
118err_unlock:
119 read_unlock(&tasklist_lock);
120
121 return ret;
122}
123
124asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
125 struct compat_timespec __user *utime, u32 __user *uaddr2,
126 u32 val3)
127{
128 struct timespec t;
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0;
131
132 if (utime && (op == FUTEX_WAIT)) {
133 if (get_compat_timespec(&t, utime))
134 return -EFAULT;
135 if (!timespec_valid(&t))
136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1;
138 }
139 if (op >= FUTEX_REQUEUE)
140 val2 = (int) (unsigned long) utime;
141
142 return do_futex((unsigned long)uaddr, op, val, timeout,
143 (unsigned long)uaddr2, val2, val3);
144}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0237a556eb..01fa2ae98a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -456,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
456 456
457 return ret; 457 return ret;
458} 458}
459EXPORT_SYMBOL_GPL(hrtimer_start);
459 460
460/** 461/**
461 * hrtimer_try_to_cancel - try to deactivate a timer 462 * hrtimer_try_to_cancel - try to deactivate a timer
@@ -484,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
484 return ret; 485 return ret;
485 486
486} 487}
488EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
487 489
488/** 490/**
489 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 491 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
@@ -501,8 +503,10 @@ int hrtimer_cancel(struct hrtimer *timer)
501 503
502 if (ret >= 0) 504 if (ret >= 0)
503 return ret; 505 return ret;
506 cpu_relax();
504 } 507 }
505} 508}
509EXPORT_SYMBOL_GPL(hrtimer_cancel);
506 510
507/** 511/**
508 * hrtimer_get_remaining - get remaining time for the timer 512 * hrtimer_get_remaining - get remaining time for the timer
@@ -521,6 +525,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
521 525
522 return rem; 526 return rem;
523} 527}
528EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
524 529
525#ifdef CONFIG_NO_IDLE_HZ 530#ifdef CONFIG_NO_IDLE_HZ
526/** 531/**
@@ -579,6 +584,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
579 timer->base = &bases[clock_id]; 584 timer->base = &bases[clock_id];
580 timer->node.rb_parent = HRTIMER_INACTIVE; 585 timer->node.rb_parent = HRTIMER_INACTIVE;
581} 586}
587EXPORT_SYMBOL_GPL(hrtimer_init);
582 588
583/** 589/**
584 * hrtimer_get_res - get the timer resolution for a clock 590 * hrtimer_get_res - get the timer resolution for a clock
@@ -598,6 +604,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
598 604
599 return 0; 605 return 0;
600} 606}
607EXPORT_SYMBOL_GPL(hrtimer_get_res);
601 608
602/* 609/*
603 * Expire the per base hrtimer-queue: 610 * Expire the per base hrtimer-queue:
@@ -606,6 +613,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
606{ 613{
607 struct rb_node *node; 614 struct rb_node *node;
608 615
616 if (!base->first)
617 return;
618
609 if (base->get_softirq_time) 619 if (base->get_softirq_time)
610 base->softirq_time = base->get_softirq_time(); 620 base->softirq_time = base->get_softirq_time();
611 621
@@ -655,29 +665,28 @@ void hrtimer_run_queues(void)
655/* 665/*
656 * Sleep related functions: 666 * Sleep related functions:
657 */ 667 */
658 668static int hrtimer_wakeup(struct hrtimer *timer)
659struct sleep_hrtimer {
660 struct hrtimer timer;
661 struct task_struct *task;
662 int expired;
663};
664
665static int nanosleep_wakeup(struct hrtimer *timer)
666{ 669{
667 struct sleep_hrtimer *t = 670 struct hrtimer_sleeper *t =
668 container_of(timer, struct sleep_hrtimer, timer); 671 container_of(timer, struct hrtimer_sleeper, timer);
672 struct task_struct *task = t->task;
669 673
670 t->expired = 1; 674 t->task = NULL;
671 wake_up_process(t->task); 675 if (task)
676 wake_up_process(task);
672 677
673 return HRTIMER_NORESTART; 678 return HRTIMER_NORESTART;
674} 679}
675 680
676static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) 681void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
677{ 682{
678 t->timer.function = nanosleep_wakeup; 683 sl->timer.function = hrtimer_wakeup;
679 t->task = current; 684 sl->task = task;
680 t->expired = 0; 685}
686
687static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
688{
689 hrtimer_init_sleeper(t, current);
681 690
682 do { 691 do {
683 set_current_state(TASK_INTERRUPTIBLE); 692 set_current_state(TASK_INTERRUPTIBLE);
@@ -685,18 +694,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
685 694
686 schedule(); 695 schedule();
687 696
688 if (unlikely(!t->expired)) { 697 hrtimer_cancel(&t->timer);
689 hrtimer_cancel(&t->timer); 698 mode = HRTIMER_ABS;
690 mode = HRTIMER_ABS; 699
691 } 700 } while (t->task && !signal_pending(current));
692 } while (!t->expired && !signal_pending(current));
693 701
694 return t->expired; 702 return t->task == NULL;
695} 703}
696 704
697static long __sched nanosleep_restart(struct restart_block *restart) 705static long __sched nanosleep_restart(struct restart_block *restart)
698{ 706{
699 struct sleep_hrtimer t; 707 struct hrtimer_sleeper t;
700 struct timespec __user *rmtp; 708 struct timespec __user *rmtp;
701 struct timespec tu; 709 struct timespec tu;
702 ktime_t time; 710 ktime_t time;
@@ -729,7 +737,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
729 const enum hrtimer_mode mode, const clockid_t clockid) 737 const enum hrtimer_mode mode, const clockid_t clockid)
730{ 738{
731 struct restart_block *restart; 739 struct restart_block *restart;
732 struct sleep_hrtimer t; 740 struct hrtimer_sleeper t;
733 struct timespec tu; 741 struct timespec tu;
734 ktime_t rem; 742 ktime_t rem;
735 743
@@ -834,7 +842,7 @@ static void migrate_hrtimers(int cpu)
834} 842}
835#endif /* CONFIG_HOTPLUG_CPU */ 843#endif /* CONFIG_HOTPLUG_CPU */
836 844
837static int __devinit hrtimer_cpu_notify(struct notifier_block *self, 845static int hrtimer_cpu_notify(struct notifier_block *self,
838 unsigned long action, void *hcpu) 846 unsigned long action, void *hcpu)
839{ 847{
840 long cpu = (long)hcpu; 848 long cpu = (long)hcpu;
@@ -858,7 +866,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
858 return NOTIFY_OK; 866 return NOTIFY_OK;
859} 867}
860 868
861static struct notifier_block __devinitdata hrtimers_nb = { 869static struct notifier_block hrtimers_nb = {
862 .notifier_call = hrtimer_cpu_notify, 870 .notifier_call = hrtimer_cpu_notify,
863}; 871};
864 872
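The hrtimer_sleeper introduced here bundles the timer with the task to wake, and hrtimer_wakeup() clears t.task when it fires, which is how callers distinguish expiry from an interruption. A hedged sketch of a kernel-side caller, modelled on do_nanosleep() above (the function itself is illustrative, not part of this patch):

/* Hedged sketch: sleep until an absolute CLOCK_MONOTONIC expiry using
 * the new hrtimer_sleeper. Returns 1 if the timer expired, 0 if the
 * sleep was interrupted by a signal. */
static int sleep_until_sketch(ktime_t expires)
{
        struct hrtimer_sleeper t;

        hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_ABS);
        t.timer.expires = expires;
        hrtimer_init_sleeper(&t, current);

        do {
                set_current_state(TASK_INTERRUPTIBLE);
                hrtimer_start(&t.timer, t.timer.expires, HRTIMER_ABS);

                schedule();

                hrtimer_cancel(&t.timer);
        } while (t.task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);
        return t.task == NULL;
}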
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2b33f852be..9f77f50d81 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,4 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o migration.o 2obj-y := handle.o manage.o spurious.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac766ad573..1279e34995 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -246,8 +246,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
246 246
247mismatch: 247mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags); 248 spin_unlock_irqrestore(&desc->lock, flags);
249 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); 249 if (!(new->flags & SA_PROBEIRQ)) {
250 dump_stack(); 250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
251 dump_stack();
252 }
251 return -EBUSY; 253 return -EBUSY;
252} 254}
253 255
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 52a8655fa0..134f9f2e0e 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,6 +1,5 @@
1#include <linux/irq.h>
2 1
3#if defined(CONFIG_GENERIC_PENDING_IRQ) 2#include <linux/irq.h>
4 3
5void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
6{ 5{
@@ -61,5 +60,3 @@ void move_native_irq(int irq)
61 } 60 }
62 cpus_clear(pending_irq_cpumask[irq]); 61 cpus_clear(pending_irq_cpumask[irq]);
63} 62}
64
65#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063a..20a997c73c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
170 sa.sa.sa_handler = SIG_IGN; 170 sa.sa.sa_handler = SIG_IGN;
171 sa.sa.sa_flags = 0; 171 sa.sa.sa_flags = 0;
172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); 172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
173 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); 173 do_sigaction(SIGCHLD, &sa, NULL);
174 allow_signal(SIGCHLD); 174 allow_signal(SIGCHLD);
175 175
176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1156eb0977..1fbf466a29 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -585,6 +585,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
585 int i; 585 int i;
586 586
587 rp->kp.pre_handler = pre_handler_kretprobe; 587 rp->kp.pre_handler = pre_handler_kretprobe;
588 rp->kp.post_handler = NULL;
589 rp->kp.fault_handler = NULL;
590 rp->kp.break_handler = NULL;
588 591
589 /* Pre-allocate memory for max kretprobe instances */ 592 /* Pre-allocate memory for max kretprobe instances */
590 if (rp->maxactive <= 0) { 593 if (rp->maxactive <= 0) {
diff --git a/kernel/module.c b/kernel/module.c
index ddfe45ac2f..bbe04862e1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -64,26 +64,17 @@ static DEFINE_SPINLOCK(modlist_lock);
64static DEFINE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
65static LIST_HEAD(modules); 65static LIST_HEAD(modules);
66 66
67static DEFINE_MUTEX(notify_mutex); 67static BLOCKING_NOTIFIER_HEAD(module_notify_list);
68static struct notifier_block * module_notify_list;
69 68
70int register_module_notifier(struct notifier_block * nb) 69int register_module_notifier(struct notifier_block * nb)
71{ 70{
72 int err; 71 return blocking_notifier_chain_register(&module_notify_list, nb);
73 mutex_lock(&notify_mutex);
74 err = notifier_chain_register(&module_notify_list, nb);
75 mutex_unlock(&notify_mutex);
76 return err;
77} 72}
78EXPORT_SYMBOL(register_module_notifier); 73EXPORT_SYMBOL(register_module_notifier);
79 74
80int unregister_module_notifier(struct notifier_block * nb) 75int unregister_module_notifier(struct notifier_block * nb)
81{ 76{
82 int err; 77 return blocking_notifier_chain_unregister(&module_notify_list, nb);
83 mutex_lock(&notify_mutex);
84 err = notifier_chain_unregister(&module_notify_list, nb);
85 mutex_unlock(&notify_mutex);
86 return err;
87} 78}
88EXPORT_SYMBOL(unregister_module_notifier); 79EXPORT_SYMBOL(unregister_module_notifier);
89 80
@@ -136,7 +127,7 @@ extern const unsigned long __start___kcrctab_gpl_future[];
136#ifndef CONFIG_MODVERSIONS 127#ifndef CONFIG_MODVERSIONS
137#define symversion(base, idx) NULL 128#define symversion(base, idx) NULL
138#else 129#else
139#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 130#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
140#endif 131#endif
141 132
142/* lookup symbol in given range of kernel_symbols */ 133/* lookup symbol in given range of kernel_symbols */
@@ -714,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put);
714 705
715void symbol_put_addr(void *addr) 706void symbol_put_addr(void *addr)
716{ 707{
717 unsigned long flags; 708 struct module *modaddr;
718 709
719 spin_lock_irqsave(&modlist_lock, flags); 710 if (core_kernel_text((unsigned long)addr))
720 if (!kernel_text_address((unsigned long)addr)) 711 return;
721 BUG();
722 712
723 module_put(module_text_address((unsigned long)addr)); 713 if (!(modaddr = module_text_address((unsigned long)addr)))
724 spin_unlock_irqrestore(&modlist_lock, flags); 714 BUG();
715 module_put(modaddr);
725} 716}
726EXPORT_SYMBOL_GPL(symbol_put_addr); 717EXPORT_SYMBOL_GPL(symbol_put_addr);
727 718
@@ -1263,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
1263 || strcmp(license, "GPL v2") == 0 1254 || strcmp(license, "GPL v2") == 0
1264 || strcmp(license, "GPL and additional rights") == 0 1255 || strcmp(license, "GPL and additional rights") == 0
1265 || strcmp(license, "Dual BSD/GPL") == 0 1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1266 || strcmp(license, "Dual MPL/GPL") == 0); 1258 || strcmp(license, "Dual MPL/GPL") == 0);
1267} 1259}
1268 1260
@@ -1816,9 +1808,8 @@ sys_init_module(void __user *umod,
1816 /* Drop lock so they can recurse */ 1808 /* Drop lock so they can recurse */
1817 mutex_unlock(&module_mutex); 1809 mutex_unlock(&module_mutex);
1818 1810
1819 mutex_lock(&notify_mutex); 1811 blocking_notifier_call_chain(&module_notify_list,
1820 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1812 MODULE_STATE_COMING, mod);
1821 mutex_unlock(&notify_mutex);
1822 1813
1823 /* Start the module */ 1814 /* Start the module */
1824 if (mod->init != NULL) 1815 if (mod->init != NULL)
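The module.c hunks above drop the private notify_mutex plus open-coded notifier list in favour of BLOCKING_NOTIFIER_HEAD(), whose register/unregister/call helpers carry their own locking. A minimal sketch of a client of the unchanged external interface (the callback name and body are illustrative, not part of this patch):

static int my_module_event(struct notifier_block *nb,
			   unsigned long state, void *data)
{
	struct module *mod = data;

	if (state == MODULE_STATE_COMING)
		printk(KERN_INFO "module %s coming up\n", mod->name);
	return NOTIFY_OK;
}

static struct notifier_block my_module_nb = {
	.notifier_call	= my_module_event,
};

	/* typically called from an __init function */
	register_module_notifier(&my_module_nb);

Existing callers need no source change; only the chain's representation and locking move into the generic notifier code.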
diff --git a/kernel/panic.c b/kernel/panic.c
index acd95adddb..cc2a4c9c36 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,9 +27,8 @@ static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock); 27static DEFINE_SPINLOCK(pause_on_oops_lock);
28 28
29int panic_timeout; 29int panic_timeout;
30EXPORT_SYMBOL(panic_timeout);
31 30
32struct notifier_block *panic_notifier_list; 31ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
33 32
34EXPORT_SYMBOL(panic_notifier_list); 33EXPORT_SYMBOL(panic_notifier_list);
35 34
@@ -97,7 +96,7 @@ NORET_TYPE void panic(const char * fmt, ...)
97 smp_send_stop(); 96 smp_send_stop();
98#endif 97#endif
99 98
100 notifier_call_chain(&panic_notifier_list, 0, buf); 99 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
101 100
102 if (!panic_blink) 101 if (!panic_blink)
103 panic_blink = no_blink; 102 panic_blink = no_blink;
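panic_notifier_list likewise becomes an ATOMIC_NOTIFIER_HEAD, so panic() can walk it via atomic_notifier_call_chain() without taking a blocking lock in a context that may be holding spinlocks or running with interrupts off. A hedged sketch of how a driver would now hook the chain (the callback name is illustrative):

static int my_panic_event(struct notifier_block *nb,
			  unsigned long event, void *msg)
{
	/* msg points at the formatted panic message buffer */
	return NOTIFY_DONE;
}

static struct notifier_block my_panic_nb = {
	.notifier_call	= my_panic_event,
};

	atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);

Note that EXPORT_SYMBOL(panic_timeout) is also removed here, so modules can no longer reference that variable directly.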
diff --git a/kernel/params.c b/kernel/params.c
index 9de637a5c8..af43ecdc8d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34static inline int dash2underscore(char c) 34static inline char dash2underscore(char c)
35{ 35{
36 if (c == '-') 36 if (c == '-')
37 return '_'; 37 return '_';
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc072469..eeb836b65c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
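struct pid is now a stand-alone, reference-counted object: alloc_pid() hashes it under pidmap_lock with interrupts disabled (see the comment above), free_pid() unhashes it and defers the final put through RCU, and find_pid() walks the hash under RCU without taking a reference. A sketch of the resulting lookup rules, assuming get_pid() is the reference-taking helper added by the matching header change (not shown in this kernel/ diff):

	struct pid *pid;

	rcu_read_lock();
	pid = find_pid(nr);		/* RCU-protected, no reference held */
	if (pid)
		get_pid(pid);		/* pin it before leaving the RCU section */
	rcu_read_unlock();

	/* ... use pid ... */
	put_pid(pid);			/* put_pid(NULL) is a no-op */

find_get_pid() below wraps exactly this pattern.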
@@ -146,105 +220,80 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
154 if (pid == NULL) {
155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162
163 return 0;
164}
165
166static fastcall int __detach_pid(task_t *task, enum pid_type type)
167{
168 struct pid *pid, *pid_next;
169 int nr = 0;
170
171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) {
173 225
174 if (list_empty(&pid->pid_list)) { 226 WARN_ON(!task->pid); /* to be removed soon */
175 nr = pid->nr; 227 WARN_ON(!nr); /* to be removed soon */
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 228
186 list_del_rcu(&pid->pid_list); 229 link = &task->pids[type];
187 pid->nr = 0; 230 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
188 232
189 return nr; 233 return 0;
190} 234}
191 235
192void fastcall detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
193{ 237{
194 int tmp, nr; 238 struct pid_link *link;
239 struct pid *pid;
240 int tmp;
195 241
196 nr = __detach_pid(task, type); 242 link = &task->pids[type];
197 if (!nr) 243 pid = link->pid;
198 return; 244
245 hlist_del_rcu(&link->node);
246 link->pid = NULL;
199 247
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
201 if (tmp != type && find_pid(tmp, nr)) 249 if (!hlist_empty(&pid->tasks[tmp]))
202 return; 250 return;
203 251
204 free_pidmap(nr); 252 free_pid(pid);
205} 253}
206 254
207task_t *find_task_by_pid_type(int type, int nr) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
208{ 256{
209 struct pid *pid; 257 struct task_struct *result = NULL;
210 258 if (pid) {
211 pid = find_pid(type, nr); 259 struct hlist_node *first;
212 if (!pid) 260 first = rcu_dereference(pid->tasks[type].first);
213 return NULL; 261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
214 266
215 return pid_task(&pid->pid_list, type); 267/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
216} 273}
217 274
218EXPORT_SYMBOL(find_task_by_pid_type); 275EXPORT_SYMBOL(find_task_by_pid_type);
219 276
220/* 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
221 * This function switches the PIDs if a non-leader thread calls 278{
222 * sys_execve() - this must be done without releasing the PID. 279 struct task_struct *result;
223 * (which a detach_pid() would eventually do.) 280 rcu_read_lock();
224 */ 281 result = pid_task(pid, type);
225void switch_exec_pids(task_t *leader, task_t *thread) 282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
286}
287
288struct pid *find_get_pid(pid_t nr)
226{ 289{
227 __detach_pid(leader, PIDTYPE_PID); 290 struct pid *pid;
228 __detach_pid(leader, PIDTYPE_TGID); 291
229 __detach_pid(leader, PIDTYPE_PGID); 292 rcu_read_lock();
230 __detach_pid(leader, PIDTYPE_SID); 293 pid = get_pid(find_pid(nr));
231 294 rcu_read_unlock();
232 __detach_pid(thread, PIDTYPE_PID); 295
233 __detach_pid(thread, PIDTYPE_TGID); 296 return pid;
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248} 297}
249 298
250/* 299/*
@@ -254,7 +303,7 @@ void switch_exec_pids(task_t *leader, task_t *thread)
254 */ 303 */
255void __init pidhash_init(void) 304void __init pidhash_init(void)
256{ 305{
257 int i, j, pidhash_size; 306 int i, pidhash_size;
258 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
259 308
260 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -263,30 +312,23 @@ void __init pidhash_init(void)
263 312
264 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
265 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
266 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
267 316
268 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
269 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
270 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
271 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
272 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
273 for (j = 0; j < pidhash_size; j++)
274 INIT_HLIST_HEAD(&pid_hash[i][j]);
275 }
276} 322}
277 323
278void __init pidmap_init(void) 324void __init pidmap_init(void)
279{ 325{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 326 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
327 /* Reserve PID 0. We never call free_pidmap(0) */
283 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
285 330
286 /* 331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
287 * Allocate PID 0, and hash it via all PID types: 332 __alignof__(struct pid),
288 */ 333 SLAB_PANIC, NULL, NULL);
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292} 334}
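pid_task() converts a struct pid back into a task for a given pid type by taking the first entry of the per-type hlist, and is only valid under rcu_read_lock() or a read-held tasklist_lock; get_pid_task() does that and additionally takes a task reference so the result survives past the RCU section. Illustrative use, with error handling kept minimal:

	struct task_struct *tsk;

	tsk = get_pid_task(pid, PIDTYPE_PID);	/* NULL if no task is attached */
	if (tsk) {
		/* ... inspect or signal tsk ... */
		put_task_struct(tsk);
	}

put_task_struct() is the existing counterpart of the get_task_struct() call inside get_pid_task().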
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9fd8d4f035..ce0dfb8f4a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
42 ---help--- 42 ---help---
43 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
44 It doesn't need APM. 44 It doesn't need ACPI or APM.
45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>' 45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
46 (patch for sysvinit needed). 46 (patch for sysvinit needed).
47 47
diff --git a/kernel/power/main.c b/kernel/power/main.c
index ee371f50cc..a6d9ef4600 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,7 +272,7 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
272 if (*s && !strncmp(buf, *s, len)) 272 if (*s && !strncmp(buf, *s, len))
273 break; 273 break;
274 } 274 }
275 if (*s) 275 if (state < PM_SUSPEND_MAX && *s)
276 error = enter_state(state); 276 error = enter_state(state);
277 else 277 else
278 error = -EINVAL; 278 error = -EINVAL;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 0f6908cce1..84063ac8fc 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,25 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
75 return dev; 75 return dev;
76} 76}
77 77
78/**
79 * pm_unregister - unregister a device with power management
80 * @dev: device to unregister
81 *
82 * Remove a device from the power management notification lists. The
83 * dev passed must be a handle previously returned by pm_register.
84 */
85
86void pm_unregister(struct pm_dev *dev)
87{
88 if (dev) {
89 mutex_lock(&pm_devs_lock);
90 list_del(&dev->entry);
91 mutex_unlock(&pm_devs_lock);
92
93 kfree(dev);
94 }
95}
96
97static void __pm_unregister(struct pm_dev *dev) 78static void __pm_unregister(struct pm_dev *dev)
98{ 79{
99 if (dev) { 80 if (dev) {
@@ -258,7 +239,6 @@ int pm_send_all(pm_request_t rqst, void *data)
258} 239}
259 240
260EXPORT_SYMBOL(pm_register); 241EXPORT_SYMBOL(pm_register);
261EXPORT_SYMBOL(pm_unregister);
262EXPORT_SYMBOL(pm_unregister_all); 242EXPORT_SYMBOL(pm_unregister_all);
263EXPORT_SYMBOL(pm_send_all); 243EXPORT_SYMBOL(pm_send_all);
264EXPORT_SYMBOL(pm_active); 244EXPORT_SYMBOL(pm_active);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad..b2a5f671d6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
26 (p->flags & PF_NOFREEZE) || 26 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 27 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 28 (p->exit_state == EXIT_DEAD) ||
29 (p->state == TASK_STOPPED) || 29 (p->state == TASK_STOPPED))
30 (p->state == TASK_TRACED))
31 return 0; 30 return 0;
32 return 1; 31 return 1;
33} 32}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c5863d02c8..3eeedbb13b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -240,14 +240,15 @@ static void copy_data_pages(struct pbe *pblist)
240 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
241 */ 241 */
242 242
243static void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
244{ 244{
245 struct pbe *pbe; 245 struct pbe *pbe;
246 246
247 while (pblist) { 247 while (pblist) {
248 pbe = (pblist + PB_PAGE_SKIP)->next; 248 pbe = (pblist + PB_PAGE_SKIP)->next;
249 ClearPageNosave(virt_to_page(pblist)); 249 ClearPageNosave(virt_to_page(pblist));
250 ClearPageNosaveFree(virt_to_page(pblist)); 250 if (clear_nosave_free)
251 ClearPageNosaveFree(virt_to_page(pblist));
251 free_page((unsigned long)pblist); 252 free_page((unsigned long)pblist);
252 pblist = pbe; 253 pblist = pbe;
253 } 254 }
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
389 pbe->next = alloc_image_page(gfp_mask, safe_needed); 390 pbe->next = alloc_image_page(gfp_mask, safe_needed);
390 } 391 }
391 if (!pbe) { /* get_zeroed_page() failed */ 392 if (!pbe) { /* get_zeroed_page() failed */
392 free_pagedir(pblist); 393 free_pagedir(pblist, 1);
393 pblist = NULL; 394 pblist = NULL;
394 } else 395 } else
395 create_pbe_list(pblist, nr_pages); 396 create_pbe_list(pblist, nr_pages);
@@ -736,7 +737,7 @@ static int create_image(struct snapshot_handle *handle)
736 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
737 if (pblist) 738 if (pblist)
738 copy_page_backup_list(pblist, p); 739 copy_page_backup_list(pblist, p);
739 free_pagedir(p); 740 free_pagedir(p, 0);
740 if (!pblist) 741 if (!pblist)
741 error = -ENOMEM; 742 error = -ENOMEM;
742 } 743 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 8cc19431e7..c056f33244 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -360,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
360 unsigned long cur_index, start_print; 360 unsigned long cur_index, start_print;
361 static int msg_level = -1; 361 static int msg_level = -1;
362 362
363 if (((long)(start - end)) > 0) 363 BUG_ON(((long)(start - end)) > 0);
364 BUG();
365 364
366 cur_index = start; 365 cur_index = start;
367 start_print = start; 366 start_print = start;
@@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
708 */ 707 */
709void acquire_console_sem(void) 708void acquire_console_sem(void)
710{ 709{
711 if (in_interrupt()) 710 BUG_ON(in_interrupt());
712 BUG();
713 down(&console_sem); 711 down(&console_sem);
714 console_locked = 1; 712 console_locked = 1;
715 console_may_schedule = 1; 713 console_may_schedule = 1;
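The printk.c changes, and several of the same kind below in ptrace.c and signal.c, are mechanical conversions of the two-line "if (cond) BUG();" form to the equivalent one-line assertion:

	BUG_ON(in_interrupt());		/* was: if (in_interrupt()) BUG(); */

Behaviour is unchanged; the invariant is just stated more directly.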
diff --git a/kernel/profile.c b/kernel/profile.c
index ad81f799a9..68afe121e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -87,72 +87,52 @@ void __init profile_init(void)
87 87
88#ifdef CONFIG_PROFILING 88#ifdef CONFIG_PROFILING
89 89
90static DECLARE_RWSEM(profile_rwsem); 90static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
91static DEFINE_RWLOCK(handoff_lock); 91static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
92static struct notifier_block * task_exit_notifier; 92static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
93static struct notifier_block * task_free_notifier;
94static struct notifier_block * munmap_notifier;
95 93
96void profile_task_exit(struct task_struct * task) 94void profile_task_exit(struct task_struct * task)
97{ 95{
98 down_read(&profile_rwsem); 96 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
99 notifier_call_chain(&task_exit_notifier, 0, task);
100 up_read(&profile_rwsem);
101} 97}
102 98
103int profile_handoff_task(struct task_struct * task) 99int profile_handoff_task(struct task_struct * task)
104{ 100{
105 int ret; 101 int ret;
106 read_lock(&handoff_lock); 102 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
107 ret = notifier_call_chain(&task_free_notifier, 0, task);
108 read_unlock(&handoff_lock);
109 return (ret == NOTIFY_OK) ? 1 : 0; 103 return (ret == NOTIFY_OK) ? 1 : 0;
110} 104}
111 105
112void profile_munmap(unsigned long addr) 106void profile_munmap(unsigned long addr)
113{ 107{
114 down_read(&profile_rwsem); 108 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
115 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
116 up_read(&profile_rwsem);
117} 109}
118 110
119int task_handoff_register(struct notifier_block * n) 111int task_handoff_register(struct notifier_block * n)
120{ 112{
121 int err = -EINVAL; 113 return atomic_notifier_chain_register(&task_free_notifier, n);
122
123 write_lock(&handoff_lock);
124 err = notifier_chain_register(&task_free_notifier, n);
125 write_unlock(&handoff_lock);
126 return err;
127} 114}
128 115
129int task_handoff_unregister(struct notifier_block * n) 116int task_handoff_unregister(struct notifier_block * n)
130{ 117{
131 int err = -EINVAL; 118 return atomic_notifier_chain_unregister(&task_free_notifier, n);
132
133 write_lock(&handoff_lock);
134 err = notifier_chain_unregister(&task_free_notifier, n);
135 write_unlock(&handoff_lock);
136 return err;
137} 119}
138 120
139int profile_event_register(enum profile_type type, struct notifier_block * n) 121int profile_event_register(enum profile_type type, struct notifier_block * n)
140{ 122{
141 int err = -EINVAL; 123 int err = -EINVAL;
142 124
143 down_write(&profile_rwsem);
144
145 switch (type) { 125 switch (type) {
146 case PROFILE_TASK_EXIT: 126 case PROFILE_TASK_EXIT:
147 err = notifier_chain_register(&task_exit_notifier, n); 127 err = blocking_notifier_chain_register(
128 &task_exit_notifier, n);
148 break; 129 break;
149 case PROFILE_MUNMAP: 130 case PROFILE_MUNMAP:
150 err = notifier_chain_register(&munmap_notifier, n); 131 err = blocking_notifier_chain_register(
132 &munmap_notifier, n);
151 break; 133 break;
152 } 134 }
153 135
154 up_write(&profile_rwsem);
155
156 return err; 136 return err;
157} 137}
158 138
@@ -161,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
161{ 141{
162 int err = -EINVAL; 142 int err = -EINVAL;
163 143
164 down_write(&profile_rwsem);
165
166 switch (type) { 144 switch (type) {
167 case PROFILE_TASK_EXIT: 145 case PROFILE_TASK_EXIT:
168 err = notifier_chain_unregister(&task_exit_notifier, n); 146 err = blocking_notifier_chain_unregister(
147 &task_exit_notifier, n);
169 break; 148 break;
170 case PROFILE_MUNMAP: 149 case PROFILE_MUNMAP:
171 err = notifier_chain_unregister(&munmap_notifier, n); 150 err = blocking_notifier_chain_unregister(
151 &munmap_notifier, n);
172 break; 152 break;
173 } 153 }
174 154
175 up_write(&profile_rwsem);
176 return err; 155 return err;
177} 156}
178 157
@@ -320,7 +299,7 @@ out:
320} 299}
321 300
322#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
323static int __devinit profile_cpu_callback(struct notifier_block *info, 302static int profile_cpu_callback(struct notifier_block *info,
324 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
325{ 304{
326 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
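profile.c picks the chain flavour per calling context: task_free_notifier becomes an ATOMIC_NOTIFIER_HEAD because profile_handoff_task() can run where sleeping is not allowed, while the task-exit and munmap chains may sleep and therefore use BLOCKING_NOTIFIER_HEAD. The external profile_event_register() interface is unchanged; an illustrative client (the callback name and body are not from this patch):

static int my_task_exit_event(struct notifier_block *nb,
			      unsigned long val, void *data)
{
	struct task_struct *task = data;

	/* ... record per-task statistics before the task goes away ... */
	return NOTIFY_OK;
}

static struct notifier_block my_exit_nb = {
	.notifier_call	= my_task_exit_event,
};

	profile_event_register(PROFILE_TASK_EXIT, &my_exit_nb);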
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d95a72c927..921c22ad16 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -30,14 +30,13 @@
30 */ 30 */
31void __ptrace_link(task_t *child, task_t *new_parent) 31void __ptrace_link(task_t *child, task_t *new_parent)
32{ 32{
33 if (!list_empty(&child->ptrace_list)) 33 BUG_ON(!list_empty(&child->ptrace_list));
34 BUG();
35 if (child->parent == new_parent) 34 if (child->parent == new_parent)
36 return; 35 return;
37 list_add(&child->ptrace_list, &child->parent->ptrace_children); 36 list_add(&child->ptrace_list, &child->parent->ptrace_children);
38 REMOVE_LINKS(child); 37 remove_parent(child);
39 child->parent = new_parent; 38 child->parent = new_parent;
40 SET_LINKS(child); 39 add_parent(child);
41} 40}
42 41
43/* 42/*
@@ -57,10 +56,6 @@ void ptrace_untrace(task_t *child)
57 signal_wake_up(child, 1); 56 signal_wake_up(child, 1);
58 } 57 }
59 } 58 }
60 if (child->signal->flags & SIGNAL_GROUP_EXIT) {
61 sigaddset(&child->pending.signal, SIGKILL);
62 signal_wake_up(child, 1);
63 }
64 spin_unlock(&child->sighand->siglock); 59 spin_unlock(&child->sighand->siglock);
65} 60}
66 61
@@ -77,12 +72,13 @@ void __ptrace_unlink(task_t *child)
77 child->ptrace = 0; 72 child->ptrace = 0;
78 if (!list_empty(&child->ptrace_list)) { 73 if (!list_empty(&child->ptrace_list)) {
79 list_del_init(&child->ptrace_list); 74 list_del_init(&child->ptrace_list);
80 REMOVE_LINKS(child); 75 remove_parent(child);
81 child->parent = child->real_parent; 76 child->parent = child->real_parent;
82 SET_LINKS(child); 77 add_parent(child);
83 } 78 }
84 79
85 ptrace_untrace(child); 80 if (child->state == TASK_TRACED)
81 ptrace_untrace(child);
86} 82}
87 83
88/* 84/*
@@ -152,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task)
152int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
153{ 149{
154 int retval; 150 int retval;
155 task_lock(task); 151
156 retval = -EPERM; 152 retval = -EPERM;
157 if (task->pid <= 1) 153 if (task->pid <= 1)
158 goto bad; 154 goto out;
159 if (task->tgid == current->tgid) 155 if (task->tgid == current->tgid)
160 goto bad; 156 goto out;
157
158repeat:
159 /*
160 * Nasty, nasty.
161 *
162 * We want to hold both the task-lock and the
163 * tasklist_lock for writing at the same time.
164 * But that's against the rules (tasklist_lock
165 * is taken for reading by interrupts on other
166 * cpu's that may have task_lock).
167 */
168 task_lock(task);
169 local_irq_disable();
170 if (!write_trylock(&tasklist_lock)) {
171 local_irq_enable();
172 task_unlock(task);
173 do {
174 cpu_relax();
175 } while (!write_can_lock(&tasklist_lock));
176 goto repeat;
177 }
178
161 /* the same process cannot be attached many times */ 179 /* the same process cannot be attached many times */
162 if (task->ptrace & PT_PTRACED) 180 if (task->ptrace & PT_PTRACED)
163 goto bad; 181 goto bad;
@@ -170,17 +188,15 @@ int ptrace_attach(struct task_struct *task)
170 ? PT_ATTACHED : 0); 188 ? PT_ATTACHED : 0);
171 if (capable(CAP_SYS_PTRACE)) 189 if (capable(CAP_SYS_PTRACE))
172 task->ptrace |= PT_PTRACE_CAP; 190 task->ptrace |= PT_PTRACE_CAP;
173 task_unlock(task);
174 191
175 write_lock_irq(&tasklist_lock);
176 __ptrace_link(task, current); 192 __ptrace_link(task, current);
177 write_unlock_irq(&tasklist_lock);
178 193
179 force_sig_specific(SIGSTOP, task); 194 force_sig_specific(SIGSTOP, task);
180 return 0;
181 195
182bad: 196bad:
197 write_unlock_irq(&tasklist_lock);
183 task_unlock(task); 198 task_unlock(task);
199out:
184 return retval; 200 return retval;
185} 201}
186 202
@@ -421,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request,
421 */ 437 */
422int ptrace_traceme(void) 438int ptrace_traceme(void)
423{ 439{
424 int ret; 440 int ret = -EPERM;
425 441
426 /* 442 /*
427 * Are we already being traced? 443 * Are we already being traced?
428 */ 444 */
429 if (current->ptrace & PT_PTRACED) 445 task_lock(current);
430 return -EPERM; 446 if (!(current->ptrace & PT_PTRACED)) {
431 ret = security_ptrace(current->parent, current); 447 ret = security_ptrace(current->parent, current);
432 if (ret) 448 /*
433 return -EPERM; 449 * Set the ptrace bit in the process ptrace flags.
434 /* 450 */
435 * Set the ptrace bit in the process ptrace flags. 451 if (!ret)
436 */ 452 current->ptrace |= PT_PTRACED;
437 current->ptrace |= PT_PTRACED; 453 }
438 return 0; 454 task_unlock(current);
455 return ret;
439} 456}
440 457
441/** 458/**
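The rewritten ptrace_attach() has to hold task_lock() and a write-locked tasklist_lock at the same time, but cannot nest them in the obvious order because, as the in-code comment explains, tasklist_lock is taken for reading from interrupt context on CPUs that may already hold a task lock. The trylock-and-back-off loop it uses is a general pattern; stripped of the ptrace specifics it looks roughly like this (the lock names are placeholders, not real kernel locks):

static void lock_both(spinlock_t *inner, rwlock_t *outer)
{
repeat:
	spin_lock(inner);			/* stands in for task_lock(task) */
	local_irq_disable();
	if (!write_trylock(outer)) {		/* stands in for tasklist_lock */
		local_irq_enable();
		spin_unlock(inner);
		do {
			cpu_relax();
		} while (!write_can_lock(outer));
		goto repeat;
	}
	/* returns with both locks held and interrupts disabled; the caller
	 * releases them in reverse order, as ptrace_attach() does above */
}

Dropping both locks before spinning keeps this from deadlocking against a CPU that already owns the rwlock and is waiting for the inner lock.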
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 13458bbaa1..2058f88c7b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -479,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
479 return 0; 479 return 0;
480} 480}
481 481
482/*
483 * Check to see if there is any immediate RCU-related work to be done
484 * by the current CPU, returning 1 if so. This function is part of the
485 * RCU implementation; it is -not- an exported member of the RCU API.
486 */
482int rcu_pending(int cpu) 487int rcu_pending(int cpu)
483{ 488{
484 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || 489 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
485 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); 490 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
486} 491}
487 492
493/*
494 * Check to see if any future RCU-related work will need to be done
495 * by the current CPU, even if none need be done immediately, returning
496 * 1 if so. This function is part of the RCU implementation; it is -not-
497 * an exported member of the RCU API.
498 */
499int rcu_needs_cpu(int cpu)
500{
501 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
502 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
503
504 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
505}
506
488void rcu_check_callbacks(int cpu, int user) 507void rcu_check_callbacks(int cpu, int user)
489{ 508{
490 if (user || 509 if (user ||
@@ -520,7 +539,7 @@ static void __devinit rcu_online_cpu(int cpu)
520 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
521} 540}
522 541
523static int __devinit rcu_cpu_notify(struct notifier_block *self, 542static int rcu_cpu_notify(struct notifier_block *self,
524 unsigned long action, void *hcpu) 543 unsigned long action, void *hcpu)
525{ 544{
526 long cpu = (long)hcpu; 545 long cpu = (long)hcpu;
@@ -537,7 +556,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
537 return NOTIFY_OK; 556 return NOTIFY_OK;
538} 557}
539 558
540static struct notifier_block __devinitdata rcu_nb = { 559static struct notifier_block rcu_nb = {
541 .notifier_call = rcu_cpu_notify, 560 .notifier_call = rcu_cpu_notify,
542}; 561};
543 562
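rcu_needs_cpu() answers a broader question than rcu_pending(): not just "is there RCU work right now" but "does this CPU still owe RCU anything", i.e. does either control block have callbacks on its current list. Presumably this exists so code that wants to put a CPU into a long sleep can check with RCU first; a hedged illustration of such a caller (not part of this patch):

	int cpu = smp_processor_id();

	if (!rcu_needs_cpu(cpu)) {
		/* neither rcu nor rcu_bh has callbacks queued on this CPU,
		 * and rcu_pending() reports no immediate work either */
	}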
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b4b362b5ba..8154e7589d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
303 303
304 for_each_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -535,7 +535,7 @@ rcu_torture_init(void)
535 atomic_set(&n_rcu_torture_error, 0); 535 atomic_set(&n_rcu_torture_error, 0);
536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
537 atomic_set(&rcu_torture_wcount[i], 0); 537 atomic_set(&rcu_torture_wcount[i], 0);
538 for_each_cpu(cpu) { 538 for_each_possible_cpu(cpu) {
539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
540 per_cpu(rcu_torture_count, cpu)[i] = 0; 540 per_cpu(rcu_torture_count, cpu)[i] = 0;
541 per_cpu(rcu_torture_batch, cpu)[i] = 0; 541 per_cpu(rcu_torture_batch, cpu)[i] = 0;
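The for_each_cpu() to for_each_possible_cpu() renames here and in sched.c below are part of a tree-wide cleanup: the iterator's name now states that it walks every possible CPU, not just the online ones, which is what is wanted when summing per-CPU counters that a since-offlined CPU may still hold. The idiom, with an illustrative counter that is not in this patch:

static DEFINE_PER_CPU(long, my_counter);	/* illustrative only */

static long my_counter_total(void)
{
	long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu(my_counter, cpu);
	return sum;
}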
diff --git a/kernel/sched.c b/kernel/sched.c
index 78acdefecc..c13f1bd2df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -145,7 +145,8 @@
145 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
146 146
147#define DELTA(p) \ 147#define DELTA(p) \
148 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
149 INTERACTIVE_DELTA)
149 150
150#define TASK_INTERACTIVE(p) \ 151#define TASK_INTERACTIVE(p) \
151 ((p)->prio <= (p)->static_prio - DELTA(p)) 152 ((p)->prio <= (p)->static_prio - DELTA(p))
@@ -666,9 +667,13 @@ static int effective_prio(task_t *p)
666/* 667/*
667 * __activate_task - move a task to the runqueue. 668 * __activate_task - move a task to the runqueue.
668 */ 669 */
669static inline void __activate_task(task_t *p, runqueue_t *rq) 670static void __activate_task(task_t *p, runqueue_t *rq)
670{ 671{
671 enqueue_task(p, rq->active); 672 prio_array_t *target = rq->active;
673
674 if (batch_task(p))
675 target = rq->expired;
676 enqueue_task(p, target);
672 rq->nr_running++; 677 rq->nr_running++;
673} 678}
674 679
@@ -687,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
687 unsigned long long __sleep_time = now - p->timestamp; 692 unsigned long long __sleep_time = now - p->timestamp;
688 unsigned long sleep_time; 693 unsigned long sleep_time;
689 694
690 if (unlikely(p->policy == SCHED_BATCH)) 695 if (batch_task(p))
691 sleep_time = 0; 696 sleep_time = 0;
692 else { 697 else {
693 if (__sleep_time > NS_MAX_SLEEP_AVG) 698 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -699,21 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
699 if (likely(sleep_time > 0)) { 704 if (likely(sleep_time > 0)) {
700 /* 705 /*
701 * User tasks that sleep a long time are categorised as 706 * User tasks that sleep a long time are categorised as
702 * idle and will get just interactive status to stay active & 707 * idle. They will only have their sleep_avg increased to a
703 * prevent them suddenly becoming cpu hogs and starving 708 * level that makes them just interactive priority to stay
704 * other processes. 709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
705 */ 711 */
706 if (p->mm && p->activated != -1 && 712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
707 sleep_time > INTERACTIVE_SLEEP(p)) { 713 unsigned long ceiling;
708 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 714
709 DEF_TIMESLICE); 715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
716 DEF_TIMESLICE);
717 if (p->sleep_avg < ceiling)
718 p->sleep_avg = ceiling;
710 } else { 719 } else {
711 /* 720 /*
712 * Tasks waking from uninterruptible sleep are 721 * Tasks waking from uninterruptible sleep are
713 * limited in their sleep_avg rise as they 722 * limited in their sleep_avg rise as they
714 * are likely to be waiting on I/O 723 * are likely to be waiting on I/O
715 */ 724 */
716 if (p->activated == -1 && p->mm) { 725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
717 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
718 sleep_time = 0; 727 sleep_time = 0;
719 else if (p->sleep_avg + sleep_time >= 728 else if (p->sleep_avg + sleep_time >=
@@ -768,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
768 * This checks to make sure it's not an uninterruptible task 777 * This checks to make sure it's not an uninterruptible task
769 * that is now waking up. 778 * that is now waking up.
770 */ 779 */
771 if (!p->activated) { 780 if (p->sleep_type == SLEEP_NORMAL) {
772 /* 781 /*
773 * Tasks which were woken up by interrupts (ie. hw events) 782 * Tasks which were woken up by interrupts (ie. hw events)
774 * are most likely of interactive nature. So we give them 783 * are most likely of interactive nature. So we give them
@@ -777,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
777 * on a CPU, first time around: 786 * on a CPU, first time around:
778 */ 787 */
779 if (in_interrupt()) 788 if (in_interrupt())
780 p->activated = 2; 789 p->sleep_type = SLEEP_INTERRUPTED;
781 else { 790 else {
782 /* 791 /*
783 * Normal first-time wakeups get a credit too for 792 * Normal first-time wakeups get a credit too for
784 * on-runqueue time, but it will be weighted down: 793 * on-runqueue time, but it will be weighted down:
785 */ 794 */
786 p->activated = 1; 795 p->sleep_type = SLEEP_INTERACTIVE;
787 } 796 }
788 } 797 }
789 p->timestamp = now; 798 p->timestamp = now;
@@ -1271,19 +1280,19 @@ out_activate:
1271 * Tasks on involuntary sleep don't earn 1280 * Tasks on involuntary sleep don't earn
1272 * sleep_avg beyond just interactive state. 1281 * sleep_avg beyond just interactive state.
1273 */ 1282 */
1274 p->activated = -1; 1283 p->sleep_type = SLEEP_NONINTERACTIVE;
1275 } 1284 } else
1276 1285
1277 /* 1286 /*
1278 * Tasks that have marked their sleep as noninteractive get 1287 * Tasks that have marked their sleep as noninteractive get
1279 * woken up without updating their sleep average. (i.e. their 1288 * woken up with their sleep average not weighted in an
1280 * sleep is handled in a priority-neutral manner, no priority 1289 * interactive way.
1281 * boost and no penalty.)
1282 */ 1290 */
1283 if (old_state & TASK_NONINTERACTIVE) 1291 if (old_state & TASK_NONINTERACTIVE)
1284 __activate_task(p, rq); 1292 p->sleep_type = SLEEP_NONINTERACTIVE;
1285 else 1293
1286 activate_task(p, rq, cpu == this_cpu); 1294
1295 activate_task(p, rq, cpu == this_cpu);
1287 /* 1296 /*
1288 * Sync wakeups (i.e. those types of wakeups where the waker 1297 * Sync wakeups (i.e. those types of wakeups where the waker
1289 * has indicated that it will leave the CPU in short order) 1298 * has indicated that it will leave the CPU in short order)
@@ -1624,7 +1633,7 @@ unsigned long nr_uninterruptible(void)
1624{ 1633{
1625 unsigned long i, sum = 0; 1634 unsigned long i, sum = 0;
1626 1635
1627 for_each_cpu(i) 1636 for_each_possible_cpu(i)
1628 sum += cpu_rq(i)->nr_uninterruptible; 1637 sum += cpu_rq(i)->nr_uninterruptible;
1629 1638
1630 /* 1639 /*
@@ -1641,7 +1650,7 @@ unsigned long long nr_context_switches(void)
1641{ 1650{
1642 unsigned long long i, sum = 0; 1651 unsigned long long i, sum = 0;
1643 1652
1644 for_each_cpu(i) 1653 for_each_possible_cpu(i)
1645 sum += cpu_rq(i)->nr_switches; 1654 sum += cpu_rq(i)->nr_switches;
1646 1655
1647 return sum; 1656 return sum;
@@ -1651,12 +1660,27 @@ unsigned long nr_iowait(void)
1651{ 1660{
1652 unsigned long i, sum = 0; 1661 unsigned long i, sum = 0;
1653 1662
1654 for_each_cpu(i) 1663 for_each_possible_cpu(i)
1655 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1664 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1656 1665
1657 return sum; 1666 return sum;
1658} 1667}
1659 1668
1669unsigned long nr_active(void)
1670{
1671 unsigned long i, running = 0, uninterruptible = 0;
1672
1673 for_each_online_cpu(i) {
1674 running += cpu_rq(i)->nr_running;
1675 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1676 }
1677
1678 if (unlikely((long)uninterruptible < 0))
1679 uninterruptible = 0;
1680
1681 return running + uninterruptible;
1682}
1683
1660#ifdef CONFIG_SMP 1684#ifdef CONFIG_SMP
1661 1685
1662/* 1686/*
@@ -2859,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2859 2883
2860#endif 2884#endif
2861 2885
2886static inline int interactive_sleep(enum sleep_type sleep_type)
2887{
2888 return (sleep_type == SLEEP_INTERACTIVE ||
2889 sleep_type == SLEEP_INTERRUPTED);
2890}
2891
2862/* 2892/*
2863 * schedule() is the main scheduler function. 2893 * schedule() is the main scheduler function.
2864 */ 2894 */
@@ -2878,13 +2908,11 @@ asmlinkage void __sched schedule(void)
2878 * schedule() atomically, we ignore that path for now. 2908 * schedule() atomically, we ignore that path for now.
2879 * Otherwise, whine if we are scheduling when we should not be. 2909 * Otherwise, whine if we are scheduling when we should not be.
2880 */ 2910 */
2881 if (likely(!current->exit_state)) { 2911 if (unlikely(in_atomic() && !current->exit_state)) {
2882 if (unlikely(in_atomic())) { 2912 printk(KERN_ERR "BUG: scheduling while atomic: "
2883 printk(KERN_ERR "BUG: scheduling while atomic: " 2913 "%s/0x%08x/%d\n",
2884 "%s/0x%08x/%d\n", 2914 current->comm, preempt_count(), current->pid);
2885 current->comm, preempt_count(), current->pid); 2915 dump_stack();
2886 dump_stack();
2887 }
2888 } 2916 }
2889 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2917 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2890 2918
@@ -2984,12 +3012,12 @@ go_idle:
2984 queue = array->queue + idx; 3012 queue = array->queue + idx;
2985 next = list_entry(queue->next, task_t, run_list); 3013 next = list_entry(queue->next, task_t, run_list);
2986 3014
2987 if (!rt_task(next) && next->activated > 0) { 3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
2988 unsigned long long delta = now - next->timestamp; 3016 unsigned long long delta = now - next->timestamp;
2989 if (unlikely((long long)(now - next->timestamp) < 0)) 3017 if (unlikely((long long)(now - next->timestamp) < 0))
2990 delta = 0; 3018 delta = 0;
2991 3019
2992 if (next->activated == 1) 3020 if (next->sleep_type == SLEEP_INTERACTIVE)
2993 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3021 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2994 3022
2995 array = next->array; 3023 array = next->array;
@@ -2999,10 +3027,9 @@ go_idle:
2999 dequeue_task(next, array); 3027 dequeue_task(next, array);
3000 next->prio = new_prio; 3028 next->prio = new_prio;
3001 enqueue_task(next, array); 3029 enqueue_task(next, array);
3002 } else 3030 }
3003 requeue_task(next, array);
3004 } 3031 }
3005 next->activated = 0; 3032 next->sleep_type = SLEEP_NORMAL;
3006switch_tasks: 3033switch_tasks:
3007 if (next == rq->idle) 3034 if (next == rq->idle)
3008 schedstat_inc(rq, sched_goidle); 3035 schedstat_inc(rq, sched_goidle);
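The scheduler hunks replace the bare integer p->activated (-1, 0, 1, 2) with a descriptive p->sleep_type. Judging from the call sites above, the enum added to the headers (which sit outside this kernel/ diff) presumably reads roughly:

enum sleep_type {
	SLEEP_NORMAL,		/* was activated == 0  */
	SLEEP_NONINTERACTIVE,	/* was activated == -1 */
	SLEEP_INTERACTIVE,	/* was activated == 1  */
	SLEEP_INTERRUPTED,	/* was activated == 2  */
};

so interactive_sleep() simply tests for the last two states instead of "activated > 0".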
@@ -4761,7 +4788,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4761/* Register at highest priority so that task migration (migrate_all_tasks) 4788/* Register at highest priority so that task migration (migrate_all_tasks)
4762 * happens before everything else. 4789 * happens before everything else.
4763 */ 4790 */
4764static struct notifier_block __devinitdata migration_notifier = { 4791static struct notifier_block migration_notifier = {
4765 .notifier_call = migration_call, 4792 .notifier_call = migration_call,
4766 .priority = 10 4793 .priority = 10
4767}; 4794};
@@ -5575,11 +5602,31 @@ static int cpu_to_cpu_group(int cpu)
5575} 5602}
5576#endif 5603#endif
5577 5604
5605#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS];
5608#endif
5609
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5611static int cpu_to_core_group(int cpu)
5612{
5613 return first_cpu(cpu_sibling_map[cpu]);
5614}
5615#elif defined(CONFIG_SCHED_MC)
5616static int cpu_to_core_group(int cpu)
5617{
5618 return cpu;
5619}
5620#endif
5621
5578static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5622static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5579static struct sched_group sched_group_phys[NR_CPUS]; 5623static struct sched_group sched_group_phys[NR_CPUS];
5580static int cpu_to_phys_group(int cpu) 5624static int cpu_to_phys_group(int cpu)
5581{ 5625{
5582#ifdef CONFIG_SCHED_SMT 5626#if defined(CONFIG_SCHED_MC)
5627 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT)
5583 return first_cpu(cpu_sibling_map[cpu]); 5630 return first_cpu(cpu_sibling_map[cpu]);
5584#else 5631#else
5585 return cpu; 5632 return cpu;
@@ -5602,6 +5649,32 @@ static int cpu_to_allnodes_group(int cpu)
5602{ 5649{
5603 return cpu_to_node(cpu); 5650 return cpu_to_node(cpu);
5604} 5651}
5652static void init_numa_sched_groups_power(struct sched_group *group_head)
5653{
5654 struct sched_group *sg = group_head;
5655 int j;
5656
5657 if (!sg)
5658 return;
5659next_sg:
5660 for_each_cpu_mask(j, sg->cpumask) {
5661 struct sched_domain *sd;
5662
5663 sd = &per_cpu(phys_domains, j);
5664 if (j != first_cpu(sd->groups->cpumask)) {
5665 /*
5666 * Only add "power" once for each
5667 * physical package.
5668 */
5669 continue;
5670 }
5671
5672 sg->cpu_power += sd->groups->cpu_power;
5673 }
5674 sg = sg->next;
5675 if (sg != group_head)
5676 goto next_sg;
5677}
5605#endif 5678#endif
5606 5679
5607/* 5680/*
@@ -5677,6 +5750,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5677 sd->parent = p; 5750 sd->parent = p;
5678 sd->groups = &sched_group_phys[group]; 5751 sd->groups = &sched_group_phys[group];
5679 5752
5753#ifdef CONFIG_SCHED_MC
5754 p = sd;
5755 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i);
5757 *sd = SD_MC_INIT;
5758 sd->span = cpu_coregroup_map(i);
5759 cpus_and(sd->span, sd->span, *cpu_map);
5760 sd->parent = p;
5761 sd->groups = &sched_group_core[group];
5762#endif
5763
5680#ifdef CONFIG_SCHED_SMT 5764#ifdef CONFIG_SCHED_SMT
5681 p = sd; 5765 p = sd;
5682 sd = &per_cpu(cpu_domains, i); 5766 sd = &per_cpu(cpu_domains, i);
@@ -5702,6 +5786,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5702 } 5786 }
5703#endif 5787#endif
5704 5788
5789#ifdef CONFIG_SCHED_MC
5790 /* Set up multi-core groups */
5791 for_each_cpu_mask(i, *cpu_map) {
5792 cpumask_t this_core_map = cpu_coregroup_map(i);
5793 cpus_and(this_core_map, this_core_map, *cpu_map);
5794 if (i != first_cpu(this_core_map))
5795 continue;
5796 init_sched_build_groups(sched_group_core, this_core_map,
5797 &cpu_to_core_group);
5798 }
5799#endif
5800
5801
5705 /* Set up physical groups */ 5802 /* Set up physical groups */
5706 for (i = 0; i < MAX_NUMNODES; i++) { 5803 for (i = 0; i < MAX_NUMNODES; i++) {
5707 cpumask_t nodemask = node_to_cpumask(i); 5804 cpumask_t nodemask = node_to_cpumask(i);
@@ -5798,51 +5895,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
5798 power = SCHED_LOAD_SCALE; 5895 power = SCHED_LOAD_SCALE;
5799 sd->groups->cpu_power = power; 5896 sd->groups->cpu_power = power;
5800#endif 5897#endif
5898#ifdef CONFIG_SCHED_MC
5899 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power;
5801 5903
5802 sd = &per_cpu(phys_domains, i); 5904 sd = &per_cpu(phys_domains, i);
5905
5906 /*
5907 * This has to be < 2 * SCHED_LOAD_SCALE
5908 * Lets keep it SCHED_LOAD_SCALE, so that
5909 * while calculating NUMA group's cpu_power
5910 * we can simply do
5911 * numa_group->cpu_power += phys_group->cpu_power;
5912 *
5913 * See "only add power once for each physical pkg"
5914 * comment below
5915 */
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else
5918 sd = &per_cpu(phys_domains, i);
5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5804 (cpus_weight(sd->groups->cpumask)-1) / 10; 5920 (cpus_weight(sd->groups->cpumask)-1) / 10;
5805 sd->groups->cpu_power = power; 5921 sd->groups->cpu_power = power;
5806
5807#ifdef CONFIG_NUMA
5808 sd = &per_cpu(allnodes_domains, i);
5809 if (sd->groups) {
5810 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5811 (cpus_weight(sd->groups->cpumask)-1) / 10;
5812 sd->groups->cpu_power = power;
5813 }
5814#endif 5922#endif
5815 } 5923 }
5816 5924
5817#ifdef CONFIG_NUMA 5925#ifdef CONFIG_NUMA
5818 for (i = 0; i < MAX_NUMNODES; i++) { 5926 for (i = 0; i < MAX_NUMNODES; i++)
5819 struct sched_group *sg = sched_group_nodes[i]; 5927 init_numa_sched_groups_power(sched_group_nodes[i]);
5820 int j;
5821 5928
5822 if (sg == NULL) 5929 init_numa_sched_groups_power(sched_group_allnodes);
5823 continue;
5824next_sg:
5825 for_each_cpu_mask(j, sg->cpumask) {
5826 struct sched_domain *sd;
5827 int power;
5828
5829 sd = &per_cpu(phys_domains, j);
5830 if (j != first_cpu(sd->groups->cpumask)) {
5831 /*
5832 * Only add "power" once for each
5833 * physical package.
5834 */
5835 continue;
5836 }
5837 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5838 (cpus_weight(sd->groups->cpumask)-1) / 10;
5839
5840 sg->cpu_power += power;
5841 }
5842 sg = sg->next;
5843 if (sg != sched_group_nodes[i])
5844 goto next_sg;
5845 }
5846#endif 5930#endif
5847 5931
5848 /* Attach the domains */ 5932 /* Attach the domains */
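With CONFIG_SCHED_MC the core-level domain scales cpu_power by the number of sibling CPUs, while the physical level is pinned at exactly SCHED_LOAD_SCALE so the NUMA code can just accumulate it. As a worked example, assuming SCHED_LOAD_SCALE is 128 and a package with two cores:

	/* core domain, 2 CPUs in the group */
	power = SCHED_LOAD_SCALE
		+ (cpus_weight(sd->groups->cpumask) - 1) * SCHED_LOAD_SCALE / 10;
	/* = 128 + (2 - 1) * 128 / 10 = 128 + 12 = 140 */

	/* physical domain under SCHED_MC: kept at exactly SCHED_LOAD_SCALE */
	sd->groups->cpu_power = SCHED_LOAD_SCALE;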
@@ -5850,6 +5934,8 @@ next_sg:
5850 struct sched_domain *sd; 5934 struct sched_domain *sd;
5851#ifdef CONFIG_SCHED_SMT 5935#ifdef CONFIG_SCHED_SMT
5852 sd = &per_cpu(cpu_domains, i); 5936 sd = &per_cpu(cpu_domains, i);
5937#elif defined(CONFIG_SCHED_MC)
5938 sd = &per_cpu(core_domains, i);
5853#else 5939#else
5854 sd = &per_cpu(phys_domains, i); 5940 sd = &per_cpu(phys_domains, i);
5855#endif 5941#endif
@@ -6022,7 +6108,7 @@ void __init sched_init(void)
6022 runqueue_t *rq; 6108 runqueue_t *rq;
6023 int i, j, k; 6109 int i, j, k;
6024 6110
6025 for_each_cpu(i) { 6111 for_each_possible_cpu(i) {
6026 prio_array_t *array; 6112 prio_array_t *array;
6027 6113
6028 rq = cpu_rq(i); 6114 rq = cpu_rq(i);
diff --git a/kernel/signal.c b/kernel/signal.c
index 75f7341b0c..e5f8aea78f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <linux/signal.h> 25#include <linux/signal.h>
27#include <linux/audit.h> 26#include <linux/audit.h>
28#include <linux/capability.h> 27#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
147#define sig_kernel_stop(sig) \ 146#define sig_kernel_stop(sig) \
148 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) 147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
149 148
149#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
150
150#define sig_user_defined(t, signr) \ 151#define sig_user_defined(t, signr) \
151 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ 152 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
152 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) 153 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
292 kmem_cache_free(sigqueue_cachep, q); 293 kmem_cache_free(sigqueue_cachep, q);
293} 294}
294 295
295static void flush_sigqueue(struct sigpending *queue) 296void flush_sigqueue(struct sigpending *queue)
296{ 297{
297 struct sigqueue *q; 298 struct sigqueue *q;
298 299
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
307/* 308/*
308 * Flush all pending signals for a task. 309 * Flush all pending signals for a task.
309 */ 310 */
310 311void flush_signals(struct task_struct *t)
311void
312flush_signals(struct task_struct *t)
313{ 312{
314 unsigned long flags; 313 unsigned long flags;
315 314
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
321} 320}
322 321
323/* 322/*
324 * This function expects the tasklist_lock write-locked.
325 */
326void __exit_sighand(struct task_struct *tsk)
327{
328 struct sighand_struct * sighand = tsk->sighand;
329
330 /* Ok, we're done with the signal handlers */
331 tsk->sighand = NULL;
332 if (atomic_dec_and_test(&sighand->count))
333 sighand_free(sighand);
334}
335
336void exit_sighand(struct task_struct *tsk)
337{
338 write_lock_irq(&tasklist_lock);
339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
347 write_unlock_irq(&tasklist_lock);
348}
349
350/*
351 * This function expects the tasklist_lock write-locked.
352 */
353void __exit_signal(struct task_struct *tsk)
354{
355 struct signal_struct * sig = tsk->signal;
356 struct sighand_struct * sighand;
357
358 if (!sig)
359 BUG();
360 if (!atomic_read(&sig->count))
361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
364 spin_lock(&sighand->siglock);
365 posix_cpu_timers_exit(tsk);
366 if (atomic_dec_and_test(&sig->count)) {
367 posix_cpu_timers_exit_group(tsk);
368 tsk->signal = NULL;
369 __exit_sighand(tsk);
370 spin_unlock(&sighand->siglock);
371 flush_sigqueue(&sig->shared_pending);
372 } else {
373 /*
374 * If there is any task waiting for the group exit
375 * then notify it:
376 */
377 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
378 wake_up_process(sig->group_exit_task);
379 sig->group_exit_task = NULL;
380 }
381 if (tsk == sig->curr_target)
382 sig->curr_target = next_thread(tsk);
383 tsk->signal = NULL;
384 /*
385 * Accumulate here the counters for all threads but the
386 * group leader as they die, so they can be added into
387 * the process-wide totals when those are taken.
388 * The group leader stays around as a zombie as long
389 * as there are other threads. When it gets reaped,
390 * the exit.c code will add its counts into these totals.
391 * We won't ever get here for the group leader, since it
392 * will have been the last reference on the signal_struct.
393 */
394 sig->utime = cputime_add(sig->utime, tsk->utime);
395 sig->stime = cputime_add(sig->stime, tsk->stime);
396 sig->min_flt += tsk->min_flt;
397 sig->maj_flt += tsk->maj_flt;
398 sig->nvcsw += tsk->nvcsw;
399 sig->nivcsw += tsk->nivcsw;
400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
402 spin_unlock(&sighand->siglock);
403 sig = NULL; /* Marker for below. */
404 }
405 rcu_read_unlock();
406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
407 flush_sigqueue(&tsk->pending);
408 if (sig) {
409 /*
410 * We are cleaning up the signal_struct here.
411 */
412 exit_thread_group_keys(sig);
413 kmem_cache_free(signal_cachep, sig);
414 }
415}
416
417void exit_signal(struct task_struct *tsk)
418{
419 atomic_dec(&tsk->signal->live);
420
421 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock);
424}
425
426/*
427 * Flush all handlers for a task. 323 * Flush all handlers for a task.
428 */ 324 */
429 325
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
695} 591}
696 592
697/* forward decl */ 593/* forward decl */
698static void do_notify_parent_cldstop(struct task_struct *tsk, 594static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
699 int to_self,
700 int why);
701 595
702/* 596/*
703 * Handle magic process-wide effects of stop/continue signals. 597 * Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
747 p->signal->group_stop_count = 0; 641 p->signal->group_stop_count = 0;
748 p->signal->flags = SIGNAL_STOP_CONTINUED; 642 p->signal->flags = SIGNAL_STOP_CONTINUED;
749 spin_unlock(&p->sighand->siglock); 643 spin_unlock(&p->sighand->siglock);
750 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); 644 do_notify_parent_cldstop(p, CLD_STOPPED);
751 spin_lock(&p->sighand->siglock); 645 spin_lock(&p->sighand->siglock);
752 } 646 }
753 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 647 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
788 p->signal->flags = SIGNAL_STOP_CONTINUED; 682 p->signal->flags = SIGNAL_STOP_CONTINUED;
789 p->signal->group_exit_code = 0; 683 p->signal->group_exit_code = 0;
790 spin_unlock(&p->sighand->siglock); 684 spin_unlock(&p->sighand->siglock);
791 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); 685 do_notify_parent_cldstop(p, CLD_CONTINUED);
792 spin_lock(&p->sighand->siglock); 686 spin_lock(&p->sighand->siglock);
793 } else { 687 } else {
794 /* 688 /*
@@ -875,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
875{ 769{
876 int ret = 0; 770 int ret = 0;
877 771
878 if (!irqs_disabled()) 772 BUG_ON(!irqs_disabled());
879 BUG();
880 assert_spin_locked(&t->sighand->siglock); 773 assert_spin_locked(&t->sighand->siglock);
881 774
882 /* Short-circuit ignored signals. */ 775 /* Short-circuit ignored signals. */
@@ -975,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p)
975 if (t == NULL) 868 if (t == NULL)
976 /* restart balancing at this thread */ 869 /* restart balancing at this thread */
977 t = p->signal->curr_target = p; 870 t = p->signal->curr_target = p;
978 BUG_ON(t->tgid != p->tgid);
979 871
980 while (!wants_signal(sig, t)) { 872 while (!wants_signal(sig, t)) {
981 t = next_thread(t); 873 t = next_thread(t);
@@ -1120,27 +1012,37 @@ void zap_other_threads(struct task_struct *p)
1120/* 1012/*
1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1013 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1122 */ 1014 */
1015struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1016{
1017 struct sighand_struct *sighand;
1018
1019 for (;;) {
1020 sighand = rcu_dereference(tsk->sighand);
1021 if (unlikely(sighand == NULL))
1022 break;
1023
1024 spin_lock_irqsave(&sighand->siglock, *flags);
1025 if (likely(sighand == tsk->sighand))
1026 break;
1027 spin_unlock_irqrestore(&sighand->siglock, *flags);
1028 }
1029
1030 return sighand;
1031}
1032
1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1033int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1124{ 1034{
1125 unsigned long flags; 1035 unsigned long flags;
1126 struct sighand_struct *sp;
1127 int ret; 1036 int ret;
1128 1037
1129retry:
1130 ret = check_kill_permission(sig, info, p); 1038 ret = check_kill_permission(sig, info, p);
1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) { 1039
1132 spin_lock_irqsave(&sp->siglock, flags); 1040 if (!ret && sig) {
1133 if (p->sighand != sp) { 1041 ret = -ESRCH;
1134 spin_unlock_irqrestore(&sp->siglock, flags); 1042 if (lock_task_sighand(p, &flags)) {
1135 goto retry; 1043 ret = __group_send_sig_info(sig, info, p);
1136 } 1044 unlock_task_sighand(p, &flags);
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 } 1045 }
1142 ret = __group_send_sig_info(sig, info, p);
1143 spin_unlock_irqrestore(&sp->siglock, flags);
1144 } 1046 }
1145 1047
1146 return ret; 1048 return ret;
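The hunk above replaces the open-coded retry loop in group_send_sig_info() with the new lock_task_sighand() helper, which pins tsk->sighand against a concurrent exec() or exit while taking siglock. A minimal sketch of the calling pattern, assuming unlock_task_sighand() is the matching spin_unlock_irqrestore() wrapper added alongside it (not visible in these hunks) and that both end up declared in <linux/sched.h>; the helper name touch_task_signals is illustrative only:

#include <linux/sched.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

/* Illustrative caller, not part of the patch. */
static int touch_task_signals(struct task_struct *tsk)
{
        unsigned long flags;
        int ret = -ESRCH;

        rcu_read_lock();                /* per the comment above: rcu_read_lock() or tasklist_lock */
        if (lock_task_sighand(tsk, &flags)) {
                /* tsk->sighand cannot change under us; siglock is held. */
                ret = 0;                /* ... operate on tsk->signal / tsk->sighand ... */
                unlock_task_sighand(tsk, &flags);
        }
        rcu_read_unlock();
        return ret;
}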
@@ -1189,7 +1091,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1189 struct task_struct *p; 1091 struct task_struct *p;
1190 1092
1191 rcu_read_lock(); 1093 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { 1094 if (unlikely(sig_needs_tasklist(sig))) {
1193 read_lock(&tasklist_lock); 1095 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1; 1096 acquired_tasklist_lock = 1;
1195 } 1097 }
@@ -1405,12 +1307,10 @@ void sigqueue_free(struct sigqueue *q)
1405 __sigqueue_free(q); 1307 __sigqueue_free(q);
1406} 1308}
1407 1309
1408int 1310int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1409send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1410{ 1311{
1411 unsigned long flags; 1312 unsigned long flags;
1412 int ret = 0; 1313 int ret = 0;
1413 struct sighand_struct *sh;
1414 1314
1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1315 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1416 1316
@@ -1424,48 +1324,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1424 */ 1324 */
1425 rcu_read_lock(); 1325 rcu_read_lock();
1426 1326
1427 if (unlikely(p->flags & PF_EXITING)) { 1327 if (!likely(lock_task_sighand(p, &flags))) {
1428 ret = -1; 1328 ret = -1;
1429 goto out_err; 1329 goto out_err;
1430 } 1330 }
1431 1331
1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1461
1462 if (unlikely(!list_empty(&q->list))) { 1332 if (unlikely(!list_empty(&q->list))) {
1463 /* 1333 /*
1464 * If an SI_TIMER entry is already queue just increment 1334 * If an SI_TIMER entry is already queue just increment
1465 * the overrun count. 1335 * the overrun count.
1466 */ 1336 */
1467 if (q->info.si_code != SI_TIMER) 1337 BUG_ON(q->info.si_code != SI_TIMER);
1468 BUG();
1469 q->info.si_overrun++; 1338 q->info.si_overrun++;
1470 goto out; 1339 goto out;
1471 } 1340 }
@@ -1481,7 +1350,7 @@ retry:
1481 signal_wake_up(p, sig == SIGKILL); 1350 signal_wake_up(p, sig == SIGKILL);
1482 1351
1483out: 1352out:
1484 spin_unlock_irqrestore(&sh->siglock, flags); 1353 unlock_task_sighand(p, &flags);
1485out_err: 1354out_err:
1486 rcu_read_unlock(); 1355 rcu_read_unlock();
1487 1356
@@ -1513,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1513 * the overrun count. Other uses should not try to 1382 * the overrun count. Other uses should not try to
1514 * send the signal multiple times. 1383 * send the signal multiple times.
1515 */ 1384 */
1516 if (q->info.si_code != SI_TIMER) 1385 BUG_ON(q->info.si_code != SI_TIMER);
1517 BUG();
1518 q->info.si_overrun++; 1386 q->info.si_overrun++;
1519 goto out; 1387 goto out;
1520 } 1388 }
@@ -1613,14 +1481,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1613 spin_unlock_irqrestore(&psig->siglock, flags); 1481 spin_unlock_irqrestore(&psig->siglock, flags);
1614} 1482}
1615 1483
1616static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) 1484static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1617{ 1485{
1618 struct siginfo info; 1486 struct siginfo info;
1619 unsigned long flags; 1487 unsigned long flags;
1620 struct task_struct *parent; 1488 struct task_struct *parent;
1621 struct sighand_struct *sighand; 1489 struct sighand_struct *sighand;
1622 1490
1623 if (to_self) 1491 if (tsk->ptrace & PT_PTRACED)
1624 parent = tsk->parent; 1492 parent = tsk->parent;
1625 else { 1493 else {
1626 tsk = tsk->group_leader; 1494 tsk = tsk->group_leader;
@@ -1689,13 +1557,14 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1689 /* Let the debugger run. */ 1557 /* Let the debugger run. */
1690 set_current_state(TASK_TRACED); 1558 set_current_state(TASK_TRACED);
1691 spin_unlock_irq(&current->sighand->siglock); 1559 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze();
1692 read_lock(&tasklist_lock); 1561 read_lock(&tasklist_lock);
1693 if (likely(current->ptrace & PT_PTRACED) && 1562 if (likely(current->ptrace & PT_PTRACED) &&
1694 likely(current->parent != current->real_parent || 1563 likely(current->parent != current->real_parent ||
1695 !(current->ptrace & PT_ATTACHED)) && 1564 !(current->ptrace & PT_ATTACHED)) &&
1696 (likely(current->parent->signal != current->signal) || 1565 (likely(current->parent->signal != current->signal) ||
1697 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1698 do_notify_parent_cldstop(current, 1, CLD_TRAPPED); 1567 do_notify_parent_cldstop(current, CLD_TRAPPED);
1699 read_unlock(&tasklist_lock); 1568 read_unlock(&tasklist_lock);
1700 schedule(); 1569 schedule();
1701 } else { 1570 } else {
@@ -1744,25 +1613,17 @@ void ptrace_notify(int exit_code)
1744static void 1613static void
1745finish_stop(int stop_count) 1614finish_stop(int stop_count)
1746{ 1615{
1747 int to_self;
1748
1749 /* 1616 /*
1750 * If there are no other threads in the group, or if there is 1617 * If there are no other threads in the group, or if there is
1751 * a group stop in progress and we are the last to stop, 1618 * a group stop in progress and we are the last to stop,
1752 * report to the parent. When ptraced, every thread reports itself. 1619 * report to the parent. When ptraced, every thread reports itself.
1753 */ 1620 */
1754 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) 1621 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
1755 to_self = 1; 1622 read_lock(&tasklist_lock);
1756 else if (stop_count == 0) 1623 do_notify_parent_cldstop(current, CLD_STOPPED);
1757 to_self = 0; 1624 read_unlock(&tasklist_lock);
1758 else 1625 }
1759 goto out;
1760
1761 read_lock(&tasklist_lock);
1762 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1763 read_unlock(&tasklist_lock);
1764 1626
1765out:
1766 schedule(); 1627 schedule();
1767 /* 1628 /*
1768 * Now we don't run again until continued. 1629 * Now we don't run again until continued.
@@ -1776,12 +1637,10 @@ out:
1776 * Returns nonzero if we've actually stopped and released the siglock. 1637 * Returns nonzero if we've actually stopped and released the siglock.
1777 * Returns zero if we didn't stop and still hold the siglock. 1638 * Returns zero if we didn't stop and still hold the siglock.
1778 */ 1639 */
1779static int 1640static int do_signal_stop(int signr)
1780do_signal_stop(int signr)
1781{ 1641{
1782 struct signal_struct *sig = current->signal; 1642 struct signal_struct *sig = current->signal;
1783 struct sighand_struct *sighand = current->sighand; 1643 int stop_count;
1784 int stop_count = -1;
1785 1644
1786 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) 1645 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1787 return 0; 1646 return 0;
@@ -1791,86 +1650,37 @@ do_signal_stop(int signr)
1791 * There is a group stop in progress. We don't need to 1650 * There is a group stop in progress. We don't need to
1792 * start another one. 1651 * start another one.
1793 */ 1652 */
1794 signr = sig->group_exit_code;
1795 stop_count = --sig->group_stop_count; 1653 stop_count = --sig->group_stop_count;
1796 current->exit_code = signr; 1654 } else {
1797 set_current_state(TASK_STOPPED);
1798 if (stop_count == 0)
1799 sig->flags = SIGNAL_STOP_STOPPED;
1800 spin_unlock_irq(&sighand->siglock);
1801 }
1802 else if (thread_group_empty(current)) {
1803 /*
1804 * Lock must be held through transition to stopped state.
1805 */
1806 current->exit_code = current->signal->group_exit_code = signr;
1807 set_current_state(TASK_STOPPED);
1808 sig->flags = SIGNAL_STOP_STOPPED;
1809 spin_unlock_irq(&sighand->siglock);
1810 }
1811 else {
1812 /* 1655 /*
1813 * There is no group stop already in progress. 1656 * There is no group stop already in progress.
1814 * We must initiate one now, but that requires 1657 * We must initiate one now.
1815 * dropping siglock to get both the tasklist lock
1816 * and siglock again in the proper order. Note that
1817 * this allows an intervening SIGCONT to be posted.
1818 * We need to check for that and bail out if necessary.
1819 */ 1658 */
1820 struct task_struct *t; 1659 struct task_struct *t;
1821 1660
1822 spin_unlock_irq(&sighand->siglock); 1661 sig->group_exit_code = signr;
1823 1662
1824 /* signals can be posted during this window */ 1663 stop_count = 0;
1825 1664 for (t = next_thread(current); t != current; t = next_thread(t))
1826 read_lock(&tasklist_lock);
1827 spin_lock_irq(&sighand->siglock);
1828
1829 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
1830 /* 1665 /*
1831 * Another stop or continue happened while we 1666 * Setting state to TASK_STOPPED for a group
1832 * didn't have the lock. We can just swallow this 1667 * stop is always done with the siglock held,
1833 * signal now. If we raced with a SIGCONT, that 1668 * so this check has no races.
1834 * should have just cleared it now. If we raced
1835 * with another processor delivering a stop signal,
1836 * then the SIGCONT that wakes us up should clear it.
1837 */ 1669 */
1838 read_unlock(&tasklist_lock); 1670 if (!t->exit_state &&
1839 return 0; 1671 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1840 } 1672 stop_count++;
1841 1673 signal_wake_up(t, 0);
1842 if (sig->group_stop_count == 0) { 1674 }
1843 sig->group_exit_code = signr; 1675 sig->group_stop_count = stop_count;
1844 stop_count = 0;
1845 for (t = next_thread(current); t != current;
1846 t = next_thread(t))
1847 /*
1848 * Setting state to TASK_STOPPED for a group
1849 * stop is always done with the siglock held,
1850 * so this check has no races.
1851 */
1852 if (!t->exit_state &&
1853 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1854 stop_count++;
1855 signal_wake_up(t, 0);
1856 }
1857 sig->group_stop_count = stop_count;
1858 }
1859 else {
1860 /* A race with another thread while unlocked. */
1861 signr = sig->group_exit_code;
1862 stop_count = --sig->group_stop_count;
1863 }
1864
1865 current->exit_code = signr;
1866 set_current_state(TASK_STOPPED);
1867 if (stop_count == 0)
1868 sig->flags = SIGNAL_STOP_STOPPED;
1869
1870 spin_unlock_irq(&sighand->siglock);
1871 read_unlock(&tasklist_lock);
1872 } 1676 }
1873 1677
1678 if (stop_count == 0)
1679 sig->flags = SIGNAL_STOP_STOPPED;
1680 current->exit_code = sig->group_exit_code;
1681 __set_current_state(TASK_STOPPED);
1682
1683 spin_unlock_irq(&current->sighand->siglock);
1874 finish_stop(stop_count); 1684 finish_stop(stop_count);
1875 return 1; 1685 return 1;
1876} 1686}
@@ -1944,9 +1754,9 @@ relock:
1944 /* Let the debugger run. */ 1754 /* Let the debugger run. */
1945 ptrace_stop(signr, signr, info); 1755 ptrace_stop(signr, signr, info);
1946 1756
1947 /* We're back. Did the debugger cancel the sig or group_exit? */ 1757 /* We're back. Did the debugger cancel the sig? */
1948 signr = current->exit_code; 1758 signr = current->exit_code;
1949 if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) 1759 if (signr == 0)
1950 continue; 1760 continue;
1951 1761
1952 current->exit_code = 0; 1762 current->exit_code = 0;
@@ -1990,7 +1800,7 @@ relock:
1990 continue; 1800 continue;
1991 1801
1992 /* Init gets no signals it doesn't want. */ 1802 /* Init gets no signals it doesn't want. */
1993 if (current->pid == 1) 1803 if (current == child_reaper)
1994 continue; 1804 continue;
1995 1805
1996 if (sig_kernel_stop(signr)) { 1806 if (sig_kernel_stop(signr)) {
@@ -2430,8 +2240,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2430 return kill_proc_info(sig, &info, pid); 2240 return kill_proc_info(sig, &info, pid);
2431} 2241}
2432 2242
2433int 2243int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2434do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2435{ 2244{
2436 struct k_sigaction *k; 2245 struct k_sigaction *k;
2437 sigset_t mask; 2246 sigset_t mask;
@@ -2457,6 +2266,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2457 if (act) { 2266 if (act) {
2458 sigdelsetmask(&act->sa.sa_mask, 2267 sigdelsetmask(&act->sa.sa_mask,
2459 sigmask(SIGKILL) | sigmask(SIGSTOP)); 2268 sigmask(SIGKILL) | sigmask(SIGSTOP));
2269 *k = *act;
2460 /* 2270 /*
2461 * POSIX 3.3.1.3: 2271 * POSIX 3.3.1.3:
2462 * "Setting a signal action to SIG_IGN for a signal that is 2272 * "Setting a signal action to SIG_IGN for a signal that is
@@ -2469,19 +2279,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2469 * be discarded, whether or not it is blocked" 2279 * be discarded, whether or not it is blocked"
2470 */ 2280 */
2471 if (act->sa.sa_handler == SIG_IGN || 2281 if (act->sa.sa_handler == SIG_IGN ||
2472 (act->sa.sa_handler == SIG_DFL && 2282 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2473 sig_kernel_ignore(sig))) {
2474 /*
2475 * This is a fairly rare case, so we only take the
2476 * tasklist_lock once we're sure we'll need it.
2477 * Now we must do this little unlock and relock
2478 * dance to maintain the lock hierarchy.
2479 */
2480 struct task_struct *t = current; 2283 struct task_struct *t = current;
2481 spin_unlock_irq(&t->sighand->siglock);
2482 read_lock(&tasklist_lock);
2483 spin_lock_irq(&t->sighand->siglock);
2484 *k = *act;
2485 sigemptyset(&mask); 2284 sigemptyset(&mask);
2486 sigaddset(&mask, sig); 2285 sigaddset(&mask, sig);
2487 rm_from_queue_full(&mask, &t->signal->shared_pending); 2286 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2490,12 +2289,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2490 recalc_sigpending_tsk(t); 2289 recalc_sigpending_tsk(t);
2491 t = next_thread(t); 2290 t = next_thread(t);
2492 } while (t != current); 2291 } while (t != current);
2493 spin_unlock_irq(&current->sighand->siglock);
2494 read_unlock(&tasklist_lock);
2495 return 0;
2496 } 2292 }
2497
2498 *k = *act;
2499 } 2293 }
2500 2294
2501 spin_unlock_irq(&current->sighand->siglock); 2295 spin_unlock_irq(&current->sighand->siglock);
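The do_sigaction() hunks above now implement the quoted POSIX 3.3.1.3 rule entirely under siglock, without the old unlock/relock dance through tasklist_lock. A small userspace sketch of the behaviour being preserved, namely that installing SIG_IGN discards a pending, blocked signal (illustrative only, not part of the patch):

#include <signal.h>
#include <stdio.h>

int main(void)
{
        sigset_t blocked, pending;
        struct sigaction sa;

        sigemptyset(&blocked);
        sigaddset(&blocked, SIGUSR1);
        sigprocmask(SIG_BLOCK, &blocked, NULL);
        raise(SIGUSR1);                 /* SIGUSR1 is now pending and blocked */

        sa.sa_handler = SIG_IGN;        /* kernel side: rm_from_queue_full() drops it */
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0;
        sigaction(SIGUSR1, &sa, NULL);

        sigpending(&pending);
        printf("still pending: %d\n", sigismember(&pending, SIGUSR1)); /* prints 0 */
        return 0;
}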
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ec8fed42a8..336f92d64e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 446}
447#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
448 448
449static int __devinit cpu_callback(struct notifier_block *nfb, 449static int cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 450 unsigned long action,
451 void *hcpu) 451 void *hcpu)
452{ 452{
@@ -484,7 +484,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
484 return NOTIFY_OK; 484 return NOTIFY_OK;
485} 485}
486 486
487static struct notifier_block __devinitdata cpu_nfb = { 487static struct notifier_block cpu_nfb = {
488 .notifier_call = cpu_callback 488 .notifier_call = cpu_callback
489}; 489};
490 490
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9b3d5847e..14c7faf029 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int __devinit 107static int
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -140,7 +140,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
140 return NOTIFY_OK; 140 return NOTIFY_OK;
141} 141}
142 142
143static struct notifier_block __devinitdata cpu_nfb = { 143static struct notifier_block cpu_nfb = {
144 .notifier_call = cpu_callback 144 .notifier_call = cpu_callback
145}; 145};
146 146
@@ -152,5 +152,5 @@ __init void spawn_softlockup_task(void)
152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
153 register_cpu_notifier(&cpu_nfb); 153 register_cpu_notifier(&cpu_nfb);
154 154
155 notifier_chain_register(&panic_notifier_list, &panic_block); 155 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
156} 156}
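The softirq.c and softlockup.c hunks drop the __devinit/__devinitdata annotations from the CPU-hotplug callbacks and move the soft-lockup panic hook onto the new atomic notifier chain. A sketch of a client of that API, modelled on the atomic_notifier_chain_register() call above; the my_* names are illustrative, and panic_notifier_list is assumed to be redeclared as an atomic_notifier_head in <linux/kernel.h> as part of this conversion:

#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_panic_event(struct notifier_block *this,
                          unsigned long event, void *ptr)
{
        /* Atomic chain: may run with interrupts off, must not sleep.
         * ptr carries the panic message string. */
        printk(KERN_EMERG "panic callback: %s\n", (char *)ptr);
        return NOTIFY_DONE;
}

static struct notifier_block my_panic_block = {
        .notifier_call = my_panic_event,
};

static int __init my_panic_init(void)
{
        atomic_notifier_chain_register(&panic_notifier_list, &my_panic_block);
        return 0;
}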
diff --git a/kernel/sys.c b/kernel/sys.c
index 38bc73ede2..0b6ec0e793 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
95 * and the like. 95 * and the like.
96 */ 96 */
97 97
98static struct notifier_block *reboot_notifier_list; 98static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
99static DEFINE_RWLOCK(notifier_lock); 99
100/*
101 * Notifier chain core routines. The exported routines below
102 * are layered on top of these, with appropriate locking added.
103 */
104
105static int notifier_chain_register(struct notifier_block **nl,
106 struct notifier_block *n)
107{
108 while ((*nl) != NULL) {
109 if (n->priority > (*nl)->priority)
110 break;
111 nl = &((*nl)->next);
112 }
113 n->next = *nl;
114 rcu_assign_pointer(*nl, n);
115 return 0;
116}
117
118static int notifier_chain_unregister(struct notifier_block **nl,
119 struct notifier_block *n)
120{
121 while ((*nl) != NULL) {
122 if ((*nl) == n) {
123 rcu_assign_pointer(*nl, n->next);
124 return 0;
125 }
126 nl = &((*nl)->next);
127 }
128 return -ENOENT;
129}
130
131static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v)
133{
134 int ret = NOTIFY_DONE;
135 struct notifier_block *nb;
136
137 nb = rcu_dereference(*nl);
138 while (nb) {
139 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break;
142 nb = rcu_dereference(nb->next);
143 }
144 return ret;
145}
146
147/*
148 * Atomic notifier chain routines. Registration and unregistration
149 * use a mutex, and call_chain is synchronized by RCU (no locks).
150 */
100 151
101/** 152/**
102 * notifier_chain_register - Add notifier to a notifier chain 153 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
103 * @list: Pointer to root list pointer 154 * @nh: Pointer to head of the atomic notifier chain
104 * @n: New entry in notifier chain 155 * @n: New entry in notifier chain
105 * 156 *
106 * Adds a notifier to a notifier chain. 157 * Adds a notifier to an atomic notifier chain.
107 * 158 *
108 * Currently always returns zero. 159 * Currently always returns zero.
109 */ 160 */
161
162int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
163 struct notifier_block *n)
164{
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&nh->lock, flags);
169 ret = notifier_chain_register(&nh->head, n);
170 spin_unlock_irqrestore(&nh->lock, flags);
171 return ret;
172}
173
174EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
175
176/**
177 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
178 * @nh: Pointer to head of the atomic notifier chain
179 * @n: Entry to remove from notifier chain
180 *
181 * Removes a notifier from an atomic notifier chain.
182 *
183 * Returns zero on success or %-ENOENT on failure.
184 */
185int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
186 struct notifier_block *n)
187{
188 unsigned long flags;
189 int ret;
190
191 spin_lock_irqsave(&nh->lock, flags);
192 ret = notifier_chain_unregister(&nh->head, n);
193 spin_unlock_irqrestore(&nh->lock, flags);
194 synchronize_rcu();
195 return ret;
196}
197
198EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
199
200/**
201 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
202 * @nh: Pointer to head of the atomic notifier chain
203 * @val: Value passed unmodified to notifier function
204 * @v: Pointer passed unmodified to notifier function
205 *
206 * Calls each function in a notifier chain in turn. The functions
207 * run in an atomic context, so they must not block.
208 * This routine uses RCU to synchronize with changes to the chain.
209 *
210 * If the return value of the notifier can be and'ed
211 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
212 * will return immediately, with the return value of
213 * the notifier function which halted execution.
214 * Otherwise the return value is the return value
215 * of the last notifier function called.
216 */
110 217
111int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 218int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
219 unsigned long val, void *v)
112{ 220{
113 write_lock(&notifier_lock); 221 int ret;
114 while(*list) 222
115 { 223 rcu_read_lock();
116 if(n->priority > (*list)->priority) 224 ret = notifier_call_chain(&nh->head, val, v);
117 break; 225 rcu_read_unlock();
118 list= &((*list)->next); 226 return ret;
119 }
120 n->next = *list;
121 *list=n;
122 write_unlock(&notifier_lock);
123 return 0;
124} 227}
125 228
126EXPORT_SYMBOL(notifier_chain_register); 229EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
230
231/*
232 * Blocking notifier chain routines. All access to the chain is
233 * synchronized by an rwsem.
234 */
127 235
128/** 236/**
129 * notifier_chain_unregister - Remove notifier from a notifier chain 237 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
130 * @nl: Pointer to root list pointer 238 * @nh: Pointer to head of the blocking notifier chain
131 * @n: New entry in notifier chain 239 * @n: New entry in notifier chain
132 * 240 *
133 * Removes a notifier from a notifier chain. 241 * Adds a notifier to a blocking notifier chain.
242 * Must be called in process context.
134 * 243 *
135 * Returns zero on success, or %-ENOENT on failure. 244 * Currently always returns zero.
136 */ 245 */
137 246
138int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 247int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
248 struct notifier_block *n)
139{ 249{
140 write_lock(&notifier_lock); 250 int ret;
141 while((*nl)!=NULL) 251
142 { 252 /*
143 if((*nl)==n) 253 * This code gets used during boot-up, when task switching is
144 { 254 * not yet working and interrupts must remain disabled. At
145 *nl=n->next; 255 * such times we must not call down_write().
146 write_unlock(&notifier_lock); 256 */
147 return 0; 257 if (unlikely(system_state == SYSTEM_BOOTING))
148 } 258 return notifier_chain_register(&nh->head, n);
149 nl=&((*nl)->next); 259
150 } 260 down_write(&nh->rwsem);
151 write_unlock(&notifier_lock); 261 ret = notifier_chain_register(&nh->head, n);
152 return -ENOENT; 262 up_write(&nh->rwsem);
263 return ret;
153} 264}
154 265
155EXPORT_SYMBOL(notifier_chain_unregister); 266EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
156 267
157/** 268/**
158 * notifier_call_chain - Call functions in a notifier chain 269 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
159 * @n: Pointer to root pointer of notifier chain 270 * @nh: Pointer to head of the blocking notifier chain
271 * @n: Entry to remove from notifier chain
272 *
273 * Removes a notifier from a blocking notifier chain.
274 * Must be called from process context.
275 *
276 * Returns zero on success or %-ENOENT on failure.
277 */
278int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
279 struct notifier_block *n)
280{
281 int ret;
282
283 /*
284 * This code gets used during boot-up, when task switching is
285 * not yet working and interrupts must remain disabled. At
286 * such times we must not call down_write().
287 */
288 if (unlikely(system_state == SYSTEM_BOOTING))
289 return notifier_chain_unregister(&nh->head, n);
290
291 down_write(&nh->rwsem);
292 ret = notifier_chain_unregister(&nh->head, n);
293 up_write(&nh->rwsem);
294 return ret;
295}
296
297EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
298
299/**
300 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
301 * @nh: Pointer to head of the blocking notifier chain
160 * @val: Value passed unmodified to notifier function 302 * @val: Value passed unmodified to notifier function
161 * @v: Pointer passed unmodified to notifier function 303 * @v: Pointer passed unmodified to notifier function
162 * 304 *
163 * Calls each function in a notifier chain in turn. 305 * Calls each function in a notifier chain in turn. The functions
306 * run in a process context, so they are allowed to block.
164 * 307 *
165 * If the return value of the notifier can be and'd 308 * If the return value of the notifier can be and'ed
166 * with %NOTIFY_STOP_MASK, then notifier_call_chain 309 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
167 * will return immediately, with the return value of 310 * will return immediately, with the return value of
168 * the notifier function which halted execution. 311 * the notifier function which halted execution.
169 * Otherwise, the return value is the return value 312 * Otherwise the return value is the return value
170 * of the last notifier function called. 313 * of the last notifier function called.
171 */ 314 */
172 315
173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 316int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
317 unsigned long val, void *v)
174{ 318{
175 int ret=NOTIFY_DONE; 319 int ret;
176 struct notifier_block *nb = *n;
177 320
178 while(nb) 321 down_read(&nh->rwsem);
179 { 322 ret = notifier_call_chain(&nh->head, val, v);
180 ret=nb->notifier_call(nb,val,v); 323 up_read(&nh->rwsem);
181 if(ret&NOTIFY_STOP_MASK)
182 {
183 return ret;
184 }
185 nb=nb->next;
186 }
187 return ret; 324 return ret;
188} 325}
189 326
190EXPORT_SYMBOL(notifier_call_chain); 327EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
328
329/*
330 * Raw notifier chain routines. There is no protection;
331 * the caller must provide it. Use at your own risk!
332 */
333
334/**
335 * raw_notifier_chain_register - Add notifier to a raw notifier chain
336 * @nh: Pointer to head of the raw notifier chain
337 * @n: New entry in notifier chain
338 *
339 * Adds a notifier to a raw notifier chain.
340 * All locking must be provided by the caller.
341 *
342 * Currently always returns zero.
343 */
344
345int raw_notifier_chain_register(struct raw_notifier_head *nh,
346 struct notifier_block *n)
347{
348 return notifier_chain_register(&nh->head, n);
349}
350
351EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
352
353/**
354 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
355 * @nh: Pointer to head of the raw notifier chain
356 * @n: Entry to remove from notifier chain
357 *
358 * Removes a notifier from a raw notifier chain.
359 * All locking must be provided by the caller.
360 *
361 * Returns zero on success or %-ENOENT on failure.
362 */
363int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
364 struct notifier_block *n)
365{
366 return notifier_chain_unregister(&nh->head, n);
367}
368
369EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
370
371/**
372 * raw_notifier_call_chain - Call functions in a raw notifier chain
373 * @nh: Pointer to head of the raw notifier chain
374 * @val: Value passed unmodified to notifier function
375 * @v: Pointer passed unmodified to notifier function
376 *
377 * Calls each function in a notifier chain in turn. The functions
378 * run in an undefined context.
379 * All locking must be provided by the caller.
380 *
381 * If the return value of the notifier can be and'ed
382 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
383 * will return immediately, with the return value of
384 * the notifier function which halted execution.
385 * Otherwise the return value is the return value
386 * of the last notifier function called.
387 */
388
389int raw_notifier_call_chain(struct raw_notifier_head *nh,
390 unsigned long val, void *v)
391{
392 return notifier_call_chain(&nh->head, val, v);
393}
394
395EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
191 396
192/** 397/**
193 * register_reboot_notifier - Register function to be called at reboot time 398 * register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
196 * Registers a function with the list of functions 401 * Registers a function with the list of functions
197 * to be called at reboot time. 402 * to be called at reboot time.
198 * 403 *
199 * Currently always returns zero, as notifier_chain_register 404 * Currently always returns zero, as blocking_notifier_chain_register
200 * always returns zero. 405 * always returns zero.
201 */ 406 */
202 407
203int register_reboot_notifier(struct notifier_block * nb) 408int register_reboot_notifier(struct notifier_block * nb)
204{ 409{
205 return notifier_chain_register(&reboot_notifier_list, nb); 410 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
206} 411}
207 412
208EXPORT_SYMBOL(register_reboot_notifier); 413EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,7 +424,7 @@ EXPORT_SYMBOL(register_reboot_notifier);
219 424
220int unregister_reboot_notifier(struct notifier_block * nb) 425int unregister_reboot_notifier(struct notifier_block * nb)
221{ 426{
222 return notifier_chain_unregister(&reboot_notifier_list, nb); 427 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
223} 428}
224 429
225EXPORT_SYMBOL(unregister_reboot_notifier); 430EXPORT_SYMBOL(unregister_reboot_notifier);
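With reboot_notifier_list converted to a BLOCKING_NOTIFIER_HEAD, reboot callbacks are invoked from process context via blocking_notifier_call_chain() and may sleep. A registration sketch against the exported helpers above; the my_* names are illustrative, and SYS_RESTART/SYS_HALT/SYS_POWER_OFF are the event codes passed by kernel_restart_prepare() and kernel_shutdown_prepare() further down:

#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int my_reboot_event(struct notifier_block *nb,
                           unsigned long code, void *cmd)
{
        /* Blocking chain: sleeping (e.g. flushing state to disk) is
         * permitted here.  cmd is the restart command string for
         * SYS_RESTART and may be NULL. */
        printk(KERN_INFO "reboot event %lu, cmd %s\n",
               code, cmd ? (char *)cmd : "(none)");
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call  = my_reboot_event,
        .priority       = 0,            /* chains are kept sorted by priority */
};

static int __init my_reboot_init(void)
{
        return register_reboot_notifier(&my_reboot_nb);
}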
@@ -380,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
380 585
381void kernel_restart_prepare(char *cmd) 586void kernel_restart_prepare(char *cmd)
382{ 587{
383 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
384 system_state = SYSTEM_RESTART; 589 system_state = SYSTEM_RESTART;
385 device_shutdown(); 590 device_shutdown();
386} 591}
@@ -430,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
430 635
431void kernel_shutdown_prepare(enum system_states state) 636void kernel_shutdown_prepare(enum system_states state)
432{ 637{
433 notifier_call_chain(&reboot_notifier_list, 638 blocking_notifier_call_chain(&reboot_notifier_list,
434 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 639 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
435 system_state = state; 640 system_state = state;
436 device_shutdown(); 641 device_shutdown();
@@ -997,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
997 */ 1202 */
998 if (tbuf) { 1203 if (tbuf) {
999 struct tms tmp; 1204 struct tms tmp;
1205 struct task_struct *tsk = current;
1206 struct task_struct *t;
1000 cputime_t utime, stime, cutime, cstime; 1207 cputime_t utime, stime, cutime, cstime;
1001 1208
1002#ifdef CONFIG_SMP 1209 spin_lock_irq(&tsk->sighand->siglock);
1003 if (thread_group_empty(current)) { 1210 utime = tsk->signal->utime;
1004 /* 1211 stime = tsk->signal->stime;
1005 * Single thread case without the use of any locks. 1212 t = tsk;
1006 * 1213 do {
1007 * We may race with release_task if two threads are 1214 utime = cputime_add(utime, t->utime);
1008 * executing. However, release task first adds up the 1215 stime = cputime_add(stime, t->stime);
1009 * counters (__exit_signal) before removing the task 1216 t = next_thread(t);
1010 * from the process tasklist (__unhash_process). 1217 } while (t != tsk);
1011 * __exit_signal also acquires and releases the
1012 * siglock which results in the proper memory ordering
1013 * so that the list modifications are always visible
1014 * after the counters have been updated.
1015 *
1016 * If the counters have been updated by the second thread
1017 * but the thread has not yet been removed from the list
1018 * then the other branch will be executing which will
1019 * block on tasklist_lock until the exit handling of the
1020 * other task is finished.
1021 *
1022 * This also implies that the sighand->siglock cannot
1023 * be held by another processor. So we can also
1024 * skip acquiring that lock.
1025 */
1026 utime = cputime_add(current->signal->utime, current->utime);
1027 stime = cputime_add(current->signal->utime, current->stime);
1028 cutime = current->signal->cutime;
1029 cstime = current->signal->cstime;
1030 } else
1031#endif
1032 {
1033
1034 /* Process with multiple threads */
1035 struct task_struct *tsk = current;
1036 struct task_struct *t;
1037 1218
1038 read_lock(&tasklist_lock); 1219 cutime = tsk->signal->cutime;
1039 utime = tsk->signal->utime; 1220 cstime = tsk->signal->cstime;
1040 stime = tsk->signal->stime; 1221 spin_unlock_irq(&tsk->sighand->siglock);
1041 t = tsk;
1042 do {
1043 utime = cputime_add(utime, t->utime);
1044 stime = cputime_add(stime, t->stime);
1045 t = next_thread(t);
1046 } while (t != tsk);
1047 1222
1048 /*
1049 * While we have tasklist_lock read-locked, no dying thread
1050 * can be updating current->signal->[us]time. Instead,
1051 * we got their counts included in the live thread loop.
1052 * However, another thread can come in right now and
1053 * do a wait call that updates current->signal->c[us]time.
1054 * To make sure we always see that pair updated atomically,
1055 * we take the siglock around fetching them.
1056 */
1057 spin_lock_irq(&tsk->sighand->siglock);
1058 cutime = tsk->signal->cutime;
1059 cstime = tsk->signal->cstime;
1060 spin_unlock_irq(&tsk->sighand->siglock);
1061 read_unlock(&tasklist_lock);
1062 }
1063 tmp.tms_utime = cputime_to_clock_t(utime); 1223 tmp.tms_utime = cputime_to_clock_t(utime);
1064 tmp.tms_stime = cputime_to_clock_t(stime); 1224 tmp.tms_stime = cputime_to_clock_t(stime);
1065 tmp.tms_cutime = cputime_to_clock_t(cutime); 1225 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1212,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
1212asmlinkage long sys_setsid(void) 1372asmlinkage long sys_setsid(void)
1213{ 1373{
1214 struct task_struct *group_leader = current->group_leader; 1374 struct task_struct *group_leader = current->group_leader;
1215 struct pid *pid; 1375 pid_t session;
1216 int err = -EPERM; 1376 int err = -EPERM;
1217 1377
1218 mutex_lock(&tty_mutex); 1378 mutex_lock(&tty_mutex);
1219 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1220 1380
1221 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 /* Fail if I am already a session leader */
1222 if (pid) 1382 if (group_leader->signal->leader)
1383 goto out;
1384
1385 session = group_leader->pid;
1386 /* Fail if a process group id already exists that equals the
1387 * proposed session id.
1388 *
1389 * Don't check if session id == 1 because kernel threads use this
1390 * session id and so the check will always fail and make it so
1391 * init cannot successfully call setsid.
1392 */
1393 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
1223 goto out; 1394 goto out;
1224 1395
1225 group_leader->signal->leader = 1; 1396 group_leader->signal->leader = 1;
1226 __set_special_pids(group_leader->pid, group_leader->pid); 1397 __set_special_pids(session, session);
1227 group_leader->signal->tty = NULL; 1398 group_leader->signal->tty = NULL;
1228 group_leader->signal->tty_old_pgrp = 0; 1399 group_leader->signal->tty_old_pgrp = 0;
1229 err = process_group(group_leader); 1400 err = process_group(group_leader);
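The sys_setsid() hunk replaces the find_pid(PIDTYPE_PGID, ...) lookup with an explicit session-leader check plus a scan for a clashing process-group id, with the session-id-1 carve-out so init itself can still call setsid(). A userspace sketch of the visible semantics (illustrative only):

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
        pid_t child;

        /* A foreground job is typically its own process-group leader, so a
         * pgrp equal to our pid already exists and the check above fails. */
        if (setsid() == (pid_t)-1)
                perror("setsid() as group leader");     /* expect EPERM */

        child = fork();
        if (child == 0) {
                /* The child is not a group leader; no pgrp carries its pid,
                 * so setsid() creates a new session and process group. */
                printf("child session id: %d\n", (int)setsid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}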
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6..5433195040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
@@ -116,3 +120,15 @@ cond_syscall(sys32_sysctl);
116cond_syscall(ppc_rtas); 120cond_syscall(ppc_rtas);
117cond_syscall(sys_spu_run); 121cond_syscall(sys_spu_run);
118cond_syscall(sys_spu_create); 122cond_syscall(sys_spu_create);
123
124/* mmu depending weak syscall entries */
125cond_syscall(sys_mprotect);
126cond_syscall(sys_msync);
127cond_syscall(sys_mlock);
128cond_syscall(sys_munlock);
129cond_syscall(sys_mlockall);
130cond_syscall(sys_munlockall);
131cond_syscall(sys_mincore);
132cond_syscall(sys_madvise);
133cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages);
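The new cond_syscall() entries give the robust-futex and MMU-only system calls weak fallbacks, so a kernel built without the corresponding support resolves them to sys_ni_syscall() and returns -ENOSYS rather than failing to link. A userspace probe sketch, assuming the architecture headers define __NR_get_robust_list (illustrative only):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
#ifdef __NR_get_robust_list
        long ret = syscall(__NR_get_robust_list, 0, NULL, NULL);

        if (ret == -1 && errno == ENOSYS)
                printf("get_robust_list falls through to sys_ni_syscall\n");
        else
                printf("get_robust_list is wired up (ret=%ld errno=%d)\n",
                       ret, errno);
#else
        printf("__NR_get_robust_list not defined on this architecture\n");
#endif
        return 0;
}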
diff --git a/kernel/time.c b/kernel/time.c
index ff8e7019c4..b00ddc71ce 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -410,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time);
410 * current_fs_time - Return FS time 410 * current_fs_time - Return FS time
411 * @sb: Superblock. 411 * @sb: Superblock.
412 * 412 *
413 * Return the current time truncated to the time granuality supported by 413 * Return the current time truncated to the time granularity supported by
414 * the fs. 414 * the fs.
415 */ 415 */
416struct timespec current_fs_time(struct super_block *sb) 416struct timespec current_fs_time(struct super_block *sb)
@@ -421,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb)
421EXPORT_SYMBOL(current_fs_time); 421EXPORT_SYMBOL(current_fs_time);
422 422
423/** 423/**
424 * timespec_trunc - Truncate timespec to a granuality 424 * timespec_trunc - Truncate timespec to a granularity
425 * @t: Timespec 425 * @t: Timespec
426 * @gran: Granuality in ns. 426 * @gran: Granularity in ns.
427 * 427 *
428 * Truncate a timespec to a granuality. gran must be smaller than a second. 428 * Truncate a timespec to a granularity. gran must be smaller than a second.
429 * Always rounds down. 429 * Always rounds down.
430 * 430 *
431 * This function should be only used for timestamps returned by 431 * This function should be only used for timestamps returned by
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187..9e49deed46 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -86,14 +81,16 @@ struct tvec_t_base_s {
86} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); 84
90static tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
91 88
92static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
93 struct timer_list *timer) 90 struct timer_list *timer)
94{ 91{
95#ifdef CONFIG_SMP 92#ifdef CONFIG_SMP
96 base->t_base.running_timer = timer; 93 base->running_timer = timer;
97#endif 94#endif
98} 95}
99 96
@@ -139,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
139 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
140} 137}
141 138
142typedef struct timer_base_s timer_base_t;
143/*
144 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
145 * at compile time, and we need timer->base to lock the timer.
146 */
147timer_base_t __init_timer_base
148 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
149EXPORT_SYMBOL(__init_timer_base);
150
151/*** 139/***
152 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
153 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
@@ -158,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base);
158void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
159{ 147{
160 timer->entry.next = NULL; 148 timer->entry.next = NULL;
161 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; 149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
162} 150}
163EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
164 152
@@ -174,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer,
174} 162}
175 163
176/* 164/*
177 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 165 * We are using hashed locking: holding per_cpu(tvec_bases).lock
178 * means that all timers which are tied to this base via timer->base are 166 * means that all timers which are tied to this base via timer->base are
179 * locked, and the base itself is locked too. 167 * locked, and the base itself is locked too.
180 * 168 *
@@ -185,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer,
185 * possible to set timer->base = NULL and drop the lock: the timer remains 173 * possible to set timer->base = NULL and drop the lock: the timer remains
186 * locked. 174 * locked.
187 */ 175 */
188static timer_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
189 unsigned long *flags) 177 unsigned long *flags)
190{ 178{
191 timer_base_t *base; 179 tvec_base_t *base;
192 180
193 for (;;) { 181 for (;;) {
194 base = timer->base; 182 base = timer->base;
@@ -205,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
205 193
206int __mod_timer(struct timer_list *timer, unsigned long expires) 194int __mod_timer(struct timer_list *timer, unsigned long expires)
207{ 195{
208 timer_base_t *base; 196 tvec_base_t *base, *new_base;
209 tvec_base_t *new_base;
210 unsigned long flags; 197 unsigned long flags;
211 int ret = 0; 198 int ret = 0;
212 199
@@ -221,7 +208,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
221 208
222 new_base = __get_cpu_var(tvec_bases); 209 new_base = __get_cpu_var(tvec_bases);
223 210
224 if (base != &new_base->t_base) { 211 if (base != new_base) {
225 /* 212 /*
226 * We are trying to schedule the timer on the local CPU. 213 * We are trying to schedule the timer on the local CPU.
227 * However we can't change timer's base while it is running, 214 * However we can't change timer's base while it is running,
@@ -229,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
229 * handler yet has not finished. This also guarantees that 216 * handler yet has not finished. This also guarantees that
230 * the timer is serialized wrt itself. 217 * the timer is serialized wrt itself.
231 */ 218 */
232 if (unlikely(base->running_timer == timer)) { 219 if (likely(base->running_timer != timer)) {
233 /* The timer remains on a former base */
234 new_base = container_of(base, tvec_base_t, t_base);
235 } else {
236 /* See the comment in lock_timer_base() */ 220 /* See the comment in lock_timer_base() */
237 timer->base = NULL; 221 timer->base = NULL;
238 spin_unlock(&base->lock); 222 spin_unlock(&base->lock);
239 spin_lock(&new_base->t_base.lock); 223 base = new_base;
240 timer->base = &new_base->t_base; 224 spin_lock(&base->lock);
225 timer->base = base;
241 } 226 }
242 } 227 }
243 228
244 timer->expires = expires; 229 timer->expires = expires;
245 internal_add_timer(new_base, timer); 230 internal_add_timer(base, timer);
246 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 231 spin_unlock_irqrestore(&base->lock, flags);
247 232
248 return ret; 233 return ret;
249} 234}
@@ -263,10 +248,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
263 unsigned long flags; 248 unsigned long flags;
264 249
265 BUG_ON(timer_pending(timer) || !timer->function); 250 BUG_ON(timer_pending(timer) || !timer->function);
266 spin_lock_irqsave(&base->t_base.lock, flags); 251 spin_lock_irqsave(&base->lock, flags);
267 timer->base = &base->t_base; 252 timer->base = base;
268 internal_add_timer(base, timer); 253 internal_add_timer(base, timer);
269 spin_unlock_irqrestore(&base->t_base.lock, flags); 254 spin_unlock_irqrestore(&base->lock, flags);
270} 255}
271 256
272 257
@@ -319,7 +304,7 @@ EXPORT_SYMBOL(mod_timer);
319 */ 304 */
320int del_timer(struct timer_list *timer) 305int del_timer(struct timer_list *timer)
321{ 306{
322 timer_base_t *base; 307 tvec_base_t *base;
323 unsigned long flags; 308 unsigned long flags;
324 int ret = 0; 309 int ret = 0;
325 310
@@ -346,7 +331,7 @@ EXPORT_SYMBOL(del_timer);
346 */ 331 */
347int try_to_del_timer_sync(struct timer_list *timer) 332int try_to_del_timer_sync(struct timer_list *timer)
348{ 333{
349 timer_base_t *base; 334 tvec_base_t *base;
350 unsigned long flags; 335 unsigned long flags;
351 int ret = -1; 336 int ret = -1;
352 337
@@ -410,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
410 struct timer_list *tmp; 395 struct timer_list *tmp;
411 396
412 tmp = list_entry(curr, struct timer_list, entry); 397 tmp = list_entry(curr, struct timer_list, entry);
413 BUG_ON(tmp->base != &base->t_base); 398 BUG_ON(tmp->base != base);
414 curr = curr->next; 399 curr = curr->next;
415 internal_add_timer(base, tmp); 400 internal_add_timer(base, tmp);
416 } 401 }
@@ -432,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base)
432{ 417{
433 struct timer_list *timer; 418 struct timer_list *timer;
434 419
435 spin_lock_irq(&base->t_base.lock); 420 spin_lock_irq(&base->lock);
436 while (time_after_eq(jiffies, base->timer_jiffies)) { 421 while (time_after_eq(jiffies, base->timer_jiffies)) {
437 struct list_head work_list = LIST_HEAD_INIT(work_list); 422 struct list_head work_list = LIST_HEAD_INIT(work_list);
438 struct list_head *head = &work_list; 423 struct list_head *head = &work_list;
@@ -458,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base)
458 443
459 set_running_timer(base, timer); 444 set_running_timer(base, timer);
460 detach_timer(timer, 1); 445 detach_timer(timer, 1);
461 spin_unlock_irq(&base->t_base.lock); 446 spin_unlock_irq(&base->lock);
462 { 447 {
463 int preempt_count = preempt_count(); 448 int preempt_count = preempt_count();
464 fn(data); 449 fn(data);
@@ -471,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base)
471 BUG(); 456 BUG();
472 } 457 }
473 } 458 }
474 spin_lock_irq(&base->t_base.lock); 459 spin_lock_irq(&base->lock);
475 } 460 }
476 } 461 }
477 set_running_timer(base, NULL); 462 set_running_timer(base, NULL);
478 spin_unlock_irq(&base->t_base.lock); 463 spin_unlock_irq(&base->lock);
479} 464}
480 465
481#ifdef CONFIG_NO_IDLE_HZ 466#ifdef CONFIG_NO_IDLE_HZ
@@ -506,7 +491,7 @@ unsigned long next_timer_interrupt(void)
506 hr_expires += jiffies; 491 hr_expires += jiffies;
507 492
508 base = __get_cpu_var(tvec_bases); 493 base = __get_cpu_var(tvec_bases);
509 spin_lock(&base->t_base.lock); 494 spin_lock(&base->lock);
510 expires = base->timer_jiffies + (LONG_MAX >> 1); 495 expires = base->timer_jiffies + (LONG_MAX >> 1);
511 list = NULL; 496 list = NULL;
512 497
@@ -554,7 +539,23 @@ found:
554 expires = nte->expires; 539 expires = nte->expires;
555 } 540 }
556 } 541 }
557 spin_unlock(&base->t_base.lock); 542 spin_unlock(&base->lock);
543
544 /*
545 * It can happen that other CPUs service timer IRQs and increment
546 * jiffies, but we have not yet got a local timer tick to process
547 * the timer wheels. In that case, the expiry time can be before
548 * jiffies, but since the high-resolution timer here is relative to
549 * jiffies, the default expression when high-resolution timers are
550 * not active,
551 *
552 * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
553 *
554 * would falsely evaluate to true. If that is the case, just
555 * return jiffies so that we can immediately fire the local timer
556 */
557 if (time_before(expires, jiffies))
558 return jiffies;
558 559
559 if (time_before(hr_expires, expires)) 560 if (time_before(hr_expires, expires))
560 return hr_expires; 561 return hr_expires;
@@ -841,7 +842,7 @@ void update_process_times(int user_tick)
841 */ 842 */
842static unsigned long count_active_tasks(void) 843static unsigned long count_active_tasks(void)
843{ 844{
844 return (nr_running() + nr_uninterruptible()) * FIXED_1; 845 return nr_active() * FIXED_1;
845} 846}
846 847
847/* 848/*
@@ -1240,29 +1241,37 @@ static int __devinit init_timers_cpu(int cpu)
1240{ 1241{
1241 int j; 1242 int j;
1242 tvec_base_t *base; 1243 tvec_base_t *base;
1244 static char __devinitdata tvec_base_done[NR_CPUS];
1243 1245
1244 base = per_cpu(tvec_bases, cpu); 1246 if (!tvec_base_done[cpu]) {
1245 if (!base) {
1246 static char boot_done; 1247 static char boot_done;
1247 1248
1248 /*
1249 * Cannot do allocation in init_timers as that runs before the
1250 * allocator initializes (and would waste memory if there are
1251 * more possible CPUs than will ever be installed/brought up).
1252 */
1253 if (boot_done) { 1249 if (boot_done) {
1250 /*
1251 * The APs use this path later in boot
1252 */
1254 base = kmalloc_node(sizeof(*base), GFP_KERNEL, 1253 base = kmalloc_node(sizeof(*base), GFP_KERNEL,
1255 cpu_to_node(cpu)); 1254 cpu_to_node(cpu));
1256 if (!base) 1255 if (!base)
1257 return -ENOMEM; 1256 return -ENOMEM;
1258 memset(base, 0, sizeof(*base)); 1257 memset(base, 0, sizeof(*base));
1258 per_cpu(tvec_bases, cpu) = base;
1259 } else { 1259 } else {
1260 base = &boot_tvec_bases; 1260 /*
1261 * This is for the boot CPU - we use compile-time
1262 * static initialisation because per-cpu memory isn't
1263 * ready yet and because the memory allocators are not
1264 * initialised either.
1265 */
1261 boot_done = 1; 1266 boot_done = 1;
1267 base = &boot_tvec_bases;
1262 } 1268 }
1263 per_cpu(tvec_bases, cpu) = base; 1269 tvec_base_done[cpu] = 1;
1270 } else {
1271 base = per_cpu(tvec_bases, cpu);
1264 } 1272 }
1265 spin_lock_init(&base->t_base.lock); 1273
1274 spin_lock_init(&base->lock);
1266 for (j = 0; j < TVN_SIZE; j++) { 1275 for (j = 0; j < TVN_SIZE; j++) {
1267 INIT_LIST_HEAD(base->tv5.vec + j); 1276 INIT_LIST_HEAD(base->tv5.vec + j);
1268 INIT_LIST_HEAD(base->tv4.vec + j); 1277 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1284,7 +1293,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1284 while (!list_empty(head)) { 1293 while (!list_empty(head)) {
1285 timer = list_entry(head->next, struct timer_list, entry); 1294 timer = list_entry(head->next, struct timer_list, entry);
1286 detach_timer(timer, 0); 1295 detach_timer(timer, 0);
1287 timer->base = &new_base->t_base; 1296 timer->base = new_base;
1288 internal_add_timer(new_base, timer); 1297 internal_add_timer(new_base, timer);
1289 } 1298 }
1290} 1299}
@@ -1300,11 +1309,11 @@ static void __devinit migrate_timers(int cpu)
1300 new_base = get_cpu_var(tvec_bases); 1309 new_base = get_cpu_var(tvec_bases);
1301 1310
1302 local_irq_disable(); 1311 local_irq_disable();
1303 spin_lock(&new_base->t_base.lock); 1312 spin_lock(&new_base->lock);
1304 spin_lock(&old_base->t_base.lock); 1313 spin_lock(&old_base->lock);
1314
1315 BUG_ON(old_base->running_timer);
1305 1316
1306 if (old_base->t_base.running_timer)
1307 BUG();
1308 for (i = 0; i < TVR_SIZE; i++) 1317 for (i = 0; i < TVR_SIZE; i++)
1309 migrate_timer_list(new_base, old_base->tv1.vec + i); 1318 migrate_timer_list(new_base, old_base->tv1.vec + i);
1310 for (i = 0; i < TVN_SIZE; i++) { 1319 for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,14 +1323,14 @@ static void __devinit migrate_timers(int cpu)
1314 migrate_timer_list(new_base, old_base->tv5.vec + i); 1323 migrate_timer_list(new_base, old_base->tv5.vec + i);
1315 } 1324 }
1316 1325
1317 spin_unlock(&old_base->t_base.lock); 1326 spin_unlock(&old_base->lock);
1318 spin_unlock(&new_base->t_base.lock); 1327 spin_unlock(&new_base->lock);
1319 local_irq_enable(); 1328 local_irq_enable();
1320 put_cpu_var(tvec_bases); 1329 put_cpu_var(tvec_bases);
1321} 1330}
1322#endif /* CONFIG_HOTPLUG_CPU */ 1331#endif /* CONFIG_HOTPLUG_CPU */
1323 1332
1324static int __devinit timer_cpu_notify(struct notifier_block *self, 1333static int timer_cpu_notify(struct notifier_block *self,
1325 unsigned long action, void *hcpu) 1334 unsigned long action, void *hcpu)
1326{ 1335{
1327 long cpu = (long)hcpu; 1336 long cpu = (long)hcpu;
@@ -1341,7 +1350,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1341 return NOTIFY_OK; 1350 return NOTIFY_OK;
1342} 1351}
1343 1352
1344static struct notifier_block __devinitdata timers_nb = { 1353static struct notifier_block timers_nb = {
1345 .notifier_call = timer_cpu_notify, 1354 .notifier_call = timer_cpu_notify,
1346}; 1355};
1347 1356
@@ -1471,7 +1480,7 @@ static void time_interpolator_update(long delta_nsec)
1471 */ 1480 */
1472 if (jiffies % INTERPOLATOR_ADJUST == 0) 1481 if (jiffies % INTERPOLATOR_ADJUST == 0)
1473 { 1482 {
1474 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1483 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1475 time_interpolator->nsec_per_cyc--; 1484 time_interpolator->nsec_per_cyc--;
1476 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1485 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1477 time_interpolator->nsec_per_cyc++; 1486 time_interpolator->nsec_per_cyc++;
@@ -1495,8 +1504,7 @@ register_time_interpolator(struct time_interpolator *ti)
1495 unsigned long flags; 1504 unsigned long flags;
1496 1505
1497 /* Sanity check */ 1506 /* Sanity check */
1498 if (ti->frequency == 0 || ti->mask == 0) 1507 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1499 BUG();
1500 1508
1501 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1509 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1502 spin_lock(&time_interpolator_lock); 1510 spin_lock(&time_interpolator_lock);
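The kernel/timer.c hunks fold the old timer_base_s into tvec_t_base_s, so timer->base now points directly at the per-CPU base and the __init_timer_base indirection disappears. The timer API itself is unchanged; a minimal usage sketch against it (the my_* names are illustrative):

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
        /* Runs in softirq context on the CPU whose tvec base queued it. */
}

static void my_timer_start(void)
{
        init_timer(&my_timer);          /* base = this CPU's tvec_bases entry */
        my_timer.function = my_timer_fn;
        my_timer.data = 0;
        my_timer.expires = jiffies + HZ;
        add_timer(&my_timer);           /* later: mod_timer(&my_timer, jiffies + 2*HZ) */
}

/* Tear-down should use del_timer_sync() (or try_to_del_timer_sync() where
 * waiting is not possible) so that a handler still running on another CPU
 * has finished before my_timer goes away. */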
diff --git a/kernel/uid16.c b/kernel/uid16.c
index aa25605027..187e2a4238 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -20,43 +20,67 @@
20 20
21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) 21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
22{ 22{
23 return sys_chown(filename, low2highuid(user), low2highgid(group)); 23 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
24 /* avoid REGPARM breakage on x86: */
25 prevent_tail_call(ret);
26 return ret;
24} 27}
25 28
26asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) 29asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
27{ 30{
28 return sys_lchown(filename, low2highuid(user), low2highgid(group)); 31 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
32 /* avoid REGPARM breakage on x86: */
33 prevent_tail_call(ret);
34 return ret;
29} 35}
30 36
31asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) 37asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
32{ 38{
33 return sys_fchown(fd, low2highuid(user), low2highgid(group)); 39 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
40 /* avoid REGPARM breakage on x86: */
41 prevent_tail_call(ret);
42 return ret;
34} 43}
35 44
36asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) 45asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
37{ 46{
38 return sys_setregid(low2highgid(rgid), low2highgid(egid)); 47 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
48 /* avoid REGPARM breakage on x86: */
49 prevent_tail_call(ret);
50 return ret;
39} 51}
40 52
41asmlinkage long sys_setgid16(old_gid_t gid) 53asmlinkage long sys_setgid16(old_gid_t gid)
42{ 54{
43 return sys_setgid(low2highgid(gid)); 55 long ret = sys_setgid(low2highgid(gid));
56 /* avoid REGPARM breakage on x86: */
57 prevent_tail_call(ret);
58 return ret;
44} 59}
45 60
46asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) 61asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
47{ 62{
48 return sys_setreuid(low2highuid(ruid), low2highuid(euid)); 63 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
64 /* avoid REGPARM breakage on x86: */
65 prevent_tail_call(ret);
66 return ret;
49} 67}
50 68
51asmlinkage long sys_setuid16(old_uid_t uid) 69asmlinkage long sys_setuid16(old_uid_t uid)
52{ 70{
53 return sys_setuid(low2highuid(uid)); 71 long ret = sys_setuid(low2highuid(uid));
72 /* avoid REGPARM breakage on x86: */
73 prevent_tail_call(ret);
74 return ret;
54} 75}
55 76
56asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) 77asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
57{ 78{
58 return sys_setresuid(low2highuid(ruid), low2highuid(euid), 79 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
59 low2highuid(suid)); 80 low2highuid(suid));
81 /* avoid REGPARM breakage on x86: */
82 prevent_tail_call(ret);
83 return ret;
60} 84}
61 85
62asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 86asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
72 96
73asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) 97asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
74{ 98{
75 return sys_setresgid(low2highgid(rgid), low2highgid(egid), 99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
76 low2highgid(sgid)); 100 low2highgid(sgid));
101 /* avoid REGPARM breakage on x86: */
102 prevent_tail_call(ret);
103 return ret;
77} 104}
78 105
79asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
89 116
90asmlinkage long sys_setfsuid16(old_uid_t uid) 117asmlinkage long sys_setfsuid16(old_uid_t uid)
91{ 118{
92 return sys_setfsuid(low2highuid(uid)); 119 long ret = sys_setfsuid(low2highuid(uid));
120 /* avoid REGPARM breakage on x86: */
121 prevent_tail_call(ret);
122 return ret;
93} 123}
94 124
95asmlinkage long sys_setfsgid16(old_gid_t gid) 125asmlinkage long sys_setfsgid16(old_gid_t gid)
96{ 126{
97 return sys_setfsgid(low2highgid(gid)); 127 long ret = sys_setfsgid(low2highgid(gid));
128 /* avoid REGPARM breakage on x86: */
129 prevent_tail_call(ret);
130 return ret;
98} 131}
99 132
100 static int groups16_to_user(old_gid_t __user *grouplist, 133 static int groups16_to_user(old_gid_t __user *grouplist,
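All of the sys_*16() wrappers above get the same treatment: the return value of the underlying syscall is captured, passed through prevent_tail_call(), and only then returned, per the in-line comment about REGPARM breakage on x86. The point is to stop gcc from compiling the wrapper's call as a tail call; for asmlinkage functions the incoming argument slots are the saved user registers on the kernel stack, and a tail call lets the compiler reuse those slots, which can clobber values restored to user space on syscall exit. The sketch below shows the shape of the pattern in plain userspace C; the macro here is an illustrative stand-in built from an empty asm, not the kernel's actual prevent_tail_call() definition.

#include <stdio.h>

/* Illustrative stand-in: routing `ret` through an asm the optimizer
 * cannot see into forces code to run after the call returns, so the
 * call cannot be emitted as a tail call (a bare jump). */
#define prevent_tail_call(ret) __asm__ __volatile__("" : "=r" (ret) : "0" (ret))

static long do_lowlevel(long x)      /* stands in for the real sys_*() call */
{
	return x + 1;
}

long wrapper16(long x)               /* shaped like the sys_*16() wrappers */
{
	long ret = do_lowlevel(x);
	/* avoid turning the call above into a tail call: */
	prevent_tail_call(ret);
	return ret;
}

int main(void)
{
	printf("%ld\n", wrapper16(41));
	return 0;
}
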
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e9e464a903..880fb415a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -547,7 +547,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
547} 547}
548 548
549/* We're holding the cpucontrol mutex here */ 549/* We're holding the cpucontrol mutex here */
550 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 550 static int workqueue_cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 551 unsigned long action,
552 void *hcpu) 552 void *hcpu)
553{ 553{
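This workqueue hunk and the timer hunks earlier make the same change: the __devinit/__devinitdata annotations come off the CPU-hotplug notifier callback and its notifier_block, since those have to stay valid long after boot and must not live in sections that some configurations discard once initialization is done. For readers following along without the kernel headers, the sketch below is a simplified userspace analogue of the callback-plus-notifier_block shape being edited here; it is an illustration of the shape only, not the kernel's notifier API.

#include <stdio.h>

#define NOTIFY_OK 0x0001

/* Minimal stand-in for the kernel's struct notifier_block: a callback
 * plus a link so several blocks can be chained together. */
struct notifier_block {
	int (*notifier_call)(struct notifier_block *self,
			     unsigned long action, void *data);
	struct notifier_block *next;
};

/* Shaped like timer_cpu_notify()/workqueue_cpu_callback() above. */
static int demo_cpu_notify(struct notifier_block *self,
			   unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	printf("notifier: action %lu for cpu %ld\n", action, cpu);
	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_cpu_notify,
};

/* Walk a chain of blocks, handing each one the action and data. */
static void call_chain(struct notifier_block *nb,
		       unsigned long action, void *data)
{
	for (; nb; nb = nb->next)
		nb->notifier_call(nb, action, data);
}

int main(void)
{
	call_chain(&demo_nb, 1UL /* e.g. a "CPU coming up" action */, (void *)2L);
	return 0;
}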