Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/acct.c | 12
-rw-r--r--  kernel/audit.c | 333
-rw-r--r--  kernel/audit.h | 92
-rw-r--r--  kernel/auditfilter.c | 857
-rw-r--r--  kernel/auditsc.c | 917
-rw-r--r--  kernel/capability.c | 16
-rw-r--r--  kernel/compat.c | 82
-rw-r--r--  kernel/cpu.c | 32
-rw-r--r--  kernel/cpuset.c | 485
-rw-r--r--  kernel/exec_domain.c | 1
-rw-r--r--  kernel/exit.c | 165
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 183
-rw-r--r--  kernel/futex.c | 174
-rw-r--r--  kernel/futex_compat.c | 144
-rw-r--r--  kernel/hrtimer.c | 253
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/manage.c | 26
-rw-r--r--  kernel/irq/migration.c | 62
-rw-r--r--  kernel/itimer.c | 117
-rw-r--r--  kernel/kmod.c | 2
-rw-r--r--  kernel/kprobes.c | 27
-rw-r--r--  kernel/ksysfs.c | 7
-rw-r--r--  kernel/kthread.c | 9
-rw-r--r--  kernel/module.c | 473
-rw-r--r--  kernel/panic.c | 103
-rw-r--r--  kernel/params.c | 24
-rw-r--r--  kernel/pid.c | 250
-rw-r--r--  kernel/posix-timers.c | 69
-rw-r--r--  kernel/power/Kconfig | 2
-rw-r--r--  kernel/power/Makefile | 2
-rw-r--r--  kernel/power/disk.c | 20
-rw-r--r--  kernel/power/main.c | 4
-rw-r--r--  kernel/power/pm.c | 37
-rw-r--r--  kernel/power/power.h | 75
-rw-r--r--  kernel/power/process.c | 64
-rw-r--r--  kernel/power/smp.c | 4
-rw-r--r--  kernel/power/snapshot.c | 344
-rw-r--r--  kernel/power/swap.c | 545
-rw-r--r--  kernel/power/swsusp.c | 889
-rw-r--r--  kernel/power/user.c | 333
-rw-r--r--  kernel/printk.c | 82
-rw-r--r--  kernel/profile.c | 66
-rw-r--r--  kernel/ptrace.c | 103
-rw-r--r--  kernel/rcupdate.c | 122
-rw-r--r--  kernel/rcutorture.c | 37
-rw-r--r--  kernel/relay.c | 1012
-rw-r--r--  kernel/sched.c | 428
-rw-r--r--  kernel/signal.c | 374
-rw-r--r--  kernel/softirq.c | 24
-rw-r--r--  kernel/softlockup.c | 61
-rw-r--r--  kernel/spinlock.c | 9
-rw-r--r--  kernel/sys.c | 533
-rw-r--r--  kernel/sys_ni.c | 18
-rw-r--r--  kernel/sysctl.c | 54
-rw-r--r--  kernel/time.c | 71
-rw-r--r--  kernel/timer.c | 261
-rw-r--r--  kernel/uid16.c | 59
-rw-r--r--  kernel/user.c | 10
-rw-r--r--  kernel/workqueue.c | 31
61 files changed, 7186 insertions, 3413 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde81..58908f9d15 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
12 12
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
14obj-$(CONFIG_FUTEX) += futex.o 14obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o
17endif
15obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
16obj-$(CONFIG_SMP) += cpu.o spinlock.o 19obj-$(CONFIG_SMP) += cpu.o spinlock.o
17obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
@@ -26,7 +29,7 @@ obj-$(CONFIG_COMPAT) += compat.o
26obj-$(CONFIG_CPUSETS) += cpuset.o 29obj-$(CONFIG_CPUSETS) += cpuset.o
27obj-$(CONFIG_IKCONFIG) += configs.o 30obj-$(CONFIG_IKCONFIG) += configs.o
28obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 31obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
29obj-$(CONFIG_AUDIT) += audit.o 32obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
30obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 33obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
31obj-$(CONFIG_KPROBES) += kprobes.o 34obj-$(CONFIG_KPROBES) += kprobes.o
32obj-$(CONFIG_SYSFS) += ksysfs.o 35obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -34,6 +37,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
34obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 37obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
35obj-$(CONFIG_SECCOMP) += seccomp.o 38obj-$(CONFIG_SECCOMP) += seccomp.o
36obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 39obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
40obj-$(CONFIG_RELAY) += relay.o
37 41
38ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 42ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
39# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 43# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51..b327f4d201 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
449 /* calculate run_time in nsec*/ 449 /* calculate run_time in nsec*/
450 do_posix_clock_monotonic_gettime(&uptime); 450 do_posix_clock_monotonic_gettime(&uptime);
451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; 451 run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
452 run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC 452 run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
453 + current->start_time.tv_nsec; 453 + current->group_leader->start_time.tv_nsec;
454 /* convert nsec -> AHZ */ 454 /* convert nsec -> AHZ */
455 elapsed = nsec_to_AHZ(run_time); 455 elapsed = nsec_to_AHZ(run_time);
456#if ACCT_VERSION==3 456#if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 469#endif
470 do_div(elapsed, AHZ); 470 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 471 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, 472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime)); 473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); 474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, 475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime)); 476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); 477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 478 /* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt + 524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->group_leader->min_flt); 525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt + 526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->group_leader->maj_flt); 527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 528 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode; 529 ac.ac_exitcode = exitcode;
530 530
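
The acct.c hunks above change which task the accounting totals are read from: the elapsed-time calculation now uses the thread group leader's start_time (the start of the whole process, not of the reporting thread), while the CPU-time and page-fault totals add the reporting thread's own counters to those in the shared signal struct, which already accumulates the counters of threads that have exited. A minimal sketch of that accounting model, with hypothetical types standing in for task_struct and signal_struct (illustrative only, not kernel code):

/* Hypothetical stand-ins for the task_struct/signal_struct fields involved. */
struct thread_counters {	/* per-thread; not yet folded into the group */
	unsigned long utime, stime, min_flt, maj_flt;
};
struct group_counters {		/* shared; already includes every exited thread */
	unsigned long utime, stime, min_flt, maj_flt;
};

/* Process-wide user time as the accounting thread sees it: its own counters
 * plus whatever the already-exited threads have contributed. */
static unsigned long acct_total_utime(const struct thread_counters *self,
				      const struct group_counters *sig)
{
	return self->utime + sig->utime;
}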
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a813d2883..df57b493e1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,8 +52,12 @@
52#include <linux/audit.h> 52#include <linux/audit.h>
53 53
54#include <net/sock.h> 54#include <net/sock.h>
55#include <net/netlink.h>
55#include <linux/skbuff.h> 56#include <linux/skbuff.h>
56#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h>
59
60#include "audit.h"
57 61
58/* No auditing will take place until audit_initialized != 0. 62/* No auditing will take place until audit_initialized != 0.
59 * (Initialization happens after skb_init is called.) */ 63 * (Initialization happens after skb_init is called.) */
@@ -72,7 +76,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
72 * contains the (non-zero) pid. */ 76 * contains the (non-zero) pid. */
73int audit_pid; 77int audit_pid;
74 78
75/* If audit_limit is non-zero, limit the rate of sending audit records 79/* If audit_rate_limit is non-zero, limit the rate of sending audit records
76 * to that number per second. This prevents DoS attacks, but results in 80 * to that number per second. This prevents DoS attacks, but results in
77 * audit records being dropped. */ 81 * audit records being dropped. */
78static int audit_rate_limit; 82static int audit_rate_limit;
@@ -102,7 +106,7 @@ static struct sock *audit_sock;
102 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of 106 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
103 * being placed on the freelist). */ 107 * being placed on the freelist). */
104static DEFINE_SPINLOCK(audit_freelist_lock); 108static DEFINE_SPINLOCK(audit_freelist_lock);
105static int audit_freelist_count = 0; 109static int audit_freelist_count;
106static LIST_HEAD(audit_freelist); 110static LIST_HEAD(audit_freelist);
107 111
108static struct sk_buff_head audit_skb_queue; 112static struct sk_buff_head audit_skb_queue;
@@ -113,7 +117,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
113/* The netlink socket is only to be read by 1 CPU, which lets us assume 117/* The netlink socket is only to be read by 1 CPU, which lets us assume
114 * that list additions and deletions never happen simultaneously in 118 * that list additions and deletions never happen simultaneously in
115 * auditsc.c */ 119 * auditsc.c */
116DECLARE_MUTEX(audit_netlink_sem); 120DEFINE_MUTEX(audit_netlink_mutex);
117 121
118/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 122/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
119 * audit records. Since printk uses a 1024 byte buffer, this buffer 123 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -142,7 +146,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
142 nlh->nlmsg_pid = pid; 146 nlh->nlmsg_pid = pid;
143} 147}
144 148
145static void audit_panic(const char *message) 149void audit_panic(const char *message)
146{ 150{
147 switch (audit_failure) 151 switch (audit_failure)
148 { 152 {
@@ -186,8 +190,14 @@ static inline int audit_rate_check(void)
186 return retval; 190 return retval;
187} 191}
188 192
189/* Emit at least 1 message per second, even if audit_rate_check is 193/**
190 * throttling. */ 194 * audit_log_lost - conditionally log lost audit message event
195 * @message: the message stating reason for lost audit message
196 *
197 * Emit at least 1 message per second, even if audit_rate_check is
198 * throttling.
199 * Always increment the lost messages counter.
200*/
191void audit_log_lost(const char *message) 201void audit_log_lost(const char *message)
192{ 202{
193 static unsigned long last_msg = 0; 203 static unsigned long last_msg = 0;
@@ -218,52 +228,105 @@ void audit_log_lost(const char *message)
218 audit_backlog_limit); 228 audit_backlog_limit);
219 audit_panic(message); 229 audit_panic(message);
220 } 230 }
221
222} 231}
223 232
224static int audit_set_rate_limit(int limit, uid_t loginuid) 233static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
225{ 234{
226 int old = audit_rate_limit; 235 int old = audit_rate_limit;
227 audit_rate_limit = limit; 236
228 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 237 if (sid) {
238 char *ctx = NULL;
239 u32 len;
240 int rc;
241 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
242 return rc;
243 else
244 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
245 "audit_rate_limit=%d old=%d by auid=%u subj=%s",
246 limit, old, loginuid, ctx);
247 kfree(ctx);
248 } else
249 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
229 "audit_rate_limit=%d old=%d by auid=%u", 250 "audit_rate_limit=%d old=%d by auid=%u",
230 audit_rate_limit, old, loginuid); 251 limit, old, loginuid);
252 audit_rate_limit = limit;
231 return old; 253 return old;
232} 254}
233 255
234static int audit_set_backlog_limit(int limit, uid_t loginuid) 256static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
235{ 257{
236 int old = audit_backlog_limit; 258 int old = audit_backlog_limit;
237 audit_backlog_limit = limit; 259
238 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 260 if (sid) {
261 char *ctx = NULL;
262 u32 len;
263 int rc;
264 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
265 return rc;
266 else
267 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
268 "audit_backlog_limit=%d old=%d by auid=%u subj=%s",
269 limit, old, loginuid, ctx);
270 kfree(ctx);
271 } else
272 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
239 "audit_backlog_limit=%d old=%d by auid=%u", 273 "audit_backlog_limit=%d old=%d by auid=%u",
240 audit_backlog_limit, old, loginuid); 274 limit, old, loginuid);
275 audit_backlog_limit = limit;
241 return old; 276 return old;
242} 277}
243 278
244static int audit_set_enabled(int state, uid_t loginuid) 279static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
245{ 280{
246 int old = audit_enabled; 281 int old = audit_enabled;
282
247 if (state != 0 && state != 1) 283 if (state != 0 && state != 1)
248 return -EINVAL; 284 return -EINVAL;
249 audit_enabled = state; 285
250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 286 if (sid) {
287 char *ctx = NULL;
288 u32 len;
289 int rc;
290 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
291 return rc;
292 else
293 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
294 "audit_enabled=%d old=%d by auid=%u subj=%s",
295 state, old, loginuid, ctx);
296 kfree(ctx);
297 } else
298 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
251 "audit_enabled=%d old=%d by auid=%u", 299 "audit_enabled=%d old=%d by auid=%u",
252 audit_enabled, old, loginuid); 300 state, old, loginuid);
301 audit_enabled = state;
253 return old; 302 return old;
254} 303}
255 304
256static int audit_set_failure(int state, uid_t loginuid) 305static int audit_set_failure(int state, uid_t loginuid, u32 sid)
257{ 306{
258 int old = audit_failure; 307 int old = audit_failure;
308
259 if (state != AUDIT_FAIL_SILENT 309 if (state != AUDIT_FAIL_SILENT
260 && state != AUDIT_FAIL_PRINTK 310 && state != AUDIT_FAIL_PRINTK
261 && state != AUDIT_FAIL_PANIC) 311 && state != AUDIT_FAIL_PANIC)
262 return -EINVAL; 312 return -EINVAL;
263 audit_failure = state; 313
264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 314 if (sid) {
315 char *ctx = NULL;
316 u32 len;
317 int rc;
318 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
319 return rc;
320 else
321 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
322 "audit_failure=%d old=%d by auid=%u subj=%s",
323 state, old, loginuid, ctx);
324 kfree(ctx);
325 } else
326 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
265 "audit_failure=%d old=%d by auid=%u", 327 "audit_failure=%d old=%d by auid=%u",
266 audit_failure, old, loginuid); 328 state, old, loginuid);
329 audit_failure = state;
267 return old; 330 return old;
268} 331}
269 332
@@ -300,8 +363,22 @@ static int kauditd_thread(void *dummy)
300 remove_wait_queue(&kauditd_wait, &wait); 363 remove_wait_queue(&kauditd_wait, &wait);
301 } 364 }
302 } 365 }
366 return 0;
303} 367}
304 368
369/**
370 * audit_send_reply - send an audit reply message via netlink
371 * @pid: process id to send reply to
372 * @seq: sequence number
373 * @type: audit message type
374 * @done: done (last) flag
375 * @multi: multi-part message flag
376 * @payload: payload data
377 * @size: payload size
378 *
379 * Allocates an skb, builds the netlink message, and sends it to the pid.
380 * No failure notifications.
381 */
305void audit_send_reply(int pid, int seq, int type, int done, int multi, 382void audit_send_reply(int pid, int seq, int type, int done, int multi,
306 void *payload, int size) 383 void *payload, int size)
307{ 384{
@@ -342,15 +419,19 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
342 switch (msg_type) { 419 switch (msg_type) {
343 case AUDIT_GET: 420 case AUDIT_GET:
344 case AUDIT_LIST: 421 case AUDIT_LIST:
422 case AUDIT_LIST_RULES:
345 case AUDIT_SET: 423 case AUDIT_SET:
346 case AUDIT_ADD: 424 case AUDIT_ADD:
425 case AUDIT_ADD_RULE:
347 case AUDIT_DEL: 426 case AUDIT_DEL:
427 case AUDIT_DEL_RULE:
348 case AUDIT_SIGNAL_INFO: 428 case AUDIT_SIGNAL_INFO:
349 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 429 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
350 err = -EPERM; 430 err = -EPERM;
351 break; 431 break;
352 case AUDIT_USER: 432 case AUDIT_USER:
353 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 433 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
434 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
354 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 435 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
355 err = -EPERM; 436 err = -EPERM;
356 break; 437 break;
@@ -363,7 +444,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
363 444
364static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 445static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
365{ 446{
366 u32 uid, pid, seq; 447 u32 uid, pid, seq, sid;
367 void *data; 448 void *data;
368 struct audit_status *status_get, status_set; 449 struct audit_status *status_get, status_set;
369 int err; 450 int err;
@@ -376,7 +457,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
376 if (err) 457 if (err)
377 return err; 458 return err;
378 459
379 /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ 460 /* As soon as there's any sign of userspace auditd,
461 * start kauditd to talk to it */
380 if (!kauditd_task) 462 if (!kauditd_task)
381 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); 463 kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
382 if (IS_ERR(kauditd_task)) { 464 if (IS_ERR(kauditd_task)) {
@@ -388,6 +470,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
388 pid = NETLINK_CREDS(skb)->pid; 470 pid = NETLINK_CREDS(skb)->pid;
389 uid = NETLINK_CREDS(skb)->uid; 471 uid = NETLINK_CREDS(skb)->uid;
390 loginuid = NETLINK_CB(skb).loginuid; 472 loginuid = NETLINK_CB(skb).loginuid;
473 sid = NETLINK_CB(skb).sid;
391 seq = nlh->nlmsg_seq; 474 seq = nlh->nlmsg_seq;
392 data = NLMSG_DATA(nlh); 475 data = NLMSG_DATA(nlh);
393 476
@@ -408,28 +491,47 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
408 return -EINVAL; 491 return -EINVAL;
409 status_get = (struct audit_status *)data; 492 status_get = (struct audit_status *)data;
410 if (status_get->mask & AUDIT_STATUS_ENABLED) { 493 if (status_get->mask & AUDIT_STATUS_ENABLED) {
411 err = audit_set_enabled(status_get->enabled, loginuid); 494 err = audit_set_enabled(status_get->enabled,
495 loginuid, sid);
412 if (err < 0) return err; 496 if (err < 0) return err;
413 } 497 }
414 if (status_get->mask & AUDIT_STATUS_FAILURE) { 498 if (status_get->mask & AUDIT_STATUS_FAILURE) {
415 err = audit_set_failure(status_get->failure, loginuid); 499 err = audit_set_failure(status_get->failure,
500 loginuid, sid);
416 if (err < 0) return err; 501 if (err < 0) return err;
417 } 502 }
418 if (status_get->mask & AUDIT_STATUS_PID) { 503 if (status_get->mask & AUDIT_STATUS_PID) {
419 int old = audit_pid; 504 int old = audit_pid;
505 if (sid) {
506 char *ctx = NULL;
507 u32 len;
508 int rc;
509 if ((rc = selinux_ctxid_to_string(
510 sid, &ctx, &len)))
511 return rc;
512 else
513 audit_log(NULL, GFP_KERNEL,
514 AUDIT_CONFIG_CHANGE,
515 "audit_pid=%d old=%d by auid=%u subj=%s",
516 status_get->pid, old,
517 loginuid, ctx);
518 kfree(ctx);
519 } else
520 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
521 "audit_pid=%d old=%d by auid=%u",
522 status_get->pid, old, loginuid);
420 audit_pid = status_get->pid; 523 audit_pid = status_get->pid;
421 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
422 "audit_pid=%d old=%d by auid=%u",
423 audit_pid, old, loginuid);
424 } 524 }
425 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 525 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
426 audit_set_rate_limit(status_get->rate_limit, loginuid); 526 audit_set_rate_limit(status_get->rate_limit,
527 loginuid, sid);
427 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 528 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
428 audit_set_backlog_limit(status_get->backlog_limit, 529 audit_set_backlog_limit(status_get->backlog_limit,
429 loginuid); 530 loginuid, sid);
430 break; 531 break;
431 case AUDIT_USER: 532 case AUDIT_USER:
432 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 533 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
534 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
433 if (!audit_enabled && msg_type != AUDIT_USER_AVC) 535 if (!audit_enabled && msg_type != AUDIT_USER_AVC)
434 return 0; 536 return 0;
435 537
@@ -439,8 +541,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
439 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 541 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
440 if (ab) { 542 if (ab) {
441 audit_log_format(ab, 543 audit_log_format(ab,
442 "user pid=%d uid=%u auid=%u msg='%.1024s'", 544 "user pid=%d uid=%u auid=%u",
443 pid, uid, loginuid, (char *)data); 545 pid, uid, loginuid);
546 if (sid) {
547 char *ctx = NULL;
548 u32 len;
549 if (selinux_ctxid_to_string(
550 sid, &ctx, &len)) {
551 audit_log_format(ab,
552 " ssid=%u", sid);
553 /* Maybe call audit_panic? */
554 } else
555 audit_log_format(ab,
556 " subj=%s", ctx);
557 kfree(ctx);
558 }
559 audit_log_format(ab, " msg='%.1024s'",
560 (char *)data);
444 audit_set_pid(ab, pid); 561 audit_set_pid(ab, pid);
445 audit_log_end(ab); 562 audit_log_end(ab);
446 } 563 }
@@ -448,12 +565,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
448 break; 565 break;
449 case AUDIT_ADD: 566 case AUDIT_ADD:
450 case AUDIT_DEL: 567 case AUDIT_DEL:
451 if (nlh->nlmsg_len < sizeof(struct audit_rule)) 568 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
452 return -EINVAL; 569 return -EINVAL;
453 /* fallthrough */ 570 /* fallthrough */
454 case AUDIT_LIST: 571 case AUDIT_LIST:
455 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 572 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
456 uid, seq, data, loginuid); 573 uid, seq, data, nlmsg_len(nlh),
574 loginuid, sid);
575 break;
576 case AUDIT_ADD_RULE:
577 case AUDIT_DEL_RULE:
578 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
579 return -EINVAL;
580 /* fallthrough */
581 case AUDIT_LIST_RULES:
582 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
583 uid, seq, data, nlmsg_len(nlh),
584 loginuid, sid);
457 break; 585 break;
458 case AUDIT_SIGNAL_INFO: 586 case AUDIT_SIGNAL_INFO:
459 sig_data.uid = audit_sig_uid; 587 sig_data.uid = audit_sig_uid;
@@ -469,9 +597,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
469 return err < 0 ? err : 0; 597 return err < 0 ? err : 0;
470} 598}
471 599
472/* Get message from skb (based on rtnetlink_rcv_skb). Each message is 600/*
601 * Get message from skb (based on rtnetlink_rcv_skb). Each message is
473 * processed by audit_receive_msg. Malformed skbs with wrong length are 602 * processed by audit_receive_msg. Malformed skbs with wrong length are
474 * discarded silently. */ 603 * discarded silently.
604 */
475static void audit_receive_skb(struct sk_buff *skb) 605static void audit_receive_skb(struct sk_buff *skb)
476{ 606{
477 int err; 607 int err;
@@ -499,14 +629,14 @@ static void audit_receive(struct sock *sk, int length)
499 struct sk_buff *skb; 629 struct sk_buff *skb;
500 unsigned int qlen; 630 unsigned int qlen;
501 631
502 down(&audit_netlink_sem); 632 mutex_lock(&audit_netlink_mutex);
503 633
504 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { 634 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
505 skb = skb_dequeue(&sk->sk_receive_queue); 635 skb = skb_dequeue(&sk->sk_receive_queue);
506 audit_receive_skb(skb); 636 audit_receive_skb(skb);
507 kfree_skb(skb); 637 kfree_skb(skb);
508 } 638 }
509 up(&audit_netlink_sem); 639 mutex_unlock(&audit_netlink_mutex);
510} 640}
511 641
512 642
@@ -519,11 +649,17 @@ static int __init audit_init(void)
519 THIS_MODULE); 649 THIS_MODULE);
520 if (!audit_sock) 650 if (!audit_sock)
521 audit_panic("cannot initialize netlink socket"); 651 audit_panic("cannot initialize netlink socket");
652 else
653 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
522 654
523 audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
524 skb_queue_head_init(&audit_skb_queue); 655 skb_queue_head_init(&audit_skb_queue);
525 audit_initialized = 1; 656 audit_initialized = 1;
526 audit_enabled = audit_default; 657 audit_enabled = audit_default;
658
659 /* Register the callback with selinux. This callback will be invoked
660 * when a new policy is loaded. */
661 selinux_audit_set_callback(&selinux_audit_rule_update);
662
527 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 663 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
528 return 0; 664 return 0;
529} 665}
@@ -538,7 +674,7 @@ static int __init audit_enable(char *str)
538 audit_initialized ? "" : " (after initialization)"); 674 audit_initialized ? "" : " (after initialization)");
539 if (audit_initialized) 675 if (audit_initialized)
540 audit_enabled = audit_default; 676 audit_enabled = audit_default;
541 return 0; 677 return 1;
542} 678}
543 679
544__setup("audit=", audit_enable); 680__setup("audit=", audit_enable);
@@ -600,7 +736,10 @@ err:
600 return NULL; 736 return NULL;
601} 737}
602 738
603/* Compute a serial number for the audit record. Audit records are 739/**
740 * audit_serial - compute a serial number for the audit record
741 *
742 * Compute a serial number for the audit record. Audit records are
604 * written to user-space as soon as they are generated, so a complete 743 * written to user-space as soon as they are generated, so a complete
605 * audit record may be written in several pieces. The timestamp of the 744 * audit record may be written in several pieces. The timestamp of the
606 * record and this serial number are used by the user-space tools to 745 * record and this serial number are used by the user-space tools to
@@ -612,8 +751,8 @@ err:
612 * audit context (for those records that have a context), and emit them 751 * audit context (for those records that have a context), and emit them
613 * all at syscall exit. However, this could delay the reporting of 752 * all at syscall exit. However, this could delay the reporting of
614 * significant errors until syscall exit (or never, if the system 753 * significant errors until syscall exit (or never, if the system
615 * halts). */ 754 * halts).
616 755 */
617unsigned int audit_serial(void) 756unsigned int audit_serial(void)
618{ 757{
619 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 758 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
@@ -649,6 +788,21 @@ static inline void audit_get_stamp(struct audit_context *ctx,
649 * will be written at syscall exit. If there is no associated task, tsk 788 * will be written at syscall exit. If there is no associated task, tsk
650 * should be NULL. */ 789 * should be NULL. */
651 790
791/**
792 * audit_log_start - obtain an audit buffer
793 * @ctx: audit_context (may be NULL)
794 * @gfp_mask: type of allocation
795 * @type: audit message type
796 *
797 * Returns audit_buffer pointer on success or NULL on error.
798 *
799 * Obtain an audit buffer. This routine does locking to obtain the
800 * audit buffer, but then no locking is required for calls to
801 * audit_log_*format. If the task (ctx) is a task that is currently in a
802 * syscall, then the syscall is marked as auditable and an audit record
803 * will be written at syscall exit. If there is no associated task, then
804 * task context (ctx) should be NULL.
805 */
652struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, 806struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
653 int type) 807 int type)
654{ 808{
@@ -661,6 +815,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
661 if (!audit_initialized) 815 if (!audit_initialized)
662 return NULL; 816 return NULL;
663 817
818 if (unlikely(audit_filter_type(type)))
819 return NULL;
820
664 if (gfp_mask & __GFP_WAIT) 821 if (gfp_mask & __GFP_WAIT)
665 reserve = 0; 822 reserve = 0;
666 else 823 else
@@ -713,6 +870,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
713/** 870/**
714 * audit_expand - expand skb in the audit buffer 871 * audit_expand - expand skb in the audit buffer
715 * @ab: audit_buffer 872 * @ab: audit_buffer
873 * @extra: space to add at tail of the skb
716 * 874 *
717 * Returns 0 (no space) on failed expansion, or available space if 875 * Returns 0 (no space) on failed expansion, or available space if
718 * successful. 876 * successful.
@@ -729,10 +887,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
729 return skb_tailroom(skb); 887 return skb_tailroom(skb);
730} 888}
731 889
732/* Format an audit message into the audit buffer. If there isn't enough 890/*
891 * Format an audit message into the audit buffer. If there isn't enough
733 * room in the audit buffer, more room will be allocated and vsnprint 892 * room in the audit buffer, more room will be allocated and vsnprint
734 * will be called a second time. Currently, we assume that a printk 893 * will be called a second time. Currently, we assume that a printk
735 * can't format message larger than 1024 bytes, so we don't either. */ 894 * can't format message larger than 1024 bytes, so we don't either.
895 */
736static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, 896static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
737 va_list args) 897 va_list args)
738{ 898{
@@ -757,7 +917,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
757 /* The printk buffer is 1024 bytes long, so if we get 917 /* The printk buffer is 1024 bytes long, so if we get
758 * here and AUDIT_BUFSIZ is at least 1024, then we can 918 * here and AUDIT_BUFSIZ is at least 1024, then we can
759 * log everything that printk could have logged. */ 919 * log everything that printk could have logged. */
760 avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 920 avail = audit_expand(ab,
921 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
761 if (!avail) 922 if (!avail)
762 goto out; 923 goto out;
763 len = vsnprintf(skb->tail, avail, fmt, args2); 924 len = vsnprintf(skb->tail, avail, fmt, args2);
@@ -768,8 +929,14 @@ out:
768 return; 929 return;
769} 930}
770 931
771/* Format a message into the audit buffer. All the work is done in 932/**
772 * audit_log_vformat. */ 933 * audit_log_format - format a message into the audit buffer.
934 * @ab: audit_buffer
935 * @fmt: format string
936 * @...: optional parameters matching @fmt string
937 *
938 * All the work is done in audit_log_vformat.
939 */
773void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 940void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
774{ 941{
775 va_list args; 942 va_list args;
@@ -781,9 +948,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
781 va_end(args); 948 va_end(args);
782} 949}
783 950
784/* This function will take the passed buf and convert it into a string of 951/**
785 * ascii hex digits. The new string is placed onto the skb. */ 952 * audit_log_hex - convert a buffer to hex and append it to the audit skb
786void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 953 * @ab: the audit_buffer
954 * @buf: buffer to convert to hex
955 * @len: length of @buf to be converted
956 *
957 * No return value; failure to expand is silently ignored.
958 *
959 * This function will take the passed buf and convert it into a string of
960 * ascii hex digits. The new string is placed onto the skb.
961 */
962void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
787 size_t len) 963 size_t len)
788{ 964{
789 int i, avail, new_len; 965 int i, avail, new_len;
@@ -812,10 +988,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
812 skb_put(skb, len << 1); /* new string is twice the old string */ 988 skb_put(skb, len << 1); /* new string is twice the old string */
813} 989}
814 990
815/* This code will escape a string that is passed to it if the string 991/**
816 * contains a control character, unprintable character, double quote mark, 992 * audit_log_unstrustedstring - log a string that may contain random characters
993 * @ab: audit_buffer
994 * @string: string to be logged
995 *
996 * This code will escape a string that is passed to it if the string
997 * contains a control character, unprintable character, double quote mark,
817 * or a space. Unescaped strings will start and end with a double quote mark. 998 * or a space. Unescaped strings will start and end with a double quote mark.
818 * Strings that are escaped are printed in hex (2 digits per char). */ 999 * Strings that are escaped are printed in hex (2 digits per char).
1000 */
819void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1001void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
820{ 1002{
821 const unsigned char *p = string; 1003 const unsigned char *p = string;
@@ -854,10 +1036,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
854 kfree(path); 1036 kfree(path);
855} 1037}
856 1038
857/* The netlink_* functions cannot be called inside an irq context, so 1039/**
858 * the audit buffer is places on a queue and a tasklet is scheduled to 1040 * audit_log_end - end one audit record
1041 * @ab: the audit_buffer
1042 *
1043 * The netlink_* functions cannot be called inside an irq context, so
1044 * the audit buffer is placed on a queue and a tasklet is scheduled to
859 * remove them from the queue outside the irq context. May be called in 1045 * remove them from the queue outside the irq context. May be called in
860 * any context. */ 1046 * any context.
1047 */
861void audit_log_end(struct audit_buffer *ab) 1048void audit_log_end(struct audit_buffer *ab)
862{ 1049{
863 if (!ab) 1050 if (!ab)
@@ -878,9 +1065,18 @@ void audit_log_end(struct audit_buffer *ab)
878 audit_buffer_free(ab); 1065 audit_buffer_free(ab);
879} 1066}
880 1067
881/* Log an audit record. This is a convenience function that calls 1068/**
882 * audit_log_start, audit_log_vformat, and audit_log_end. It may be 1069 * audit_log - Log an audit record
883 * called in any context. */ 1070 * @ctx: audit context
1071 * @gfp_mask: type of allocation
1072 * @type: audit message type
1073 * @fmt: format string to use
1074 * @...: variable parameters matching the format string
1075 *
1076 * This is a convenience function that calls audit_log_start,
1077 * audit_log_vformat, and audit_log_end. It may be called
1078 * in any context.
1079 */
884void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 1080void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
885 const char *fmt, ...) 1081 const char *fmt, ...)
886{ 1082{
@@ -895,3 +1091,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
895 audit_log_end(ab); 1091 audit_log_end(ab);
896 } 1092 }
897} 1093}
1094
1095EXPORT_SYMBOL(audit_log_start);
1096EXPORT_SYMBOL(audit_log_end);
1097EXPORT_SYMBOL(audit_log_format);
1098EXPORT_SYMBOL(audit_log);
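
Every configuration-change path touched in audit.c above repeats the same pattern: if the netlink sender carried an SELinux sid, resolve it to a context string with selinux_ctxid_to_string() and log it as subj= (refusing the change when the lookup fails), otherwise fall back to the plain auid-only record; the new value is committed only after the record has been emitted. A condensed sketch of that pattern as a helper; audit_log_config_change() is a hypothetical name for illustration, not something the patch adds:

/* Sketch of the pattern used by audit_set_enabled(), audit_set_failure(),
 * audit_set_rate_limit(), audit_set_backlog_limit() and the AUDIT_STATUS_PID
 * branch.  Kernel context assumed (audit_log, selinux_ctxid_to_string, kfree). */
static int audit_log_config_change(const char *name, int newval, int oldval,
				   uid_t loginuid, u32 sid)
{
	if (sid) {
		char *ctx = NULL;
		u32 len;
		int rc = selinux_ctxid_to_string(sid, &ctx, &len);
		if (rc)
			return rc;	/* caller refuses the change */
		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
			  "%s=%d old=%d by auid=%u subj=%s",
			  name, newval, oldval, loginuid, ctx);
		kfree(ctx);
	} else
		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
			  "%s=%d old=%d by auid=%u",
			  name, newval, oldval, loginuid);
	return 0;
}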
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 0000000000..6f733920fd
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,92 @@
1/* audit -- definition of audit_context structure and supporting types
2 *
3 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/mutex.h>
23#include <linux/fs.h>
24#include <linux/audit.h>
25
26/* 0 = no checking
27 1 = put_count checking
28 2 = verbose put_count checking
29*/
30#define AUDIT_DEBUG 0
31
32/* At task start time, the audit_state is set in the audit_context using
33 a per-task filter. At syscall entry, the audit_state is augmented by
34 the syscall filter. */
35enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can
38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall
45 * entry time. This makes a full
46 * syscall record available if some
47 * other part of the kernel decides it
48 * should be recorded. */
49 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
50 * always fill it in at syscall entry
51 * time, and always write out the audit
52 * record at syscall exit time. */
53};
54
55/* Rule lists */
56struct audit_field {
57 u32 type;
58 u32 val;
59 u32 op;
60 char *se_str;
61 struct selinux_audit_rule *se_rule;
62};
63
64struct audit_krule {
65 int vers_ops;
66 u32 flags;
67 u32 listnr;
68 u32 action;
69 u32 mask[AUDIT_BITMASK_SIZE];
70 u32 buflen; /* for data alloc on list rules */
71 u32 field_count;
72 struct audit_field *fields;
73};
74
75struct audit_entry {
76 struct list_head list;
77 struct rcu_head rcu;
78 struct audit_krule rule;
79};
80
81
82extern int audit_pid;
83extern int audit_comparator(const u32 left, const u32 op, const u32 right);
84
85extern void audit_send_reply(int pid, int seq, int type,
86 int done, int multi,
87 void *payload, int size);
88extern void audit_log_lost(const char *message);
89extern void audit_panic(const char *message);
90extern struct mutex audit_netlink_mutex;
91
92extern int selinux_audit_rule_update(void);
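
The new audit.h above introduces the in-kernel rule representation: an audit_entry (list node plus RCU head) wraps an audit_krule, which carries the filter list number, action, syscall mask and an array of audit_field entries holding a type, a comparison operator and a value, with an optional SELinux string and opaque rule attached. A minimal illustration of how such a rule is evaluated with audit_comparator(), in the style of the matching loops in auditfilter.c and auditsc.c (field types reduced to PID and UID for brevity; illustrative only, not code from the patch):

/* Returns 1 only if every field in the rule is satisfied by the given values. */
static int rule_matches(const struct audit_krule *rule, u32 pid, u32 uid)
{
	int i;

	for (i = 0; i < rule->field_count; i++) {
		const struct audit_field *f = &rule->fields[i];
		int result = 0;

		switch (f->type) {
		case AUDIT_PID:
			result = audit_comparator(pid, f->op, f->val);
			break;
		case AUDIT_UID:
			result = audit_comparator(uid, f->op, f->val);
			break;
		}
		if (!result)		/* any non-matching field rejects the rule */
			return 0;
	}
	return 1;			/* all fields matched; apply rule->action */
}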
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 0000000000..7c134906d6
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,857 @@
1/* auditfilter.c -- filtering of audit events
2 *
3 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/kernel.h>
23#include <linux/audit.h>
24#include <linux/kthread.h>
25#include <linux/netlink.h>
26#include <linux/selinux.h>
27#include "audit.h"
28
29/* There are three lists of rules -- one to search at task creation
30 * time, one to search at syscall entry time, and another to search at
31 * syscall exit time. */
32struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
33 LIST_HEAD_INIT(audit_filter_list[0]),
34 LIST_HEAD_INIT(audit_filter_list[1]),
35 LIST_HEAD_INIT(audit_filter_list[2]),
36 LIST_HEAD_INIT(audit_filter_list[3]),
37 LIST_HEAD_INIT(audit_filter_list[4]),
38 LIST_HEAD_INIT(audit_filter_list[5]),
39#if AUDIT_NR_FILTERS != 6
40#error Fix audit_filter_list initialiser
41#endif
42};
43
44static inline void audit_free_rule(struct audit_entry *e)
45{
46 int i;
47 if (e->rule.fields)
48 for (i = 0; i < e->rule.field_count; i++) {
49 struct audit_field *f = &e->rule.fields[i];
50 kfree(f->se_str);
51 selinux_audit_rule_free(f->se_rule);
52 }
53 kfree(e->rule.fields);
54 kfree(e);
55}
56
57static inline void audit_free_rule_rcu(struct rcu_head *head)
58{
59 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
60 audit_free_rule(e);
61}
62
63/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count)
65{
66 struct audit_entry *entry;
67 struct audit_field *fields;
68
69 entry = kzalloc(sizeof(*entry), GFP_KERNEL);
70 if (unlikely(!entry))
71 return NULL;
72
73 fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
74 if (unlikely(!fields)) {
75 kfree(entry);
76 return NULL;
77 }
78 entry->rule.fields = fields;
79
80 return entry;
81}
82
83/* Unpack a filter field's string representation from user-space
84 * buffer. */
85static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
86{
87 char *str;
88
89 if (!*bufp || (len == 0) || (len > *remain))
90 return ERR_PTR(-EINVAL);
91
92 /* Of the currently implemented string fields, PATH_MAX
93 * defines the longest valid length.
94 */
95 if (len > PATH_MAX)
96 return ERR_PTR(-ENAMETOOLONG);
97
98 str = kmalloc(len + 1, GFP_KERNEL);
99 if (unlikely(!str))
100 return ERR_PTR(-ENOMEM);
101
102 memcpy(str, *bufp, len);
103 str[len] = 0;
104 *bufp += len;
105 *remain -= len;
106
107 return str;
108}
109
110/* Common user-space to kernel rule translation. */
111static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
112{
113 unsigned listnr;
114 struct audit_entry *entry;
115 int i, err;
116
117 err = -EINVAL;
118 listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
119 switch(listnr) {
120 default:
121 goto exit_err;
122 case AUDIT_FILTER_USER:
123 case AUDIT_FILTER_TYPE:
124#ifdef CONFIG_AUDITSYSCALL
125 case AUDIT_FILTER_ENTRY:
126 case AUDIT_FILTER_EXIT:
127 case AUDIT_FILTER_TASK:
128#endif
129 ;
130 }
131 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
132 rule->action != AUDIT_ALWAYS)
133 goto exit_err;
134 if (rule->field_count > AUDIT_MAX_FIELDS)
135 goto exit_err;
136
137 err = -ENOMEM;
138 entry = audit_init_entry(rule->field_count);
139 if (!entry)
140 goto exit_err;
141
142 entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
143 entry->rule.listnr = listnr;
144 entry->rule.action = rule->action;
145 entry->rule.field_count = rule->field_count;
146
147 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
148 entry->rule.mask[i] = rule->mask[i];
149
150 return entry;
151
152exit_err:
153 return ERR_PTR(err);
154}
155
156/* Translate struct audit_rule to kernel's rule representation.
157 * Exists for backward compatibility with userspace. */
158static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
159{
160 struct audit_entry *entry;
161 int err = 0;
162 int i;
163
164 entry = audit_to_entry_common(rule);
165 if (IS_ERR(entry))
166 goto exit_nofree;
167
168 for (i = 0; i < rule->field_count; i++) {
169 struct audit_field *f = &entry->rule.fields[i];
170
171 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i];
174
175 if (f->type & AUDIT_UNUSED_BITS ||
176 f->type == AUDIT_SE_USER ||
177 f->type == AUDIT_SE_ROLE ||
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free;
183 }
184
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
186
187 /* Support for legacy operators where
188 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
189 if (f->op & AUDIT_NEGATE)
190 f->op = AUDIT_NOT_EQUAL;
191 else if (!f->op)
192 f->op = AUDIT_EQUAL;
193 else if (f->op == AUDIT_OPERATORS) {
194 err = -EINVAL;
195 goto exit_free;
196 }
197 }
198
199exit_nofree:
200 return entry;
201
202exit_free:
203 audit_free_rule(entry);
204 return ERR_PTR(err);
205}
206
207/* Translate struct audit_rule_data to kernel's rule representation. */
208static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
209 size_t datasz)
210{
211 int err = 0;
212 struct audit_entry *entry;
213 void *bufp;
214 size_t remain = datasz - sizeof(struct audit_rule_data);
215 int i;
216 char *str;
217
218 entry = audit_to_entry_common((struct audit_rule *)data);
219 if (IS_ERR(entry))
220 goto exit_nofree;
221
222 bufp = data->buf;
223 entry->rule.vers_ops = 2;
224 for (i = 0; i < data->field_count; i++) {
225 struct audit_field *f = &entry->rule.fields[i];
226
227 err = -EINVAL;
228 if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
229 data->fieldflags[i] & ~AUDIT_OPERATORS)
230 goto exit_free;
231
232 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
233 f->type = data->fields[i];
234 f->val = data->values[i];
235 f->se_str = NULL;
236 f->se_rule = NULL;
237 switch(f->type) {
238 case AUDIT_SE_USER:
239 case AUDIT_SE_ROLE:
240 case AUDIT_SE_TYPE:
241 case AUDIT_SE_SEN:
242 case AUDIT_SE_CLR:
243 str = audit_unpack_string(&bufp, &remain, f->val);
244 if (IS_ERR(str))
245 goto exit_free;
246 entry->rule.buflen += f->val;
247
248 err = selinux_audit_rule_init(f->type, f->op, str,
249 &f->se_rule);
250 /* Keep currently invalid fields around in case they
251 * become valid after a policy reload. */
252 if (err == -EINVAL) {
253 printk(KERN_WARNING "audit rule for selinux "
254 "\'%s\' is invalid\n", str);
255 err = 0;
256 }
257 if (err) {
258 kfree(str);
259 goto exit_free;
260 } else
261 f->se_str = str;
262 break;
263 }
264 }
265
266exit_nofree:
267 return entry;
268
269exit_free:
270 audit_free_rule(entry);
271 return ERR_PTR(err);
272}
273
274/* Pack a filter field's string representation into data block. */
275static inline size_t audit_pack_string(void **bufp, char *str)
276{
277 size_t len = strlen(str);
278
279 memcpy(*bufp, str, len);
280 *bufp += len;
281
282 return len;
283}
284
285/* Translate kernel rule representation to struct audit_rule.
286 * Exists for backward compatibility with userspace. */
287static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
288{
289 struct audit_rule *rule;
290 int i;
291
292 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
293 if (unlikely(!rule))
294 return ERR_PTR(-ENOMEM);
295 memset(rule, 0, sizeof(*rule));
296
297 rule->flags = krule->flags | krule->listnr;
298 rule->action = krule->action;
299 rule->field_count = krule->field_count;
300 for (i = 0; i < rule->field_count; i++) {
301 rule->values[i] = krule->fields[i].val;
302 rule->fields[i] = krule->fields[i].type;
303
304 if (krule->vers_ops == 1) {
305 if (krule->fields[i].op & AUDIT_NOT_EQUAL)
306 rule->fields[i] |= AUDIT_NEGATE;
307 } else {
308 rule->fields[i] |= krule->fields[i].op;
309 }
310 }
311 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
312
313 return rule;
314}
315
316/* Translate kernel rule representation to struct audit_rule_data. */
317static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
318{
319 struct audit_rule_data *data;
320 void *bufp;
321 int i;
322
323 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
324 if (unlikely(!data))
325 return ERR_PTR(-ENOMEM);
326 memset(data, 0, sizeof(*data));
327
328 data->flags = krule->flags | krule->listnr;
329 data->action = krule->action;
330 data->field_count = krule->field_count;
331 bufp = data->buf;
332 for (i = 0; i < data->field_count; i++) {
333 struct audit_field *f = &krule->fields[i];
334
335 data->fields[i] = f->type;
336 data->fieldflags[i] = f->op;
337 switch(f->type) {
338 case AUDIT_SE_USER:
339 case AUDIT_SE_ROLE:
340 case AUDIT_SE_TYPE:
341 case AUDIT_SE_SEN:
342 case AUDIT_SE_CLR:
343 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str);
345 break;
346 default:
347 data->values[i] = f->val;
348 }
349 }
350 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
351
352 return data;
353}
354
355/* Compare two rules in kernel format. Considered success if rules
356 * don't match. */
357static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
358{
359 int i;
360
361 if (a->flags != b->flags ||
362 a->listnr != b->listnr ||
363 a->action != b->action ||
364 a->field_count != b->field_count)
365 return 1;
366
367 for (i = 0; i < a->field_count; i++) {
368 if (a->fields[i].type != b->fields[i].type ||
369 a->fields[i].op != b->fields[i].op)
370 return 1;
371
372 switch(a->fields[i].type) {
373 case AUDIT_SE_USER:
374 case AUDIT_SE_ROLE:
375 case AUDIT_SE_TYPE:
376 case AUDIT_SE_SEN:
377 case AUDIT_SE_CLR:
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1;
380 break;
381 default:
382 if (a->fields[i].val != b->fields[i].val)
383 return 1;
384 }
385 }
386
387 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
388 if (a->mask[i] != b->mask[i])
389 return 1;
390
391 return 0;
392}
393
394/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df,
397 struct audit_field *sf)
398{
399 int ret = 0;
400 char *se_str;
401
402 /* our own copy of se_str */
403 se_str = kstrdup(sf->se_str, GFP_KERNEL);
404 if (unlikely(!se_str)) /* kstrdup() returns NULL on failure, not an ERR_PTR */
405 return -ENOMEM;
406 df->se_str = se_str;
407
408 /* our own (refreshed) copy of se_rule */
409 ret = selinux_audit_rule_init(df->type, df->op, df->se_str,
410 &df->se_rule);
411 /* Keep currently invalid fields around in case they
412 * become valid after a policy reload. */
413 if (ret == -EINVAL) {
414 printk(KERN_WARNING "audit rule for selinux \'%s\' is "
415 "invalid\n", df->se_str);
416 ret = 0;
417 }
418
419 return ret;
420}
421
422/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
427{
428 u32 fcount = old->field_count;
429 struct audit_entry *entry;
430 struct audit_krule *new;
431 int i, err = 0;
432
433 entry = audit_init_entry(fcount);
434 if (unlikely(!entry))
435 return ERR_PTR(-ENOMEM);
436
437 new = &entry->rule;
438 new->vers_ops = old->vers_ops;
439 new->flags = old->flags;
440 new->listnr = old->listnr;
441 new->action = old->action;
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen;
445 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447
448 /* deep copy this information, updating the se_rule fields, because
449 * the originals will all be freed when the old rule is freed. */
450 for (i = 0; i < fcount; i++) {
451 switch (new->fields[i].type) {
452 case AUDIT_SE_USER:
453 case AUDIT_SE_ROLE:
454 case AUDIT_SE_TYPE:
455 case AUDIT_SE_SEN:
456 case AUDIT_SE_CLR:
457 err = audit_dupe_selinux_field(&new->fields[i],
458 &old->fields[i]);
459 }
460 if (err) {
461 audit_free_rule(entry);
462 return ERR_PTR(err);
463 }
464 }
465
466 return entry;
467}
468
469/* Add rule to given filterlist if not a duplicate. Protected by
470 * audit_netlink_mutex. */
471static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list)
473{
474 struct audit_entry *e;
475
476 /* Do not use the _rcu iterator here, since this is the only
477 * addition routine. */
478 list_for_each_entry(e, list, list) {
479 if (!audit_compare_rule(&entry->rule, &e->rule))
480 return -EEXIST;
481 }
482
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list);
485 } else {
486 list_add_tail_rcu(&entry->list, list);
487 }
488
489 return 0;
490}
491
492/* Remove an existing rule from filterlist. Protected by
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list)
496{
497 struct audit_entry *e;
498
499 /* Do not use the _rcu iterator here, since this is the only
500 * deletion routine. */
501 list_for_each_entry(e, list, list) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) {
503 list_del_rcu(&e->list);
504 call_rcu(&e->rcu, audit_free_rule_rcu);
505 return 0;
506 }
507 }
508 return -ENOENT; /* No matching rule */
509}
510
511/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */
513static int audit_list(void *_dest)
514{
515 int pid, seq;
516 int *dest = _dest;
517 struct audit_entry *entry;
518 int i;
519
520 pid = dest[0];
521 seq = dest[1];
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators are not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule;
531
532 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule))
534 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule));
537 kfree(rule);
538 }
539 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
541
542 mutex_unlock(&audit_netlink_mutex);
543 return 0;
544}
545
546/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest)
548{
549 int pid, seq;
550 int *dest = _dest;
551 struct audit_entry *e;
552 int i;
553
554 pid = dest[0];
555 seq = dest[1];
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators are not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data;
565
566 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data))
568 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data));
571 kfree(data);
572 }
573 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
575
576 mutex_unlock(&audit_netlink_mutex);
577 return 0;
578}
579
580/**
581 * audit_receive_filter - apply all rules to the specified message type
582 * @type: audit message type
583 * @pid: target pid for netlink audit messages
584 * @uid: target uid for netlink audit messages
585 * @seq: netlink audit message sequence (serial) number
586 * @data: payload data
587 * @datasz: size of payload data
588 * @loginuid: loginuid of sender
589 * @sid: SE Linux Security ID of sender
590 */
591int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid)
593{
594 struct task_struct *tsk;
595 int *dest;
596 int err = 0;
597 struct audit_entry *entry;
598
599 switch (type) {
600 case AUDIT_LIST:
601 case AUDIT_LIST_RULES:
602 /* We can't just spew out the rules here because we might fill
603 * the available socket buffer space and deadlock waiting for
604 * auditctl to read from it... which isn't ever going to
605 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */
607
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
609 if (!dest)
610 return -ENOMEM;
611 dest[0] = pid;
612 dest[1] = seq;
613
614 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list");
616 else
617 tsk = kthread_run(audit_list_rules, dest,
618 "audit_list_rules");
619 if (IS_ERR(tsk)) {
620 kfree(dest);
621 err = PTR_ERR(tsk);
622 }
623 break;
624 case AUDIT_ADD:
625 case AUDIT_ADD_RULE:
626 if (type == AUDIT_ADD)
627 entry = audit_rule_to_entry(data);
628 else
629 entry = audit_data_to_entry(data, datasz);
630 if (IS_ERR(entry))
631 return PTR_ERR(entry);
632
633 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]);
635 if (sid) {
636 char *ctx = NULL;
637 u32 len;
638 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
639 /* Maybe call audit_panic? */
640 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
641 "auid=%u ssid=%u add rule to list=%d res=%d",
642 loginuid, sid, entry->rule.listnr, !err);
643 } else
644 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
645 "auid=%u subj=%s add rule to list=%d res=%d",
646 loginuid, ctx, entry->rule.listnr, !err);
647 kfree(ctx);
648 } else
649 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
650 "auid=%u add rule to list=%d res=%d",
651 loginuid, entry->rule.listnr, !err);
652
653 if (err)
654 audit_free_rule(entry);
655 break;
656 case AUDIT_DEL:
657 case AUDIT_DEL_RULE:
658 if (type == AUDIT_DEL)
659 entry = audit_rule_to_entry(data);
660 else
661 entry = audit_data_to_entry(data, datasz);
662 if (IS_ERR(entry))
663 return PTR_ERR(entry);
664
665 err = audit_del_rule(entry,
666 &audit_filter_list[entry->rule.listnr]);
667
668 if (sid) {
669 char *ctx = NULL;
670 u32 len;
671 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
672 /* Maybe call audit_panic? */
673 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
674 "auid=%u ssid=%u remove rule from list=%d res=%d",
675 loginuid, sid, entry->rule.listnr, !err);
676 } else
677 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
678 "auid=%u subj=%s remove rule from list=%d res=%d",
679 loginuid, ctx, entry->rule.listnr, !err);
680 kfree(ctx);
681 } else
682 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
683 "auid=%u remove rule from list=%d res=%d",
684 loginuid, entry->rule.listnr, !err);
685
686 audit_free_rule(entry);
687 break;
688 default:
689 return -EINVAL;
690 }
691
692 return err;
693}
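The deadlock note above explains why rule listing is pushed to a kernel thread: the requester's pid and sequence number are packed into a small kmalloc'd buffer whose ownership passes to the helper thread, which frees it. A minimal user-space sketch of the same ownership handoff, using pthreads instead of kthread_run(); all names and values here are illustrative, not kernel APIs:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* The worker owns the buffer it is given and must free it, mirroring
 * the way audit_list_rules() frees the dest[] array. */
static void *list_worker(void *_dest)
{
	int *dest = _dest;
	int pid = dest[0];
	int seq = dest[1];

	free(dest);
	printf("replying to pid=%d seq=%d from a separate thread\n", pid, seq);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int *dest = malloc(2 * sizeof(int));

	if (!dest)
		return 1;
	dest[0] = 1234;	/* requester pid (hypothetical) */
	dest[1] = 42;	/* netlink sequence number (hypothetical) */

	if (pthread_create(&tid, NULL, list_worker, dest)) {
		free(dest);	/* on error the creator still owns the buffer */
		return 1;
	}
	pthread_join(tid, NULL);
	return 0;
}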
694
695int audit_comparator(const u32 left, const u32 op, const u32 right)
696{
697 switch (op) {
698 case AUDIT_EQUAL:
699 return (left == right);
700 case AUDIT_NOT_EQUAL:
701 return (left != right);
702 case AUDIT_LESS_THAN:
703 return (left < right);
704 case AUDIT_LESS_THAN_OR_EQUAL:
705 return (left <= right);
706 case AUDIT_GREATER_THAN:
707 return (left > right);
708 case AUDIT_GREATER_THAN_OR_EQUAL:
709 return (left >= right);
710 }
711 BUG();
712 return 0;
713}
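audit_comparator() reduces every rule field to one of six unsigned comparisons. A stand-alone sketch of the same semantics, with local stand-ins for the kernel's AUDIT_* operator constants:

#include <assert.h>
#include <stdint.h>

/* Stand-ins for AUDIT_EQUAL .. AUDIT_GREATER_THAN_OR_EQUAL */
enum op { EQ, NE, LT, LE, GT, GE };

static int compare(uint32_t left, enum op op, uint32_t right)
{
	switch (op) {
	case EQ: return left == right;
	case NE: return left != right;
	case LT: return left <  right;
	case LE: return left <= right;
	case GT: return left >  right;
	case GE: return left >= right;
	}
	return 0;
}

int main(void)
{
	/* e.g. a "uid>=500" field matches uid 500 and 1000, but not 499 */
	assert(compare(500, GE, 500));
	assert(compare(1000, GE, 500));
	assert(!compare(499, GE, 500));
	return 0;
}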
714
715
716
717static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule,
719 enum audit_state *state)
720{
721 int i;
722
723 for (i = 0; i < rule->field_count; i++) {
724 struct audit_field *f = &rule->fields[i];
725 int result = 0;
726
727 switch (f->type) {
728 case AUDIT_PID:
729 result = audit_comparator(cb->creds.pid, f->op, f->val);
730 break;
731 case AUDIT_UID:
732 result = audit_comparator(cb->creds.uid, f->op, f->val);
733 break;
734 case AUDIT_GID:
735 result = audit_comparator(cb->creds.gid, f->op, f->val);
736 break;
737 case AUDIT_LOGINUID:
738 result = audit_comparator(cb->loginuid, f->op, f->val);
739 break;
740 }
741
742 if (!result)
743 return 0;
744 }
745 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 }
750 return 1;
751}
752
753int audit_filter_user(struct netlink_skb_parms *cb, int type)
754{
755 struct audit_entry *e;
756 enum audit_state state;
757 int ret = 1;
758
759 rcu_read_lock();
760 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
761 if (audit_filter_user_rules(cb, &e->rule, &state)) {
762 if (state == AUDIT_DISABLED)
763 ret = 0;
764 break;
765 }
766 }
767 rcu_read_unlock();
768
769 return ret; /* Audit by default */
770}
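A rule applies only if every one of its fields matches (logical AND), the first applicable rule's action decides, and a message that matches no rule is audited by default. A small user-space model of that decision flow; the field and action encodings are simplified stand-ins, not the kernel's structures:

#include <stdio.h>

enum ftype { F_UID, F_PID };
enum action { NEVER, ALWAYS };

struct field { enum ftype type; int val; };
struct rule  { int nfields; struct field f[4]; enum action action; };
struct creds { int pid, uid; };

/* All fields must match for the rule to apply; with several rules,
 * the first one that applies would end the search. */
static int rule_match(const struct rule *r, const struct creds *c, int *audit)
{
	for (int i = 0; i < r->nfields; i++) {
		int v = (r->f[i].type == F_UID) ? c->uid : c->pid;
		if (v != r->f[i].val)
			return 0;
	}
	*audit = (r->action == ALWAYS);
	return 1;
}

int main(void)
{
	/* "never audit messages from uid 500 sent by pid 1" (illustrative) */
	struct rule r = { 2, { { F_UID, 500 }, { F_PID, 1 } }, NEVER };
	struct creds match = { 1, 500 }, other = { 7, 500 };
	int audit;

	audit = 1;			/* audit by default */
	rule_match(&r, &match, &audit);
	printf("match: %s\n", audit ? "audit" : "skip");

	audit = 1;
	rule_match(&r, &other, &audit);
	printf("other: %s\n", audit ? "audit" : "skip");
	return 0;
}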
771
772int audit_filter_type(int type)
773{
774 struct audit_entry *e;
775 int result = 0;
776
777 rcu_read_lock();
778 if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
779 goto unlock_and_return;
780
781 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
782 list) {
783 int i;
784 for (i = 0; i < e->rule.field_count; i++) {
785 struct audit_field *f = &e->rule.fields[i];
786 if (f->type == AUDIT_MSGTYPE) {
787 result = audit_comparator(type, f->op, f->val);
788 if (!result)
789 break;
790 }
791 }
792 if (result)
793 goto unlock_and_return;
794 }
795unlock_and_return:
796 rcu_read_unlock();
797 return result;
798}
799
800/* Check to see if the rule contains any selinux fields. Returns 1 if there
801 are selinux fields specified in the rule, 0 otherwise. */
802static inline int audit_rule_has_selinux(struct audit_krule *rule)
803{
804 int i;
805
806 for (i = 0; i < rule->field_count; i++) {
807 struct audit_field *f = &rule->fields[i];
808 switch (f->type) {
809 case AUDIT_SE_USER:
810 case AUDIT_SE_ROLE:
811 case AUDIT_SE_TYPE:
812 case AUDIT_SE_SEN:
813 case AUDIT_SE_CLR:
814 return 1;
815 }
816 }
817
818 return 0;
819}
820
821/* This function will re-initialize the se_rule field of all applicable rules.
822 * It will traverse the filter lists searching for rules that contain selinux
823 * specific filter fields. When such a rule is found, it is copied, the
824 * selinux field is re-initialized, and the old rule is replaced with the
825 * updated rule. */
826int selinux_audit_rule_update(void)
827{
828 struct audit_entry *entry, *n, *nentry;
829 int i, err = 0;
830
831 /* audit_netlink_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex);
833
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule))
837 continue;
838
839 nentry = audit_dupe_rule(&entry->rule);
840 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the
842 * return value */
843 if (!err)
844 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters");
846 list_del_rcu(&entry->list);
847 } else {
848 list_replace_rcu(&entry->list, &nentry->list);
849 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 }
852 }
853
854 mutex_unlock(&audit_netlink_mutex);
855
856 return err;
857}
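selinux_audit_rule_update() never edits a live rule in place: it duplicates the entry, refreshes the SELinux part of the copy, swaps the copy into the list with list_replace_rcu(), and lets call_rcu() free the original once readers have moved on. A single-threaded user-space sketch of that copy-and-swap discipline; the deferred-free step is elided, as the comments note:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rule { int se_generation; };

static _Atomic(struct rule *) current_rule;

/* Updater: build a copy, swap the pointer, then dispose of the old copy.
 * (The kernel defers the free with call_rcu() until all readers are done;
 * that part is not modelled here.) */
static int update_rule(int new_generation)
{
	struct rule *old = atomic_load(&current_rule);
	struct rule *nrule = malloc(sizeof(*nrule));

	if (!nrule)
		return -1;
	memcpy(nrule, old, sizeof(*nrule));
	nrule->se_generation = new_generation;
	atomic_store(&current_rule, nrule);
	free(old);
	return 0;
}

int main(void)
{
	struct rule *first = calloc(1, sizeof(*first));

	if (!first)
		return 1;
	atomic_store(&current_rule, first);
	update_rule(2);
	printf("generation=%d\n", atomic_load(&current_rule)->se_generation);
	free(atomic_load(&current_rule));
	return 0;
}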
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 685c25175d..1c03a4ed1b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,8 @@
2 * Handles all system-call specific auditing features. 2 * Handles all system-call specific auditing features.
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation
5 * All Rights Reserved. 7 * All Rights Reserved.
6 * 8 *
7 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -27,11 +29,22 @@
27 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
28 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
29 * 31 *
32 * Support for additional filter rule comparators (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 *
35 * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
36 * filesystem information.
37 *
38 * Subject and object context labeling support added by <danjones@us.ibm.com>
39 * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
30 */ 40 */
31 41
32#include <linux/init.h> 42#include <linux/init.h>
33#include <asm/types.h> 43#include <asm/types.h>
34#include <asm/atomic.h> 44#include <asm/atomic.h>
45#include <asm/types.h>
46#include <linux/fs.h>
47#include <linux/namei.h>
35#include <linux/mm.h> 48#include <linux/mm.h>
36#include <linux/module.h> 49#include <linux/module.h>
37#include <linux/mount.h> 50#include <linux/mount.h>
@@ -39,16 +52,17 @@
39#include <linux/audit.h> 52#include <linux/audit.h>
40#include <linux/personality.h> 53#include <linux/personality.h>
41#include <linux/time.h> 54#include <linux/time.h>
42#include <linux/kthread.h>
43#include <linux/netlink.h> 55#include <linux/netlink.h>
44#include <linux/compiler.h> 56#include <linux/compiler.h>
45#include <asm/unistd.h> 57#include <asm/unistd.h>
58#include <linux/security.h>
59#include <linux/list.h>
60#include <linux/tty.h>
61#include <linux/selinux.h>
62
63#include "audit.h"
46 64
47/* 0 = no checking 65extern struct list_head audit_filter_list[];
48 1 = put_count checking
49 2 = verbose put_count checking
50*/
51#define AUDIT_DEBUG 0
52 66
53/* No syscall auditing will take place unless audit_enabled != 0. */ 67/* No syscall auditing will take place unless audit_enabled != 0. */
54extern int audit_enabled; 68extern int audit_enabled;
@@ -62,29 +76,6 @@ extern int audit_enabled;
62 * path_lookup. */ 76 * path_lookup. */
63#define AUDIT_NAMES_RESERVED 7 77#define AUDIT_NAMES_RESERVED 7
64 78
65/* At task start time, the audit_state is set in the audit_context using
66 a per-task filter. At syscall entry, the audit_state is augmented by
67 the syscall filter. */
68enum audit_state {
69 AUDIT_DISABLED, /* Do not create per-task audit_context.
70 * No syscall-specific audit records can
71 * be generated. */
72 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
73 * but don't necessarily fill it in at
74 * syscall entry time (i.e., filter
75 * instead). */
76 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
77 * and always fill it in at syscall
78 * entry time. This makes a full
79 * syscall record available if some
80 * other part of the kernel decides it
81 * should be recorded. */
82 AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
83 * always fill it in at syscall entry
84 * time, and always write out the audit
85 * record at syscall exit time. */
86};
87
88/* When fs/namei.c:getname() is called, we store the pointer in name and 79/* When fs/namei.c:getname() is called, we store the pointer in name and
89 * we don't let putname() free it (instead we free all of the saved 80 * we don't let putname() free it (instead we free all of the saved
90 * pointers at syscall exit time). 81 * pointers at syscall exit time).
@@ -93,12 +84,13 @@ enum audit_state {
93struct audit_names { 84struct audit_names {
94 const char *name; 85 const char *name;
95 unsigned long ino; 86 unsigned long ino;
87 unsigned long pino;
96 dev_t dev; 88 dev_t dev;
97 umode_t mode; 89 umode_t mode;
98 uid_t uid; 90 uid_t uid;
99 gid_t gid; 91 gid_t gid;
100 dev_t rdev; 92 dev_t rdev;
101 unsigned flags; 93 u32 osid;
102}; 94};
103 95
104struct audit_aux_data { 96struct audit_aux_data {
@@ -115,6 +107,7 @@ struct audit_aux_data_ipcctl {
115 uid_t uid; 107 uid_t uid;
116 gid_t gid; 108 gid_t gid;
117 mode_t mode; 109 mode_t mode;
110 u32 osid;
118}; 111};
119 112
120struct audit_aux_data_socketcall { 113struct audit_aux_data_socketcall {
@@ -167,290 +160,73 @@ struct audit_context {
167#endif 160#endif
168}; 161};
169 162
170 /* Public API */
171/* There are three lists of rules -- one to search at task creation
172 * time, one to search at syscall entry time, and another to search at
173 * syscall exit time. */
174static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
175 LIST_HEAD_INIT(audit_filter_list[0]),
176 LIST_HEAD_INIT(audit_filter_list[1]),
177 LIST_HEAD_INIT(audit_filter_list[2]),
178 LIST_HEAD_INIT(audit_filter_list[3]),
179 LIST_HEAD_INIT(audit_filter_list[4]),
180#if AUDIT_NR_FILTERS != 5
181#error Fix audit_filter_list initialiser
182#endif
183};
184
185struct audit_entry {
186 struct list_head list;
187 struct rcu_head rcu;
188 struct audit_rule rule;
189};
190
191extern int audit_pid;
192
193/* Copy rule from user-space to kernel-space. Called from
194 * audit_add_rule during AUDIT_ADD. */
195static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
196{
197 int i;
198
199 if (s->action != AUDIT_NEVER
200 && s->action != AUDIT_POSSIBLE
201 && s->action != AUDIT_ALWAYS)
202 return -1;
203 if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
204 return -1;
205 if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
206 return -1;
207
208 d->flags = s->flags;
209 d->action = s->action;
210 d->field_count = s->field_count;
211 for (i = 0; i < d->field_count; i++) {
212 d->fields[i] = s->fields[i];
213 d->values[i] = s->values[i];
214 }
215 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
216 return 0;
217}
218
219/* Check to see if two rules are identical. It is called from
220 * audit_add_rule during AUDIT_ADD and
221 * audit_del_rule during AUDIT_DEL. */
222static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
223{
224 int i;
225
226 if (a->flags != b->flags)
227 return 1;
228
229 if (a->action != b->action)
230 return 1;
231
232 if (a->field_count != b->field_count)
233 return 1;
234
235 for (i = 0; i < a->field_count; i++) {
236 if (a->fields[i] != b->fields[i]
237 || a->values[i] != b->values[i])
238 return 1;
239 }
240
241 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
242 if (a->mask[i] != b->mask[i])
243 return 1;
244
245 return 0;
246}
247
248/* Note that audit_add_rule and audit_del_rule are called via
249 * audit_receive() in audit.c, and are protected by
250 * audit_netlink_sem. */
251static inline int audit_add_rule(struct audit_rule *rule,
252 struct list_head *list)
253{
254 struct audit_entry *entry;
255
256 /* Do not use the _rcu iterator here, since this is the only
257 * addition routine. */
258 list_for_each_entry(entry, list, list) {
259 if (!audit_compare_rule(rule, &entry->rule)) {
260 return -EEXIST;
261 }
262 }
263
264 if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
265 return -ENOMEM;
266 if (audit_copy_rule(&entry->rule, rule)) {
267 kfree(entry);
268 return -EINVAL;
269 }
270
271 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
272 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
273 list_add_rcu(&entry->list, list);
274 } else {
275 list_add_tail_rcu(&entry->list, list);
276 }
277
278 return 0;
279}
280
281static inline void audit_free_rule(struct rcu_head *head)
282{
283 struct audit_entry *e = container_of(head, struct audit_entry, rcu);
284 kfree(e);
285}
286
287/* Note that audit_add_rule and audit_del_rule are called via
288 * audit_receive() in audit.c, and are protected by
289 * audit_netlink_sem. */
290static inline int audit_del_rule(struct audit_rule *rule,
291 struct list_head *list)
292{
293 struct audit_entry *e;
294
295 /* Do not use the _rcu iterator here, since this is the only
296 * deletion routine. */
297 list_for_each_entry(e, list, list) {
298 if (!audit_compare_rule(rule, &e->rule)) {
299 list_del_rcu(&e->list);
300 call_rcu(&e->rcu, audit_free_rule);
301 return 0;
302 }
303 }
304 return -ENOENT; /* No matching rule */
305}
306
307static int audit_list_rules(void *_dest)
308{
309 int pid, seq;
310 int *dest = _dest;
311 struct audit_entry *entry;
312 int i;
313
314 pid = dest[0];
315 seq = dest[1];
316 kfree(dest);
317
318 down(&audit_netlink_sem);
319
320 /* The *_rcu iterators not needed here because we are
321 always called with audit_netlink_sem held. */
322 for (i=0; i<AUDIT_NR_FILTERS; i++) {
323 list_for_each_entry(entry, &audit_filter_list[i], list)
324 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
325 &entry->rule, sizeof(entry->rule));
326 }
327 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
328
329 up(&audit_netlink_sem);
330 return 0;
331}
332
333int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
334 uid_t loginuid)
335{
336 struct task_struct *tsk;
337 int *dest;
338 int err = 0;
339 unsigned listnr;
340
341 switch (type) {
342 case AUDIT_LIST:
343 /* We can't just spew out the rules here because we might fill
344 * the available socket buffer space and deadlock waiting for
345 * auditctl to read from it... which isn't ever going to
346 * happen if we're actually running in the context of auditctl
347 * trying to _send_ the stuff */
348
349 dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
350 if (!dest)
351 return -ENOMEM;
352 dest[0] = pid;
353 dest[1] = seq;
354
355 tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
356 if (IS_ERR(tsk)) {
357 kfree(dest);
358 err = PTR_ERR(tsk);
359 }
360 break;
361 case AUDIT_ADD:
362 listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
363 if (listnr >= AUDIT_NR_FILTERS)
364 return -EINVAL;
365
366 err = audit_add_rule(data, &audit_filter_list[listnr]);
367 if (!err)
368 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
369 "auid=%u added an audit rule\n", loginuid);
370 break;
371 case AUDIT_DEL:
372 listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
373 if (listnr >= AUDIT_NR_FILTERS)
374 return -EINVAL;
375
376 err = audit_del_rule(data, &audit_filter_list[listnr]);
377 if (!err)
378 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
379 "auid=%u removed an audit rule\n", loginuid);
380 break;
381 default:
382 return -EINVAL;
383 }
384
385 return err;
386}
387 163
388/* Compare a task_struct with an audit_rule. Return 1 on match, 0 164/* Compare a task_struct with an audit_rule. Return 1 on match, 0
389 * otherwise. */ 165 * otherwise. */
390static int audit_filter_rules(struct task_struct *tsk, 166static int audit_filter_rules(struct task_struct *tsk,
391 struct audit_rule *rule, 167 struct audit_krule *rule,
392 struct audit_context *ctx, 168 struct audit_context *ctx,
393 enum audit_state *state) 169 enum audit_state *state)
394{ 170{
395 int i, j; 171 int i, j, need_sid = 1;
172 u32 sid;
396 173
397 for (i = 0; i < rule->field_count; i++) { 174 for (i = 0; i < rule->field_count; i++) {
398 u32 field = rule->fields[i] & ~AUDIT_NEGATE; 175 struct audit_field *f = &rule->fields[i];
399 u32 value = rule->values[i];
400 int result = 0; 176 int result = 0;
401 177
402 switch (field) { 178 switch (f->type) {
403 case AUDIT_PID: 179 case AUDIT_PID:
404 result = (tsk->pid == value); 180 result = audit_comparator(tsk->pid, f->op, f->val);
405 break; 181 break;
406 case AUDIT_UID: 182 case AUDIT_UID:
407 result = (tsk->uid == value); 183 result = audit_comparator(tsk->uid, f->op, f->val);
408 break; 184 break;
409 case AUDIT_EUID: 185 case AUDIT_EUID:
410 result = (tsk->euid == value); 186 result = audit_comparator(tsk->euid, f->op, f->val);
411 break; 187 break;
412 case AUDIT_SUID: 188 case AUDIT_SUID:
413 result = (tsk->suid == value); 189 result = audit_comparator(tsk->suid, f->op, f->val);
414 break; 190 break;
415 case AUDIT_FSUID: 191 case AUDIT_FSUID:
416 result = (tsk->fsuid == value); 192 result = audit_comparator(tsk->fsuid, f->op, f->val);
417 break; 193 break;
418 case AUDIT_GID: 194 case AUDIT_GID:
419 result = (tsk->gid == value); 195 result = audit_comparator(tsk->gid, f->op, f->val);
420 break; 196 break;
421 case AUDIT_EGID: 197 case AUDIT_EGID:
422 result = (tsk->egid == value); 198 result = audit_comparator(tsk->egid, f->op, f->val);
423 break; 199 break;
424 case AUDIT_SGID: 200 case AUDIT_SGID:
425 result = (tsk->sgid == value); 201 result = audit_comparator(tsk->sgid, f->op, f->val);
426 break; 202 break;
427 case AUDIT_FSGID: 203 case AUDIT_FSGID:
428 result = (tsk->fsgid == value); 204 result = audit_comparator(tsk->fsgid, f->op, f->val);
429 break; 205 break;
430 case AUDIT_PERS: 206 case AUDIT_PERS:
431 result = (tsk->personality == value); 207 result = audit_comparator(tsk->personality, f->op, f->val);
432 break; 208 break;
433 case AUDIT_ARCH: 209 case AUDIT_ARCH:
434 if (ctx) 210 if (ctx)
435 result = (ctx->arch == value); 211 result = audit_comparator(ctx->arch, f->op, f->val);
436 break; 212 break;
437 213
438 case AUDIT_EXIT: 214 case AUDIT_EXIT:
439 if (ctx && ctx->return_valid) 215 if (ctx && ctx->return_valid)
440 result = (ctx->return_code == value); 216 result = audit_comparator(ctx->return_code, f->op, f->val);
441 break; 217 break;
442 case AUDIT_SUCCESS: 218 case AUDIT_SUCCESS:
443 if (ctx && ctx->return_valid) { 219 if (ctx && ctx->return_valid) {
444 if (value) 220 if (f->val)
445 result = (ctx->return_valid == AUDITSC_SUCCESS); 221 result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
446 else 222 else
447 result = (ctx->return_valid == AUDITSC_FAILURE); 223 result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
448 } 224 }
449 break; 225 break;
450 case AUDIT_DEVMAJOR: 226 case AUDIT_DEVMAJOR:
451 if (ctx) { 227 if (ctx) {
452 for (j = 0; j < ctx->name_count; j++) { 228 for (j = 0; j < ctx->name_count; j++) {
453 if (MAJOR(ctx->names[j].dev)==value) { 229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
454 ++result; 230 ++result;
455 break; 231 break;
456 } 232 }
@@ -460,7 +236,7 @@ static int audit_filter_rules(struct task_struct *tsk,
460 case AUDIT_DEVMINOR: 236 case AUDIT_DEVMINOR:
461 if (ctx) { 237 if (ctx) {
462 for (j = 0; j < ctx->name_count; j++) { 238 for (j = 0; j < ctx->name_count; j++) {
463 if (MINOR(ctx->names[j].dev)==value) { 239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
464 ++result; 240 ++result;
465 break; 241 break;
466 } 242 }
@@ -470,7 +246,8 @@ static int audit_filter_rules(struct task_struct *tsk,
470 case AUDIT_INODE: 246 case AUDIT_INODE:
471 if (ctx) { 247 if (ctx) {
472 for (j = 0; j < ctx->name_count; j++) { 248 for (j = 0; j < ctx->name_count; j++) {
473 if (ctx->names[j].ino == value) { 249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
474 ++result; 251 ++result;
475 break; 252 break;
476 } 253 }
@@ -480,19 +257,38 @@ static int audit_filter_rules(struct task_struct *tsk,
480 case AUDIT_LOGINUID: 257 case AUDIT_LOGINUID:
481 result = 0; 258 result = 0;
482 if (ctx) 259 if (ctx)
483 result = (ctx->loginuid == value); 260 result = audit_comparator(ctx->loginuid, f->op, f->val);
261 break;
262 case AUDIT_SE_USER:
263 case AUDIT_SE_ROLE:
264 case AUDIT_SE_TYPE:
265 case AUDIT_SE_SEN:
266 case AUDIT_SE_CLR:
267 /* NOTE: this may return negative values indicating
268 a temporary error. We simply treat this as a
269 match for now to avoid losing information that
270 may be wanted. An error message will also be
271 logged upon error */
272 if (f->se_rule) {
273 if (need_sid) {
274 selinux_task_ctxid(tsk, &sid);
275 need_sid = 0;
276 }
277 result = selinux_audit_rule_match(sid, f->type,
278 f->op,
279 f->se_rule,
280 ctx);
281 }
484 break; 282 break;
485 case AUDIT_ARG0: 283 case AUDIT_ARG0:
486 case AUDIT_ARG1: 284 case AUDIT_ARG1:
487 case AUDIT_ARG2: 285 case AUDIT_ARG2:
488 case AUDIT_ARG3: 286 case AUDIT_ARG3:
489 if (ctx) 287 if (ctx)
490 result = (ctx->argv[field-AUDIT_ARG0]==value); 288 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
491 break; 289 break;
492 } 290 }
493 291
494 if (rule->fields[i] & AUDIT_NEGATE)
495 result = !result;
496 if (!result) 292 if (!result)
497 return 0; 293 return 0;
498 } 294 }
@@ -527,7 +323,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
527/* At syscall entry and exit time, this filter is called if the 323/* At syscall entry and exit time, this filter is called if the
528 * audit_state is not low enough that auditing cannot take place, but is 324 * audit_state is not low enough that auditing cannot take place, but is
529 * also not high enough that we already know we have to write an audit 325 * also not high enough that we already know we have to write an audit
530 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 326 * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
531 */ 327 */
532static enum audit_state audit_filter_syscall(struct task_struct *tsk, 328static enum audit_state audit_filter_syscall(struct task_struct *tsk,
533 struct audit_context *ctx, 329 struct audit_context *ctx,
@@ -541,80 +337,21 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
541 337
542 rcu_read_lock(); 338 rcu_read_lock();
543 if (!list_empty(list)) { 339 if (!list_empty(list)) {
544 int word = AUDIT_WORD(ctx->major); 340 int word = AUDIT_WORD(ctx->major);
545 int bit = AUDIT_BIT(ctx->major); 341 int bit = AUDIT_BIT(ctx->major);
546 342
547 list_for_each_entry_rcu(e, list, list) { 343 list_for_each_entry_rcu(e, list, list) {
548 if ((e->rule.mask[word] & bit) == bit 344 if ((e->rule.mask[word] & bit) == bit
549 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
550 rcu_read_unlock(); 346 rcu_read_unlock();
551 return state; 347 return state;
552 } 348 }
553 }
554 }
555 rcu_read_unlock();
556 return AUDIT_BUILD_CONTEXT;
557}
558
559static int audit_filter_user_rules(struct netlink_skb_parms *cb,
560 struct audit_rule *rule,
561 enum audit_state *state)
562{
563 int i;
564
565 for (i = 0; i < rule->field_count; i++) {
566 u32 field = rule->fields[i] & ~AUDIT_NEGATE;
567 u32 value = rule->values[i];
568 int result = 0;
569
570 switch (field) {
571 case AUDIT_PID:
572 result = (cb->creds.pid == value);
573 break;
574 case AUDIT_UID:
575 result = (cb->creds.uid == value);
576 break;
577 case AUDIT_GID:
578 result = (cb->creds.gid == value);
579 break;
580 case AUDIT_LOGINUID:
581 result = (cb->loginuid == value);
582 break;
583 }
584
585 if (rule->fields[i] & AUDIT_NEGATE)
586 result = !result;
587 if (!result)
588 return 0;
589 }
590 switch (rule->action) {
591 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
592 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
593 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
594 }
595 return 1;
596}
597
598int audit_filter_user(struct netlink_skb_parms *cb, int type)
599{
600 struct audit_entry *e;
601 enum audit_state state;
602 int ret = 1;
603
604 rcu_read_lock();
605 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
606 if (audit_filter_user_rules(cb, &e->rule, &state)) {
607 if (state == AUDIT_DISABLED)
608 ret = 0;
609 break;
610 } 349 }
611 } 350 }
612 rcu_read_unlock(); 351 rcu_read_unlock();
613 352 return AUDIT_BUILD_CONTEXT;
614 return ret; /* Audit by default */
615} 353}
616 354
617/* This should be called with task_lock() held. */
618static inline struct audit_context *audit_get_context(struct task_struct *tsk, 355static inline struct audit_context *audit_get_context(struct task_struct *tsk,
619 int return_valid, 356 int return_valid,
620 int return_code) 357 int return_code)
@@ -654,17 +391,18 @@ static inline void audit_free_names(struct audit_context *context)
654#if AUDIT_DEBUG == 2 391#if AUDIT_DEBUG == 2
655 if (context->auditable 392 if (context->auditable
656 ||context->put_count + context->ino_count != context->name_count) { 393 ||context->put_count + context->ino_count != context->name_count) {
657 printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" 394 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
658 " name_count=%d put_count=%d" 395 " name_count=%d put_count=%d"
659 " ino_count=%d [NOT freeing]\n", 396 " ino_count=%d [NOT freeing]\n",
660 __LINE__, 397 __FILE__, __LINE__,
661 context->serial, context->major, context->in_syscall, 398 context->serial, context->major, context->in_syscall,
662 context->name_count, context->put_count, 399 context->name_count, context->put_count,
663 context->ino_count); 400 context->ino_count);
664 for (i = 0; i < context->name_count; i++) 401 for (i = 0; i < context->name_count; i++) {
665 printk(KERN_ERR "names[%d] = %p = %s\n", i, 402 printk(KERN_ERR "names[%d] = %p = %s\n", i,
666 context->names[i].name, 403 context->names[i].name,
667 context->names[i].name); 404 context->names[i].name ?: "(null)");
405 }
668 dump_stack(); 406 dump_stack();
669 return; 407 return;
670 } 408 }
@@ -674,9 +412,10 @@ static inline void audit_free_names(struct audit_context *context)
674 context->ino_count = 0; 412 context->ino_count = 0;
675#endif 413#endif
676 414
677 for (i = 0; i < context->name_count; i++) 415 for (i = 0; i < context->name_count; i++) {
678 if (context->names[i].name) 416 if (context->names[i].name)
679 __putname(context->names[i].name); 417 __putname(context->names[i].name);
418 }
680 context->name_count = 0; 419 context->name_count = 0;
681 if (context->pwd) 420 if (context->pwd)
682 dput(context->pwd); 421 dput(context->pwd);
@@ -696,6 +435,7 @@ static inline void audit_free_aux(struct audit_context *context)
696 dput(axi->dentry); 435 dput(axi->dentry);
697 mntput(axi->mnt); 436 mntput(axi->mnt);
698 } 437 }
438
699 context->aux = aux->next; 439 context->aux = aux->next;
700 kfree(aux); 440 kfree(aux);
701 } 441 }
@@ -721,10 +461,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
721 return context; 461 return context;
722} 462}
723 463
724/* Filter on the task information and allocate a per-task audit context 464/**
465 * audit_alloc - allocate an audit context block for a task
466 * @tsk: task
467 *
468 * Filter on the task information and allocate a per-task audit context
725 * if necessary. Doing so turns on system call auditing for the 469 * if necessary. Doing so turns on system call auditing for the
726 * specified task. This is called from copy_process, so no lock is 470 * specified task. This is called from copy_process, so no lock is
727 * needed. */ 471 * needed.
472 */
728int audit_alloc(struct task_struct *tsk) 473int audit_alloc(struct task_struct *tsk)
729{ 474{
730 struct audit_context *context; 475 struct audit_context *context;
@@ -775,41 +520,76 @@ static inline void audit_free_context(struct audit_context *context)
775 printk(KERN_ERR "audit: freed %d contexts\n", count); 520 printk(KERN_ERR "audit: freed %d contexts\n", count);
776} 521}
777 522
778static void audit_log_task_info(struct audit_buffer *ab) 523static void audit_log_task_context(struct audit_buffer *ab)
779{ 524{
780 char name[sizeof(current->comm)]; 525 char *ctx = NULL;
781 struct mm_struct *mm = current->mm; 526 ssize_t len = 0;
527
528 len = security_getprocattr(current, "current", NULL, 0);
529 if (len < 0) {
530 if (len != -EINVAL)
531 goto error_path;
532 return;
533 }
534
535 ctx = kmalloc(len, GFP_KERNEL);
536 if (!ctx)
537 goto error_path;
538
539 len = security_getprocattr(current, "current", ctx, len);
540 	if (len < 0)
541 goto error_path;
542
543 audit_log_format(ab, " subj=%s", ctx);
544 return;
545
546error_path:
547 if (ctx)
548 kfree(ctx);
549 audit_panic("error in audit_log_task_context");
550 return;
551}
552
553static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
554{
555 char name[sizeof(tsk->comm)];
556 struct mm_struct *mm = tsk->mm;
782 struct vm_area_struct *vma; 557 struct vm_area_struct *vma;
783 558
784 get_task_comm(name, current); 559 /* tsk == current */
560
561 get_task_comm(name, tsk);
785 audit_log_format(ab, " comm="); 562 audit_log_format(ab, " comm=");
786 audit_log_untrustedstring(ab, name); 563 audit_log_untrustedstring(ab, name);
787 564
788 if (!mm) 565 if (mm) {
789 return; 566 down_read(&mm->mmap_sem);
790 567 vma = mm->mmap;
791 down_read(&mm->mmap_sem); 568 while (vma) {
792 vma = mm->mmap; 569 if ((vma->vm_flags & VM_EXECUTABLE) &&
793 while (vma) { 570 vma->vm_file) {
794 if ((vma->vm_flags & VM_EXECUTABLE) && 571 audit_log_d_path(ab, "exe=",
795 vma->vm_file) { 572 vma->vm_file->f_dentry,
796 audit_log_d_path(ab, "exe=", 573 vma->vm_file->f_vfsmnt);
797 vma->vm_file->f_dentry, 574 break;
798 vma->vm_file->f_vfsmnt); 575 }
799 break; 576 vma = vma->vm_next;
800 } 577 }
801 vma = vma->vm_next; 578 up_read(&mm->mmap_sem);
802 } 579 }
803 up_read(&mm->mmap_sem); 580 audit_log_task_context(ab);
804} 581}
805 582
806static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) 583static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
807{ 584{
808 int i; 585 int i, call_panic = 0;
809 struct audit_buffer *ab; 586 struct audit_buffer *ab;
810 struct audit_aux_data *aux; 587 struct audit_aux_data *aux;
588 const char *tty;
811 589
812 ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); 590 /* tsk == current */
591
592 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
813 if (!ab) 593 if (!ab)
814 return; /* audit_panic has been called */ 594 return; /* audit_panic has been called */
815 audit_log_format(ab, "arch=%x syscall=%d", 595 audit_log_format(ab, "arch=%x syscall=%d",
@@ -820,11 +600,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
820 audit_log_format(ab, " success=%s exit=%ld", 600 audit_log_format(ab, " success=%s exit=%ld",
821 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 601 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
822 context->return_code); 602 context->return_code);
603 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
604 tty = tsk->signal->tty->name;
605 else
606 tty = "(none)";
823 audit_log_format(ab, 607 audit_log_format(ab,
824 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
825 " pid=%d auid=%u uid=%u gid=%u" 609 " pid=%d auid=%u uid=%u gid=%u"
826 " euid=%u suid=%u fsuid=%u" 610 " euid=%u suid=%u fsuid=%u"
827 " egid=%u sgid=%u fsgid=%u", 611 " egid=%u sgid=%u fsgid=%u tty=%s",
828 context->argv[0], 612 context->argv[0],
829 context->argv[1], 613 context->argv[1],
830 context->argv[2], 614 context->argv[2],
@@ -835,8 +619,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
835 context->uid, 619 context->uid,
836 context->gid, 620 context->gid,
837 context->euid, context->suid, context->fsuid, 621 context->euid, context->suid, context->fsuid,
838 context->egid, context->sgid, context->fsgid); 622 context->egid, context->sgid, context->fsgid, tty);
839 audit_log_task_info(ab); 623 audit_log_task_info(ab, tsk);
840 audit_log_end(ab); 624 audit_log_end(ab);
841 625
842 for (aux = context->aux; aux; aux = aux->next) { 626 for (aux = context->aux; aux; aux = aux->next) {
@@ -849,8 +633,39 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
849 case AUDIT_IPC: { 633 case AUDIT_IPC: {
850 struct audit_aux_data_ipcctl *axi = (void *)aux; 634 struct audit_aux_data_ipcctl *axi = (void *)aux;
851 audit_log_format(ab, 635 audit_log_format(ab,
852 " qbytes=%lx iuid=%u igid=%u mode=%x", 636 " qbytes=%lx iuid=%u igid=%u mode=%x",
853 axi->qbytes, axi->uid, axi->gid, axi->mode); 637 axi->qbytes, axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) {
639 char *ctx = NULL;
640 u32 len;
641 if (selinux_ctxid_to_string(
642 axi->osid, &ctx, &len)) {
643 audit_log_format(ab, " osid=%u",
644 axi->osid);
645 call_panic = 1;
646 } else
647 audit_log_format(ab, " obj=%s", ctx);
648 kfree(ctx);
649 }
650 break; }
651
652 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) {
658 char *ctx = NULL;
659 u32 len;
660 if (selinux_ctxid_to_string(
661 axi->osid, &ctx, &len)) {
662 audit_log_format(ab, " osid=%u",
663 axi->osid);
664 call_panic = 1;
665 } else
666 audit_log_format(ab, " obj=%s", ctx);
667 kfree(ctx);
668 }
854 break; } 669 break; }
855 670
856 case AUDIT_SOCKETCALL: { 671 case AUDIT_SOCKETCALL: {
@@ -885,42 +700,65 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
885 } 700 }
886 } 701 }
887 for (i = 0; i < context->name_count; i++) { 702 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino;
704 unsigned long pino = context->names[i].pino;
705
888 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
889 if (!ab) 707 if (!ab)
890 continue; /* audit_panic has been called */ 708 continue; /* audit_panic has been called */
891 709
892 audit_log_format(ab, "item=%d", i); 710 audit_log_format(ab, "item=%d", i);
893 if (context->names[i].name) { 711
894 audit_log_format(ab, " name="); 712 audit_log_format(ab, " name=");
713 if (context->names[i].name)
895 audit_log_untrustedstring(ab, context->names[i].name); 714 audit_log_untrustedstring(ab, context->names[i].name);
896 } 715 else
897 audit_log_format(ab, " flags=%x\n", context->names[i].flags); 716 audit_log_format(ab, "(null)");
898 717
899 if (context->names[i].ino != (unsigned long)-1) 718 if (pino != (unsigned long)-1)
900 audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" 719 audit_log_format(ab, " parent=%lu", pino);
901 " ouid=%u ogid=%u rdev=%02x:%02x", 720 if (ino != (unsigned long)-1)
902 context->names[i].ino, 721 audit_log_format(ab, " inode=%lu", ino);
903 MAJOR(context->names[i].dev), 722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
904 MINOR(context->names[i].dev), 723 audit_log_format(ab, " dev=%02x:%02x mode=%#o"
905 context->names[i].mode, 724 " ouid=%u ogid=%u rdev=%02x:%02x",
906 context->names[i].uid, 725 MAJOR(context->names[i].dev),
907 context->names[i].gid, 726 MINOR(context->names[i].dev),
908 MAJOR(context->names[i].rdev), 727 context->names[i].mode,
728 context->names[i].uid,
729 context->names[i].gid,
730 MAJOR(context->names[i].rdev),
909 MINOR(context->names[i].rdev)); 731 MINOR(context->names[i].rdev));
732 if (context->names[i].osid != 0) {
733 char *ctx = NULL;
734 u32 len;
735 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u",
738 context->names[i].osid);
739 call_panic = 2;
740 } else
741 audit_log_format(ab, " obj=%s", ctx);
742 kfree(ctx);
743 }
744
910 audit_log_end(ab); 745 audit_log_end(ab);
911 } 746 }
747 if (call_panic)
748 audit_panic("error converting sid to string");
912} 749}
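Each record emitted above is a flat run of space-separated key=value pairs, and untrusted values such as path names are either quoted or hex-encoded so they cannot smuggle extra fields into the log line. A deliberately simplified user-space model of that formatting; the kernel's exact escaping rules differ in detail:

#include <ctype.h>
#include <stdio.h>

/* Simplified model of untrusted-string logging: emit the value in hex
 * if it contains anything that could confuse a key=value parser.
 * Illustrative only, not the kernel's precise rule set. */
static void log_untrusted(const char *key, const char *val)
{
	int clean = 1;

	for (const char *p = val; *p; p++)
		if (!isprint((unsigned char)*p) || *p == '"' || *p == ' ')
			clean = 0;

	printf(" %s=", key);
	if (clean) {
		printf("\"%s\"", val);
	} else {
		for (const char *p = val; *p; p++)
			printf("%02X", (unsigned char)*p);
	}
}

int main(void)
{
	printf("type=PATH item=0");
	log_untrusted("name", "/etc/passwd");
	log_untrusted("name", "evil name\n");
	printf("\n");
	return 0;
}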
913 750
914/* Free a per-task audit context. Called from copy_process and 751/**
915 * __put_task_struct. */ 752 * audit_free - free a per-task audit context
753 * @tsk: task whose audit context block to free
754 *
755 * Called from copy_process and do_exit
756 */
916void audit_free(struct task_struct *tsk) 757void audit_free(struct task_struct *tsk)
917{ 758{
918 struct audit_context *context; 759 struct audit_context *context;
919 760
920 task_lock(tsk);
921 context = audit_get_context(tsk, 0, 0); 761 context = audit_get_context(tsk, 0, 0);
922 task_unlock(tsk);
923
924 if (likely(!context)) 762 if (likely(!context))
925 return; 763 return;
926 764
@@ -928,29 +766,43 @@ void audit_free(struct task_struct *tsk)
928 * function (e.g., exit_group), then free context block. 766 * function (e.g., exit_group), then free context block.
929 * We use GFP_ATOMIC here because we might be doing this 767 * We use GFP_ATOMIC here because we might be doing this
930 * in the context of the idle thread */ 768 * in the context of the idle thread */
769 /* that can happen only if we are called from do_exit() */
931 if (context->in_syscall && context->auditable) 770 if (context->in_syscall && context->auditable)
932 audit_log_exit(context, GFP_ATOMIC); 771 audit_log_exit(context, tsk);
933 772
934 audit_free_context(context); 773 audit_free_context(context);
935} 774}
936 775
937/* Fill in audit context at syscall entry. This only happens if the 776/**
777 * audit_syscall_entry - fill in an audit record at syscall entry
779 * @arch: architecture type
780 * @major: major syscall type (function)
781 * @a1: additional syscall register 1
782 * @a2: additional syscall register 2
783 * @a3: additional syscall register 3
784 * @a4: additional syscall register 4
785 *
786 * Fill in audit context at syscall entry. This only happens if the
938 * audit context was created when the task was created and the state or 787 * audit context was created when the task was created and the state or
939 * filters demand the audit context be built. If the state from the 788 * filters demand the audit context be built. If the state from the
940 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, 789 * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
941 * then the record will be written at syscall exit time (otherwise, it 790 * then the record will be written at syscall exit time (otherwise, it
942 * will only be written if another part of the kernel requests that it 791 * will only be written if another part of the kernel requests that it
943 * be written). */ 792 * be written).
944void audit_syscall_entry(struct task_struct *tsk, int arch, int major, 793 */
794void audit_syscall_entry(int arch, int major,
945 unsigned long a1, unsigned long a2, 795 unsigned long a1, unsigned long a2,
946 unsigned long a3, unsigned long a4) 796 unsigned long a3, unsigned long a4)
947{ 797{
798 struct task_struct *tsk = current;
948 struct audit_context *context = tsk->audit_context; 799 struct audit_context *context = tsk->audit_context;
949 enum audit_state state; 800 enum audit_state state;
950 801
951 BUG_ON(!context); 802 BUG_ON(!context);
952 803
953 /* This happens only on certain architectures that make system 804 /*
805 * This happens only on certain architectures that make system
954 * calls in kernel_thread via the entry.S interface, instead of 806 * calls in kernel_thread via the entry.S interface, instead of
955 * with direct calls. (If you are porting to a new 807 * with direct calls. (If you are porting to a new
956 * architecture, hitting this condition can indicate that you 808 * architecture, hitting this condition can indicate that you
@@ -958,7 +810,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
958 * 810 *
959 * i386 no 811 * i386 no
960 * x86_64 no 812 * x86_64 no
961 * ppc64 yes (see arch/ppc64/kernel/misc.S) 813 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
962 * 814 *
963 * This also happens with vm86 emulation in a non-nested manner 815 * This also happens with vm86 emulation in a non-nested manner
964 * (entries without exits), so this case must be caught. 816 * (entries without exits), so this case must be caught.
@@ -966,11 +818,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
966 if (context->in_syscall) { 818 if (context->in_syscall) {
967 struct audit_context *newctx; 819 struct audit_context *newctx;
968 820
969#if defined(__NR_vm86) && defined(__NR_vm86old)
970 /* vm86 mode should only be entered once */
971 if (major == __NR_vm86 || major == __NR_vm86old)
972 return;
973#endif
974#if AUDIT_DEBUG 821#if AUDIT_DEBUG
975 printk(KERN_ERR 822 printk(KERN_ERR
976 "audit(:%d) pid=%d in syscall=%d;" 823 "audit(:%d) pid=%d in syscall=%d;"
@@ -1014,27 +861,30 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
1014 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 861 context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
1015} 862}
1016 863
1017/* Tear down after system call. If the audit context has been marked as 864/**
865 * audit_syscall_exit - deallocate audit context after a system call
867 * @valid: success/failure flag
868 * @return_code: syscall return value
869 *
870 * Tear down after system call. If the audit context has been marked as
1018 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 871 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1019 * filtering, or because some other part of the kernel write an audit 872 * filtering, or because some other part of the kernel write an audit
1020 * message), then write out the syscall information. In call cases, 873 * message), then write out the syscall information. In call cases,
1021 * free the names stored from getname(). */ 874 * free the names stored from getname().
1022void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) 875 */
876void audit_syscall_exit(int valid, long return_code)
1023{ 877{
878 struct task_struct *tsk = current;
1024 struct audit_context *context; 879 struct audit_context *context;
1025 880
1026 get_task_struct(tsk);
1027 task_lock(tsk);
1028 context = audit_get_context(tsk, valid, return_code); 881 context = audit_get_context(tsk, valid, return_code);
1029 task_unlock(tsk);
1030 882
1031 /* Not having a context here is ok, since the parent may have
1032 * called __put_task_struct. */
1033 if (likely(!context)) 883 if (likely(!context))
1034 goto out; 884 return;
1035 885
1036 if (context->in_syscall && context->auditable) 886 if (context->in_syscall && context->auditable)
1037 audit_log_exit(context, GFP_KERNEL); 887 audit_log_exit(context, tsk);
1038 888
1039 context->in_syscall = 0; 889 context->in_syscall = 0;
1040 context->auditable = 0; 890 context->auditable = 0;
@@ -1049,11 +899,15 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
1049 audit_free_aux(context); 899 audit_free_aux(context);
1050 tsk->audit_context = context; 900 tsk->audit_context = context;
1051 } 901 }
1052 out:
1053 put_task_struct(tsk);
1054} 902}
1055 903
1056/* Add a name to the list. Called from fs/namei.c:getname(). */ 904/**
905 * audit_getname - add a name to the list
906 * @name: name to add
907 *
908 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname().
910 */
1057void audit_getname(const char *name) 911void audit_getname(const char *name)
1058{ 912{
1059 struct audit_context *context = current->audit_context; 913 struct audit_context *context = current->audit_context;
@@ -1082,10 +936,13 @@ void audit_getname(const char *name)
1082 936
1083} 937}
1084 938
1085/* Intercept a putname request. Called from 939/* audit_putname - intercept a putname request
1086 * include/linux/fs.h:putname(). If we have stored the name from 940 * @name: name to intercept and delay for putname
1087 * getname in the audit context, then we delay the putname until syscall 941 *
1088 * exit. */ 942 * If we have stored the name from getname in the audit context,
943 * then we delay the putname until syscall exit.
944 * Called from include/linux/fs.h:putname().
945 */
1089void audit_putname(const char *name) 946void audit_putname(const char *name)
1090{ 947{
1091 struct audit_context *context = current->audit_context; 948 struct audit_context *context = current->audit_context;
@@ -1100,7 +957,7 @@ void audit_putname(const char *name)
1100 for (i = 0; i < context->name_count; i++) 957 for (i = 0; i < context->name_count; i++)
1101 printk(KERN_ERR "name[%d] = %p = %s\n", i, 958 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1102 context->names[i].name, 959 context->names[i].name,
1103 context->names[i].name); 960 context->names[i].name ?: "(null)");
1104 } 961 }
1105#endif 962#endif
1106 __putname(name); 963 __putname(name);
@@ -1122,9 +979,23 @@ void audit_putname(const char *name)
1122#endif 979#endif
1123} 980}
1124 981
1125/* Store the inode and device from a lookup. Called from 982static void audit_inode_context(int idx, const struct inode *inode)
1126 * fs/namei.c:path_lookup(). */ 983{
1127void audit_inode(const char *name, const struct inode *inode, unsigned flags) 984 struct audit_context *context = current->audit_context;
985
986 selinux_get_inode_sid(inode, &context->names[idx].osid);
987}
988
989
990/**
991 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited
993 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 *
996 * Called from fs/namei.c:path_lookup().
997 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1128{ 999{
1129 int idx; 1000 int idx;
1130 struct audit_context *context = current->audit_context; 1001 struct audit_context *context = current->audit_context;
@@ -1150,15 +1021,105 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
1150 ++context->ino_count; 1021 ++context->ino_count;
1151#endif 1022#endif
1152 } 1023 }
1153 context->names[idx].flags = flags;
1154 context->names[idx].ino = inode->i_ino;
1155 context->names[idx].dev = inode->i_sb->s_dev; 1024 context->names[idx].dev = inode->i_sb->s_dev;
1156 context->names[idx].mode = inode->i_mode; 1025 context->names[idx].mode = inode->i_mode;
1157 context->names[idx].uid = inode->i_uid; 1026 context->names[idx].uid = inode->i_uid;
1158 context->names[idx].gid = inode->i_gid; 1027 context->names[idx].gid = inode->i_gid;
1159 context->names[idx].rdev = inode->i_rdev; 1028 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1160} 1038}
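For a LOOKUP_PARENT-style lookup the inode that was resolved is the parent directory, so it is stored in pino and the target inode is left unknown; an ordinary lookup stores the target inode instead. A tiny stand-alone model of that decision, where NO_INO stands in for the kernel's (unsigned long)-1 sentinel:

#include <stdio.h>
#include <string.h>

#define NO_INO ((unsigned long)-1)

struct name_rec { unsigned long ino, pino; };

/* When only the parent was resolved (and the name is not "/" or "."),
 * the inode we have is recorded as the parent. */
static struct name_rec record(const char *name, unsigned long ino, int parent_only)
{
	struct name_rec r;

	if (parent_only && strcmp(name, "/") && strcmp(name, ".")) {
		r.ino = NO_INO;
		r.pino = ino;
	} else {
		r.ino = ino;
		r.pino = NO_INO;
	}
	return r;
}

int main(void)
{
	struct name_rec a = record("/tmp/newfile", 100, 1);	/* creating a file */
	struct name_rec b = record("/etc/passwd", 200, 0);	/* plain lookup */

	printf("a: ino=%ld pino=%ld\n", (long)a.ino, (long)a.pino);
	printf("b: ino=%ld pino=%ld\n", (long)b.ino, (long)b.pino);
	return 0;
}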
1161 1039
1040/**
1041 * audit_inode_child - collect inode info for created/removed objects
1042 * @dname: inode's dentry name
1043 * @inode: inode being audited
1044 * @pino: inode number of dentry parent
1045 *
1046 * For syscalls that create or remove filesystem objects, audit_inode
1047 * can only collect information for the filesystem object's parent.
1048 * This call updates the audit context with the child's information.
1049 * Syscalls that create a new filesystem object must be hooked after
1050 * the object is created. Syscalls that remove a filesystem object
1051 * must be hooked prior, in order to capture the target inode during
1052 * unsuccessful attempts.
1053 */
1054void __audit_inode_child(const char *dname, const struct inode *inode,
1055 unsigned long pino)
1056{
1057 int idx;
1058 struct audit_context *context = current->audit_context;
1059
1060 if (!context->in_syscall)
1061 return;
1062
1063 /* determine matching parent */
1064 if (dname)
1065 for (idx = 0; idx < context->name_count; idx++)
1066 if (context->names[idx].pino == pino) {
1067 const char *n;
1068 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname);
1070 int nlen = name ? strlen(name) : 0;
1071
1072 if (nlen < dlen)
1073 continue;
1074
1075 /* disregard trailing slashes */
1076 n = name + nlen - 1;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 }
1094
1095 /* catch-all in case match not found */
1096 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG
1100 context->ino_count++;
1101#endif
1102
1103update_context:
1104 if (inode) {
1105 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev;
1107 context->names[idx].mode = inode->i_mode;
1108 context->names[idx].uid = inode->i_uid;
1109 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode);
1112 }
1113}
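The parent-matching loop above strips trailing slashes from each stored name and then checks whether its final path component equals the dentry name. The same string logic as a stand-alone, testable function:

#include <assert.h>
#include <string.h>

/* Does 'name' (a possibly slash-terminated path) end in the path
 * component 'dname'?  Mirrors the matching done in __audit_inode_child(). */
static int ends_in_component(const char *name, const char *dname)
{
	size_t dlen = strlen(dname);
	size_t nlen = strlen(name);
	const char *n;

	if (nlen < dlen)
		return 0;

	/* disregard trailing slashes */
	n = name + nlen - 1;
	while (*n == '/' && n > name)
		n--;

	/* step back to where the component would start */
	n = n - dlen + 1;
	if (n < name)
		return 0;
	if (n > name && *(n - 1) != '/')
		return 0;
	return strncmp(n, dname, dlen) == 0;
}

int main(void)
{
	assert(ends_in_component("/tmp/dir/", "dir"));
	assert(ends_in_component("/tmp/file", "file"));
	assert(!ends_in_component("/tmp/notdir", "dir"));
	assert(ends_in_component("file", "file"));
	return 0;
}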
1114
1115/**
1116 * auditsc_get_stamp - get local copies of audit_context values
1117 * @ctx: audit_context for the task
1118 * @t: timespec to store time recorded in the audit_context
1119 * @serial: serial value that is recorded in the audit_context
1120 *
1121 * Also sets the context as auditable.
1122 */
1162void auditsc_get_stamp(struct audit_context *ctx, 1123void auditsc_get_stamp(struct audit_context *ctx,
1163 struct timespec *t, unsigned int *serial) 1124 struct timespec *t, unsigned int *serial)
1164{ 1125{
@@ -1170,6 +1131,15 @@ void auditsc_get_stamp(struct audit_context *ctx,
1170 ctx->auditable = 1; 1131 ctx->auditable = 1;
1171} 1132}
1172 1133
1134/**
1135 * audit_set_loginuid - set a task's audit_context loginuid
1136 * @task: task whose audit context is being modified
1137 * @loginuid: loginuid value
1138 *
1139 * Returns 0.
1140 *
1141 * Called (set) from fs/proc/base.c::proc_loginuid_write().
1142 */
1173int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1174{ 1144{
1175 if (task->audit_context) { 1145 if (task->audit_context) {
@@ -1188,12 +1158,24 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1188 return 0; 1158 return 0;
1189} 1159}
1190 1160
1161/**
1162 * audit_get_loginuid - get the loginuid for an audit_context
1163 * @ctx: the audit_context
1164 *
1165 * Returns the context's loginuid or -1 if @ctx is NULL.
1166 */
1191uid_t audit_get_loginuid(struct audit_context *ctx) 1167uid_t audit_get_loginuid(struct audit_context *ctx)
1192{ 1168{
1193 return ctx ? ctx->loginuid : -1; 1169 return ctx ? ctx->loginuid : -1;
1194} 1170}
1195 1171
1196int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 1172/**
1173 * audit_ipc_obj - record audit data for ipc object
1174 * @ipcp: ipc permissions
1175 *
1176 * Returns 0 for success or NULL context or < 0 on error.
1177 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1197{ 1179{
1198 struct audit_aux_data_ipcctl *ax; 1180 struct audit_aux_data_ipcctl *ax;
1199 struct audit_context *context = current->audit_context; 1181 struct audit_context *context = current->audit_context;
@@ -1201,7 +1183,39 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1201 if (likely(!context)) 1183 if (likely(!context))
1202 return 0; 1184 return 0;
1203 1185
1204 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 1186 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1187 if (!ax)
1188 return -ENOMEM;
1189
1190 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode;
1193 selinux_get_ipc_sid(ipcp, &ax->osid);
1194
1195 ax->d.type = AUDIT_IPC;
1196 ax->d.next = context->aux;
1197 context->aux = (void *)ax;
1198 return 0;
1199}
1200
1201/**
1202 * audit_ipc_set_perm - record audit data for new ipc permissions
1203 * @qbytes: msgq bytes
1204 * @uid: msgq user id
1205 * @gid: msgq group id
1206 * @mode: msgq mode (permissions)
1207 *
1208 * Returns 0 for success or NULL context or < 0 on error.
1209 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
1211{
1212 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context;
1214
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1205 if (!ax) 1219 if (!ax)
1206 return -ENOMEM; 1220 return -ENOMEM;
1207 1221
@@ -1209,13 +1223,21 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1209 ax->uid = uid; 1223 ax->uid = uid;
1210 ax->gid = gid; 1224 ax->gid = gid;
1211 ax->mode = mode; 1225 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1212 1227
1213 ax->d.type = AUDIT_IPC; 1228 ax->d.type = AUDIT_IPC_SET_PERM;
1214 ax->d.next = context->aux; 1229 ax->d.next = context->aux;
1215 context->aux = (void *)ax; 1230 context->aux = (void *)ax;
1216 return 0; 1231 return 0;
1217} 1232}
1218 1233
1234/**
1235 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args
1237 * @args: args array
1238 *
1239 * Returns 0 for success or NULL context or < 0 on error.
1240 */
1219int audit_socketcall(int nargs, unsigned long *args) 1241int audit_socketcall(int nargs, unsigned long *args)
1220{ 1242{
1221 struct audit_aux_data_socketcall *ax; 1243 struct audit_aux_data_socketcall *ax;
@@ -1237,6 +1259,13 @@ int audit_socketcall(int nargs, unsigned long *args)
1237 return 0; 1259 return 0;
1238} 1260}
1239 1261
1262/**
1263 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
1264 * @len: data length in user space
1265 * @a: data address in kernel space
1266 *
1267 * Returns 0 for success or NULL context or < 0 on error.
1268 */
1240int audit_sockaddr(int len, void *a) 1269int audit_sockaddr(int len, void *a)
1241{ 1270{
1242 struct audit_aux_data_sockaddr *ax; 1271 struct audit_aux_data_sockaddr *ax;
@@ -1258,6 +1287,15 @@ int audit_sockaddr(int len, void *a)
1258 return 0; 1287 return 0;
1259} 1288}
1260 1289
1290/**
1291 * audit_avc_path - record the granting or denial of permissions
1292 * @dentry: dentry to record
1293 * @mnt: mnt to record
1294 *
1295 * Returns 0 for success or NULL context or < 0 on error.
1296 *
1297 * Called from security/selinux/avc.c::avc_audit()
1298 */
1261int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) 1299int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1262{ 1300{
1263 struct audit_aux_data_path *ax; 1301 struct audit_aux_data_path *ax;
@@ -1279,6 +1317,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1279 return 0; 1317 return 0;
1280} 1318}
1281 1319
1320/**
1321 * audit_signal_info - record signal info for shutting down audit subsystem
1322 * @sig: signal value
1323 * @t: task being signaled
1324 *
1325 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that.
1327 */
1282void audit_signal_info(int sig, struct task_struct *t) 1328void audit_signal_info(int sig, struct task_struct *t)
1283{ 1329{
1284 extern pid_t audit_sig_pid; 1330 extern pid_t audit_sig_pid;
@@ -1295,4 +1341,3 @@ void audit_signal_info(int sig, struct task_struct *t)
1295 } 1341 }
1296 } 1342 }
1297} 1343}
1298
diff --git a/kernel/capability.c b/kernel/capability.c
index bfa3c92e16..1a4d8a40d3 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -233,3 +233,19 @@ out:
233 233
234 return ret; 234 return ret;
235} 235}
236
237int __capable(struct task_struct *t, int cap)
238{
239 if (security_capable(t, cap) == 0) {
240 t->flags |= PF_SUPERPRIV;
241 return 1;
242 }
243 return 0;
244}
245EXPORT_SYMBOL(__capable);
246
247int capable(int cap)
248{
249 return __capable(current, cap);
250}
251EXPORT_SYMBOL(capable);
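capable() and __capable() are kernel-internal: in-kernel callers simply test capable(CAP_FOO) before a privileged operation and get PF_SUPERPRIV accounting as a side effect. The closest user-space way to ask the same question about the current process is to read its effective capability set, for example with libcap (assumed installed; compile with -lcap). This is only an analogue, not how the kernel call works:

/* Compile with: cc check_cap.c -lcap */
#include <stdio.h>
#include <sys/capability.h>

int main(void)
{
	cap_t caps = cap_get_proc();
	cap_flag_value_t has = CAP_CLEAR;

	if (!caps) {
		perror("cap_get_proc");
		return 1;
	}
	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &has) == -1)
		perror("cap_get_flag");
	printf("CAP_NET_ADMIN effective: %s\n", has == CAP_SET ? "yes" : "no");
	cap_free(caps);
	return 0;
}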
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b67..c1601a84f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,10 +17,10 @@
17#include <linux/time.h> 17#include <linux/time.h>
18#include <linux/signal.h> 18#include <linux/signal.h>
19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ 19#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
20#include <linux/futex.h> /* for FUTEX_WAIT */
21#include <linux/syscalls.h> 20#include <linux/syscalls.h>
22#include <linux/unistd.h> 21#include <linux/unistd.h>
23#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h>
24 24
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26 26
@@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
238 return ret; 238 return ret;
239} 239}
240 240
241#ifdef CONFIG_FUTEX
242asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
243 struct compat_timespec __user *utime, u32 __user *uaddr2,
244 int val3)
245{
246 struct timespec t;
247 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
248 int val2 = 0;
249
250 if ((op == FUTEX_WAIT) && utime) {
251 if (get_compat_timespec(&t, utime))
252 return -EFAULT;
253 timeout = timespec_to_jiffies(&t) + 1;
254 }
255 if (op >= FUTEX_REQUEUE)
256 val2 = (int) (unsigned long) utime;
257
258 return do_futex((unsigned long)uaddr, op, val, timeout,
259 (unsigned long)uaddr2, val2, val3);
260}
261#endif
262
263asmlinkage long compat_sys_setrlimit(unsigned int resource, 241asmlinkage long compat_sys_setrlimit(unsigned int resource,
264 struct compat_rlimit __user *rlim) 242 struct compat_rlimit __user *rlim)
265{ 243{
@@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
898 return -ERESTARTNOHAND; 876 return -ERESTARTNOHAND;
899} 877}
900#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 878#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
879
880asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
881{
882 struct timex txc;
883 int ret;
884
885 memset(&txc, 0, sizeof(struct timex));
886
887 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
888 __get_user(txc.modes, &utp->modes) ||
889 __get_user(txc.offset, &utp->offset) ||
890 __get_user(txc.freq, &utp->freq) ||
891 __get_user(txc.maxerror, &utp->maxerror) ||
892 __get_user(txc.esterror, &utp->esterror) ||
893 __get_user(txc.status, &utp->status) ||
894 __get_user(txc.constant, &utp->constant) ||
895 __get_user(txc.precision, &utp->precision) ||
896 __get_user(txc.tolerance, &utp->tolerance) ||
897 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
898 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
899 __get_user(txc.tick, &utp->tick) ||
900 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
901 __get_user(txc.jitter, &utp->jitter) ||
902 __get_user(txc.shift, &utp->shift) ||
903 __get_user(txc.stabil, &utp->stabil) ||
904 __get_user(txc.jitcnt, &utp->jitcnt) ||
905 __get_user(txc.calcnt, &utp->calcnt) ||
906 __get_user(txc.errcnt, &utp->errcnt) ||
907 __get_user(txc.stbcnt, &utp->stbcnt))
908 return -EFAULT;
909
910 ret = do_adjtimex(&txc);
911
912 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
913 __put_user(txc.modes, &utp->modes) ||
914 __put_user(txc.offset, &utp->offset) ||
915 __put_user(txc.freq, &utp->freq) ||
916 __put_user(txc.maxerror, &utp->maxerror) ||
917 __put_user(txc.esterror, &utp->esterror) ||
918 __put_user(txc.status, &utp->status) ||
919 __put_user(txc.constant, &utp->constant) ||
920 __put_user(txc.precision, &utp->precision) ||
921 __put_user(txc.tolerance, &utp->tolerance) ||
922 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
923 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
924 __put_user(txc.tick, &utp->tick) ||
925 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
926 __put_user(txc.jitter, &utp->jitter) ||
927 __put_user(txc.shift, &utp->shift) ||
928 __put_user(txc.stabil, &utp->stabil) ||
929 __put_user(txc.jitcnt, &utp->jitcnt) ||
930 __put_user(txc.calcnt, &utp->calcnt) ||
931 __put_user(txc.errcnt, &utp->errcnt) ||
932 __put_user(txc.stbcnt, &utp->stbcnt))
933 ret = -EFAULT;
934
935 return ret;
936}
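
compat_sys_adjtimex() exists because struct timex is built from native longs, so its 32-bit and 64-bit layouts differ and every field has to be translated in both directions around do_adjtimex(). Nothing changes for user space; a 32-bit caller keeps using the ordinary interface. A small user-space sketch (assuming the usual libc adjtimex() wrapper) of the call this handler services:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* modes == 0: read-only query */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	/* freq is ppm shifted left 16 bits, maxerror is in microseconds */
	printf("clock state %d  freq %ld  maxerror %ld us\n",
	       state, tx.freq, tx.maxerror);
	return 0;
}
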
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e882c6babf..fe2b8d0bfe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DECLARE_MUTEX(cpucontrol);
20 20
21static struct notifier_block *cpu_chain; 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 int ret; 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75
76 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
77 return ret;
78 ret = notifier_chain_register(&cpu_chain, nb);
79 unlock_cpu_hotplug();
80 return ret;
81} 75}
82EXPORT_SYMBOL(register_cpu_notifier); 76EXPORT_SYMBOL(register_cpu_notifier);
83 77
84void unregister_cpu_notifier(struct notifier_block *nb) 78void unregister_cpu_notifier(struct notifier_block *nb)
85{ 79{
86 lock_cpu_hotplug(); 80 blocking_notifier_chain_unregister(&cpu_chain, nb);
87 notifier_chain_unregister(&cpu_chain, nb);
88 unlock_cpu_hotplug();
89} 81}
90EXPORT_SYMBOL(unregister_cpu_notifier); 82EXPORT_SYMBOL(unregister_cpu_notifier);
91 83
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
141 goto out; 133 goto out;
142 } 134 }
143 135
144 err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 136 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
145 (void *)(long)cpu); 137 (void *)(long)cpu);
146 if (err == NOTIFY_BAD) { 138 if (err == NOTIFY_BAD) {
147 printk("%s: attempt to take down CPU %u failed\n", 139 printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
159 p = __stop_machine_run(take_cpu_down, NULL, cpu); 151 p = __stop_machine_run(take_cpu_down, NULL, cpu);
160 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
161 /* CPU didn't die: tell everyone. Can't complain. */ 153 /* CPU didn't die: tell everyone. Can't complain. */
162 if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 154 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
163 (void *)(long)cpu) == NOTIFY_BAD) 155 (void *)(long)cpu) == NOTIFY_BAD)
164 BUG(); 156 BUG();
165 157
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
182 put_cpu(); 174 put_cpu();
183 175
184 /* CPU is completely dead: tell everyone. Too late to complain. */ 176 /* CPU is completely dead: tell everyone. Too late to complain. */
185 if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) 177 if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
186 == NOTIFY_BAD) 178 (void *)(long)cpu) == NOTIFY_BAD)
187 BUG(); 179 BUG();
188 180
189 check_for_tasks(cpu); 181 check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
211 goto out; 203 goto out;
212 } 204 }
213 205
214 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 206 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
215 if (ret == NOTIFY_BAD) { 207 if (ret == NOTIFY_BAD) {
216 printk("%s: attempt to bring up CPU %u failed\n", 208 printk("%s: attempt to bring up CPU %u failed\n",
217 __FUNCTION__, cpu); 209 __FUNCTION__, cpu);
@@ -223,15 +215,15 @@ int __devinit cpu_up(unsigned int cpu)
223 ret = __cpu_up(cpu); 215 ret = __cpu_up(cpu);
224 if (ret != 0) 216 if (ret != 0)
225 goto out_notify; 217 goto out_notify;
226 if (!cpu_online(cpu)) 218 BUG_ON(!cpu_online(cpu));
227 BUG();
228 219
229 /* Now call notifier in preparation. */ 220 /* Now call notifier in preparation. */
230 notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 221 blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
231 222
232out_notify: 223out_notify:
233 if (ret != 0) 224 if (ret != 0)
234 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 225 blocking_notifier_call_chain(&cpu_chain,
226 CPU_UP_CANCELED, hcpu);
235out: 227out:
236 unlock_cpu_hotplug(); 228 unlock_cpu_hotplug();
237 return ret; 229 return ret;
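
The cpu.c conversion above replaces the open-coded notifier chain (and the hotplug locking around its registration) with a BLOCKING_NOTIFIER_HEAD whose register/unregister/call helpers do their own locking. Clients of register_cpu_notifier() are unaffected; a hedged sketch of such a client, with a hypothetical callback managing some per-CPU resource:

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		printk(KERN_DEBUG "allocating per-CPU state for cpu %u\n", cpu);
		break;
	case CPU_ONLINE:
		printk(KERN_DEBUG "cpu %u online, state in use\n", cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		printk(KERN_DEBUG "cpu %u gone, releasing state\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

/* driver init:  register_cpu_notifier(&my_cpu_nb);   */
/* driver exit:  unregister_cpu_notifier(&my_cpu_nb); */
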
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba42b0a769..ab81fdd457 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 * Portions derived from Patrick Mochel's sysfs code. 9 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 10 * sysfs is Copyright (c) 2001-3 Patrick Mochel
11 * Portions Copyright (c) 2004 Silicon Graphics, Inc.
12 * 11 *
13 * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> 12 * 2003-10-10 Written by Simon Derr.
14 * 2003-10-22 Updates by Stephen Hemminger. 13 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson <pj@sgi.com> 14 * 2004 May-July Rework by Paul Jackson.
16 * 15 *
17 * This file is subject to the terms and conditions of the GNU General Public 16 * This file is subject to the terms and conditions of the GNU General Public
18 * License. See the file COPYING in the main directory of the Linux 17 * License. See the file COPYING in the main directory of the Linux
@@ -53,7 +52,7 @@
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
55#include <asm/atomic.h> 54#include <asm/atomic.h>
56#include <asm/semaphore.h> 55#include <linux/mutex.h>
57 56
58#define CPUSET_SUPER_MAGIC 0x27e0eb 57#define CPUSET_SUPER_MAGIC 0x27e0eb
59 58
@@ -108,37 +107,49 @@ typedef enum {
108 CS_MEM_EXCLUSIVE, 107 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE, 108 CS_MEMORY_MIGRATE,
110 CS_REMOVED, 109 CS_REMOVED,
111 CS_NOTIFY_ON_RELEASE 110 CS_NOTIFY_ON_RELEASE,
111 CS_SPREAD_PAGE,
112 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 113} cpuset_flagbits_t;
113 114
114/* convenient tests for these bits */ 115/* convenient tests for these bits */
115static inline int is_cpu_exclusive(const struct cpuset *cs) 116static inline int is_cpu_exclusive(const struct cpuset *cs)
116{ 117{
117 return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 118 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
118} 119}
119 120
120static inline int is_mem_exclusive(const struct cpuset *cs) 121static inline int is_mem_exclusive(const struct cpuset *cs)
121{ 122{
122 return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 123 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 124}
124 125
125static inline int is_removed(const struct cpuset *cs) 126static inline int is_removed(const struct cpuset *cs)
126{ 127{
127 return !!test_bit(CS_REMOVED, &cs->flags); 128 return test_bit(CS_REMOVED, &cs->flags);
128} 129}
129 130
130static inline int notify_on_release(const struct cpuset *cs) 131static inline int notify_on_release(const struct cpuset *cs)
131{ 132{
132 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 133 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133} 134}
134 135
135static inline int is_memory_migrate(const struct cpuset *cs) 136static inline int is_memory_migrate(const struct cpuset *cs)
136{ 137{
137 return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); 138 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
139}
140
141static inline int is_spread_page(const struct cpuset *cs)
142{
143 return test_bit(CS_SPREAD_PAGE, &cs->flags);
144}
145
146static inline int is_spread_slab(const struct cpuset *cs)
147{
148 return test_bit(CS_SPREAD_SLAB, &cs->flags);
138} 149}
139 150
140/* 151/*
141 * Increment this atomic integer everytime any cpuset changes its 152 * Increment this integer every time any cpuset changes its
142 * mems_allowed value. Users of cpusets can track this generation 153 * mems_allowed value. Users of cpusets can track this generation
143 * number, and avoid having to lock and reload mems_allowed unless 154 * number, and avoid having to lock and reload mems_allowed unless
144 * the cpuset they're using changes generation. 155 * the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
152 * on every visit to __alloc_pages(), to efficiently check whether 163 * on every visit to __alloc_pages(), to efficiently check whether
153 * its current->cpuset->mems_allowed has changed, requiring an update 164 * its current->cpuset->mems_allowed has changed, requiring an update
154 * of its current->mems_allowed. 165 * of its current->mems_allowed.
166 *
167 * Since cpuset_mems_generation is guarded by manage_mutex,
168 * there is no need to mark it atomic.
155 */ 169 */
156static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); 170static int cpuset_mems_generation;
157 171
158static struct cpuset top_cpuset = { 172static struct cpuset top_cpuset = {
159 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 173 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -168,63 +182,57 @@ static struct vfsmount *cpuset_mount;
168static struct super_block *cpuset_sb; 182static struct super_block *cpuset_sb;
169 183
170/* 184/*
171 * We have two global cpuset semaphores below. They can nest. 185 * We have two global cpuset mutexes below. They can nest.
172 * It is ok to first take manage_sem, then nest callback_sem. We also 186 * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 * require taking task_lock() when dereferencing a tasks cpuset pointer. 187 * require taking task_lock() when dereferencing a tasks cpuset pointer.
174 * See "The task_lock() exception", at the end of this comment. 188 * See "The task_lock() exception", at the end of this comment.
175 * 189 *
176 * A task must hold both semaphores to modify cpusets. If a task 190 * A task must hold both mutexes to modify cpusets. If a task
177 * holds manage_sem, then it blocks others wanting that semaphore, 191 * holds manage_mutex, then it blocks others wanting that mutex,
178 * ensuring that it is the only task able to also acquire callback_sem 192 * ensuring that it is the only task able to also acquire callback_mutex
179 * and be able to modify cpusets. It can perform various checks on 193 * and be able to modify cpusets. It can perform various checks on
180 * the cpuset structure first, knowing nothing will change. It can 194 * the cpuset structure first, knowing nothing will change. It can
181 * also allocate memory while just holding manage_sem. While it is 195 * also allocate memory while just holding manage_mutex. While it is
182 * performing these checks, various callback routines can briefly 196 * performing these checks, various callback routines can briefly
183 * acquire callback_sem to query cpusets. Once it is ready to make 197 * acquire callback_mutex to query cpusets. Once it is ready to make
184 * the changes, it takes callback_sem, blocking everyone else. 198 * the changes, it takes callback_mutex, blocking everyone else.
185 * 199 *
186 * Calls to the kernel memory allocator can not be made while holding 200 * Calls to the kernel memory allocator can not be made while holding
187 * callback_sem, as that would risk double tripping on callback_sem 201 * callback_mutex, as that would risk double tripping on callback_mutex
188 * from one of the callbacks into the cpuset code from within 202 * from one of the callbacks into the cpuset code from within
189 * __alloc_pages(). 203 * __alloc_pages().
190 * 204 *
191 * If a task is only holding callback_sem, then it has read-only 205 * If a task is only holding callback_mutex, then it has read-only
192 * access to cpusets. 206 * access to cpusets.
193 * 207 *
194 * The task_struct fields mems_allowed and mems_generation may only 208 * The task_struct fields mems_allowed and mems_generation may only
195 * be accessed in the context of that task, so require no locks. 209 * be accessed in the context of that task, so require no locks.
196 * 210 *
197 * Any task can increment and decrement the count field without lock. 211 * Any task can increment and decrement the count field without lock.
198 * So in general, code holding manage_sem or callback_sem can't rely 212 * So in general, code holding manage_mutex or callback_mutex can't rely
199 * on the count field not changing. However, if the count goes to 213 * on the count field not changing. However, if the count goes to
200 * zero, then only attach_task(), which holds both semaphores, can 214 * zero, then only attach_task(), which holds both mutexes, can
201 * increment it again. Because a count of zero means that no tasks 215 * increment it again. Because a count of zero means that no tasks
202 * are currently attached, therefore there is no way a task attached 216 * are currently attached, therefore there is no way a task attached
203 * to that cpuset can fork (the other way to increment the count). 217 * to that cpuset can fork (the other way to increment the count).
204 * So code holding manage_sem or callback_sem can safely assume that 218 * So code holding manage_mutex or callback_mutex can safely assume that
205 * if the count is zero, it will stay zero. Similarly, if a task 219 * if the count is zero, it will stay zero. Similarly, if a task
206 * holds manage_sem or callback_sem on a cpuset with zero count, it 220 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs 221 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208 * both of those semaphores. 222 * both of those mutexes.
209 *
210 * A possible optimization to improve parallelism would be to make
211 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212 * to proceed in parallel, with read access, until the holder of
213 * manage_sem needed to take this rwsem for exclusive write access
214 * and modify some cpusets.
215 * 223 *
216 * The cpuset_common_file_write handler for operations that modify 224 * The cpuset_common_file_write handler for operations that modify
217 * the cpuset hierarchy holds manage_sem across the entire operation, 225 * the cpuset hierarchy holds manage_mutex across the entire operation,
218 * single threading all such cpuset modifications across the system. 226 * single threading all such cpuset modifications across the system.
219 * 227 *
220 * The cpuset_common_file_read() handlers only hold callback_sem across 228 * The cpuset_common_file_read() handlers only hold callback_mutex across
221 * small pieces of code, such as when reading out possibly multi-word 229 * small pieces of code, such as when reading out possibly multi-word
222 * cpumasks and nodemasks. 230 * cpumasks and nodemasks.
223 * 231 *
224 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 232 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225 * (usually) take either semaphore. These are the two most performance 233 * (usually) take either mutex. These are the two most performance
226 * critical pieces of code here. The exception occurs on cpuset_exit(), 234 * critical pieces of code here. The exception occurs on cpuset_exit(),
227 * when a task in a notify_on_release cpuset exits. Then manage_sem 235 * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 * is taken, and if the cpuset count is zero, a usermode call made 236 * is taken, and if the cpuset count is zero, a usermode call made
229 * to /sbin/cpuset_release_agent with the name of the cpuset (path 237 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 * relative to the root of cpuset file system) as the argument. 238 * relative to the root of cpuset file system) as the argument.
@@ -242,9 +250,9 @@ static struct super_block *cpuset_sb;
242 * 250 *
243 * The need for this exception arises from the action of attach_task(), 251 * The need for this exception arises from the action of attach_task(),
244 * which overwrites one tasks cpuset pointer with another. It does 252 * which overwrites one tasks cpuset pointer with another. It does
245 * so using both semaphores, however there are several performance 253 * so using both mutexes, however there are several performance
246 * critical places that need to reference task->cpuset without the 254 * critical places that need to reference task->cpuset without the
247 * expense of grabbing a system global semaphore. Therefore except as 255 * expense of grabbing a system global mutex. Therefore except as
248 * noted below, when dereferencing or, as in attach_task(), modifying 256 * noted below, when dereferencing or, as in attach_task(), modifying
249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 257 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
250 * (task->alloc_lock) already in the task_struct routinely used for 258 * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +264,8 @@ static struct super_block *cpuset_sb;
256 * the routine cpuset_update_task_memory_state(). 264 * the routine cpuset_update_task_memory_state().
257 */ 265 */
258 266
259static DECLARE_MUTEX(manage_sem); 267static DEFINE_MUTEX(manage_mutex);
260static DECLARE_MUTEX(callback_sem); 268static DEFINE_MUTEX(callback_mutex);
261 269
262/* 270/*
263 * A couple of forward declarations required, due to cyclic reference loop: 271 * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +440,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
432} 440}
433 441
434/* 442/*
435 * Call with manage_sem held. Writes path of cpuset into buf. 443 * Call with manage_mutex held. Writes path of cpuset into buf.
436 * Returns 0 on success, -errno on error. 444 * Returns 0 on success, -errno on error.
437 */ 445 */
438 446
@@ -484,11 +492,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
484 * status of the /sbin/cpuset_release_agent task, so no sense holding 492 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 * our caller up for that. 493 * our caller up for that.
486 * 494 *
487 * When we had only one cpuset semaphore, we had to call this 495 * When we had only one cpuset mutex, we had to call this
488 * without holding it, to avoid deadlock when call_usermodehelper() 496 * without holding it, to avoid deadlock when call_usermodehelper()
489 * allocated memory. With two locks, we could now call this while 497 * allocated memory. With two locks, we could now call this while
490 * holding manage_sem, but we still don't, so as to minimize 498 * holding manage_mutex, but we still don't, so as to minimize
491 * the time manage_sem is held. 499 * the time manage_mutex is held.
492 */ 500 */
493 501
494static void cpuset_release_agent(const char *pathbuf) 502static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +528,15 @@ static void cpuset_release_agent(const char *pathbuf)
520 * cs is notify_on_release() and now both the user count is zero and 528 * cs is notify_on_release() and now both the user count is zero and
521 * the list of children is empty, prepare cpuset path in a kmalloc'd 529 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 * buffer, to be returned via ppathbuf, so that the caller can invoke 530 * buffer, to be returned via ppathbuf, so that the caller can invoke
523 * cpuset_release_agent() with it later on, once manage_sem is dropped. 531 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
524 * Call here with manage_sem held. 532 * Call here with manage_mutex held.
525 * 533 *
526 * This check_for_release() routine is responsible for kmalloc'ing 534 * This check_for_release() routine is responsible for kmalloc'ing
527 * pathbuf. The above cpuset_release_agent() is responsible for 535 * pathbuf. The above cpuset_release_agent() is responsible for
528 * kfree'ing pathbuf. The caller of these routines is responsible 536 * kfree'ing pathbuf. The caller of these routines is responsible
529 * for providing a pathbuf pointer, initialized to NULL, then 537 * for providing a pathbuf pointer, initialized to NULL, then
530 * calling check_for_release() with manage_sem held and the address 538 * calling check_for_release() with manage_mutex held and the address
531 * of the pathbuf pointer, then dropping manage_sem, then calling 539 * of the pathbuf pointer, then dropping manage_mutex, then calling
532 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 540 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 */ 541 */
534 542
@@ -559,7 +567,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
559 * One way or another, we guarantee to return some non-empty subset 567 * One way or another, we guarantee to return some non-empty subset
560 * of cpu_online_map. 568 * of cpu_online_map.
561 * 569 *
562 * Call with callback_sem held. 570 * Call with callback_mutex held.
563 */ 571 */
564 572
565static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 573static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +591,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
583 * One way or another, we guarantee to return some non-empty subset 591 * One way or another, we guarantee to return some non-empty subset
584 * of node_online_map. 592 * of node_online_map.
585 * 593 *
586 * Call with callback_sem held. 594 * Call with callback_mutex held.
587 */ 595 */
588 596
589static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 597static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
608 * current->cpuset if a task has its memory placement changed. 616 * current->cpuset if a task has its memory placement changed.
609 * Do not call this routine if in_interrupt(). 617 * Do not call this routine if in_interrupt().
610 * 618 *
611 * Call without callback_sem or task_lock() held. May be called 619 * Call without callback_mutex or task_lock() held. May be
612 * with or without manage_sem held. Doesn't need task_lock to guard 620 * called with or without manage_mutex held. Thanks in part to
613 * against another task changing a non-NULL cpuset pointer to NULL, 621 * 'the_top_cpuset_hack', the tasks cpuset pointer will never
614 * as that is only done by a task on itself, and if the current task 622 * be NULL. This routine also might acquire callback_mutex and
615 * is here, it is not simultaneously in the exit code NULL'ing its
616 * cpuset pointer. This routine also might acquire callback_sem and
617 * current->mm->mmap_sem during call. 623 * current->mm->mmap_sem during call.
618 * 624 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock 625 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +664,21 @@ void cpuset_update_task_memory_state(void)
658 } 664 }
659 665
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 666 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661 down(&callback_sem); 667 mutex_lock(&callback_mutex);
662 task_lock(tsk); 668 task_lock(tsk);
663 cs = tsk->cpuset; /* Maybe changed when task not locked */ 669 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 guarantee_online_mems(cs, &tsk->mems_allowed); 670 guarantee_online_mems(cs, &tsk->mems_allowed);
665 tsk->cpuset_mems_generation = cs->mems_generation; 671 tsk->cpuset_mems_generation = cs->mems_generation;
672 if (is_spread_page(cs))
673 tsk->flags |= PF_SPREAD_PAGE;
674 else
675 tsk->flags &= ~PF_SPREAD_PAGE;
676 if (is_spread_slab(cs))
677 tsk->flags |= PF_SPREAD_SLAB;
678 else
679 tsk->flags &= ~PF_SPREAD_SLAB;
666 task_unlock(tsk); 680 task_unlock(tsk);
667 up(&callback_sem); 681 mutex_unlock(&callback_mutex);
668 mpol_rebind_task(tsk, &tsk->mems_allowed); 682 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 } 683 }
670} 684}
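
This hunk is where the new spread flags become visible outside cpuset.c: whenever a task refreshes its memory state, CS_SPREAD_PAGE and CS_SPREAD_SLAB are mirrored into the task flags PF_SPREAD_PAGE and PF_SPREAD_SLAB. A hedged sketch of how an allocator-side consumer could test them; the helper names are illustrative only, the real consumers sit in the page and slab allocation paths:

/* illustrative helpers, not part of this patch */
static inline int task_spreads_pages(struct task_struct *tsk)
{
	return tsk->flags & PF_SPREAD_PAGE;
}

static inline int task_spreads_slabs(struct task_struct *tsk)
{
	return tsk->flags & PF_SPREAD_SLAB;
}

/*
 * e.g. in an allocation path:
 *	if (task_spreads_pages(current))
 *		interleave the page over current->mems_allowed
 *	else
 *		prefer the local node as usual
 */
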
@@ -674,7 +688,7 @@ void cpuset_update_task_memory_state(void)
674 * 688 *
675 * One cpuset is a subset of another if all its allowed CPUs and 689 * One cpuset is a subset of another if all its allowed CPUs and
676 * Memory Nodes are a subset of the other, and its exclusive flags 690 * Memory Nodes are a subset of the other, and its exclusive flags
677 * are only set if the other's are set. Call holding manage_sem. 691 * are only set if the other's are set. Call holding manage_mutex.
678 */ 692 */
679 693
680static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 694static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +706,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
692 * If we replaced the flag and mask values of the current cpuset 706 * If we replaced the flag and mask values of the current cpuset
693 * (cur) with those values in the trial cpuset (trial), would 707 * (cur) with those values in the trial cpuset (trial), would
694 * our various subset and exclusive rules still be valid? Presumes 708 * our various subset and exclusive rules still be valid? Presumes
695 * manage_sem held. 709 * manage_mutex held.
696 * 710 *
697 * 'cur' is the address of an actual, in-use cpuset. Operations 711 * 'cur' is the address of an actual, in-use cpuset. Operations
698 * such as list traversal that depend on the actual address of the 712 * such as list traversal that depend on the actual address of the
@@ -746,7 +760,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
746 * exclusive child cpusets 760 * exclusive child cpusets
747 * Build these two partitions by calling partition_sched_domains 761 * Build these two partitions by calling partition_sched_domains
748 * 762 *
749 * Call with manage_sem held. May nest a call to the 763 * Call with manage_mutex held. May nest a call to the
750 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 764 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 */ 765 */
752 766
@@ -792,7 +806,7 @@ static void update_cpu_domains(struct cpuset *cur)
792} 806}
793 807
794/* 808/*
795 * Call with manage_sem held. May take callback_sem during call. 809 * Call with manage_mutex held. May take callback_mutex during call.
796 */ 810 */
797 811
798static int update_cpumask(struct cpuset *cs, char *buf) 812static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,15 +825,64 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (retval < 0) 825 if (retval < 0)
812 return retval; 826 return retval;
813 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 827 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814 down(&callback_sem); 828 mutex_lock(&callback_mutex);
815 cs->cpus_allowed = trialcs.cpus_allowed; 829 cs->cpus_allowed = trialcs.cpus_allowed;
816 up(&callback_sem); 830 mutex_unlock(&callback_mutex);
817 if (is_cpu_exclusive(cs) && !cpus_unchanged) 831 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 update_cpu_domains(cs); 832 update_cpu_domains(cs);
819 return 0; 833 return 0;
820} 834}
821 835
822/* 836/*
837 * cpuset_migrate_mm
838 *
839 * Migrate memory region from one set of nodes to another.
840 *
841 * Temporarily set the task's mems_allowed to the target nodes of migration,
842 * so that the migration code can allocate pages on these nodes.
843 *
844 * Call holding manage_mutex, so our current->cpuset won't change
845 * during this call, as manage_mutex holds off any attach_task()
846 * calls. Therefore we don't need to take task_lock around the
847 * call to guarantee_online_mems(), as we know no one is changing
848 * our tasks cpuset.
849 *
850 * Hold callback_mutex around the two modifications of our tasks
851 * mems_allowed to synchronize with cpuset_mems_allowed().
852 *
853 * While the mm_struct we are migrating is typically from some
854 * other task, the task_struct mems_allowed that we are hacking
855 * is for our current task, which must allocate new pages for that
856 * migrating memory region.
857 *
858 * We call cpuset_update_task_memory_state() before hacking
859 * our tasks mems_allowed, so that we are assured of being in
860 * sync with our tasks cpuset, and in particular, callbacks to
861 * cpuset_update_task_memory_state() from nested page allocations
862 * won't see any mismatch of our cpuset and task mems_generation
863 * values, so won't overwrite our hacked tasks mems_allowed
864 * nodemask.
865 */
866
867static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
868 const nodemask_t *to)
869{
870 struct task_struct *tsk = current;
871
872 cpuset_update_task_memory_state();
873
874 mutex_lock(&callback_mutex);
875 tsk->mems_allowed = *to;
876 mutex_unlock(&callback_mutex);
877
878 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
879
880 mutex_lock(&callback_mutex);
881 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
882 mutex_unlock(&callback_mutex);
883}
884
885/*
823 * Handle user request to change the 'mems' memory placement 886 * Handle user request to change the 'mems' memory placement
824 * of a cpuset. Needs to validate the request, update the 887 * of a cpuset. Needs to validate the request, update the
825 * cpusets mems_allowed and mems_generation, and for each 888 * cpusets mems_allowed and mems_generation, and for each
@@ -827,7 +890,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
827 * the cpuset is marked 'memory_migrate', migrate the tasks 890 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory. 891 * pages to the new memory.
829 * 892 *
830 * Call with manage_sem held. May take callback_sem during call. 893 * Call with manage_mutex held. May take callback_mutex during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 894 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 895 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed. 896 * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +925,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
862 if (retval < 0) 925 if (retval < 0)
863 goto done; 926 goto done;
864 927
865 down(&callback_sem); 928 mutex_lock(&callback_mutex);
866 cs->mems_allowed = trialcs.mems_allowed; 929 cs->mems_allowed = trialcs.mems_allowed;
867 atomic_inc(&cpuset_mems_generation); 930 cs->mems_generation = cpuset_mems_generation++;
868 cs->mems_generation = atomic_read(&cpuset_mems_generation); 931 mutex_unlock(&callback_mutex);
869 up(&callback_sem);
870 932
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 933 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 934
@@ -922,7 +984,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 * tasklist_lock. Forks can happen again now - the mpol_copy() 984 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind 985 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global 986 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will 987 * cpuset manage_mutex, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound. 988 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 989 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes. 990 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -932,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
932 struct mm_struct *mm = mmarray[i]; 994 struct mm_struct *mm = mmarray[i];
933 995
934 mpol_rebind_mm(mm, &cs->mems_allowed); 996 mpol_rebind_mm(mm, &cs->mems_allowed);
935 if (migrate) { 997 if (migrate)
936 do_migrate_pages(mm, &oldmem, &cs->mems_allowed, 998 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
937 MPOL_MF_MOVE_ALL);
938 }
939 mmput(mm); 999 mmput(mm);
940 } 1000 }
941 1001
@@ -948,7 +1008,7 @@ done:
948} 1008}
949 1009
950/* 1010/*
951 * Call with manage_sem held. 1011 * Call with manage_mutex held.
952 */ 1012 */
953 1013
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 1014static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -963,11 +1023,12 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
963/* 1023/*
964 * update_flag - read a 0 or a 1 in a file and update associated flag 1024 * update_flag - read a 0 or a 1 in a file and update associated flag
965 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1025 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
966 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) 1026 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
1027 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
967 * cs: the cpuset to update 1028 * cs: the cpuset to update
968 * buf: the buffer where we read the 0 or 1 1029 * buf: the buffer where we read the 0 or 1
969 * 1030 *
970 * Call with manage_sem held. 1031 * Call with manage_mutex held.
971 */ 1032 */
972 1033
973static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1034static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +1050,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
989 return err; 1050 return err;
990 cpu_exclusive_changed = 1051 cpu_exclusive_changed =
991 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1052 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992 down(&callback_sem); 1053 mutex_lock(&callback_mutex);
993 if (turning_on) 1054 if (turning_on)
994 set_bit(bit, &cs->flags); 1055 set_bit(bit, &cs->flags);
995 else 1056 else
996 clear_bit(bit, &cs->flags); 1057 clear_bit(bit, &cs->flags);
997 up(&callback_sem); 1058 mutex_unlock(&callback_mutex);
998 1059
999 if (cpu_exclusive_changed) 1060 if (cpu_exclusive_changed)
1000 update_cpu_domains(cs); 1061 update_cpu_domains(cs);
@@ -1104,7 +1165,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1165 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 * notified on release. 1166 * notified on release.
1106 * 1167 *
1107 * Call holding manage_sem. May take callback_sem and task_lock of 1168 * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 * the task 'pid' during call. 1169 * the task 'pid' during call.
1109 */ 1170 */
1110 1171
@@ -1144,13 +1205,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1144 get_task_struct(tsk); 1205 get_task_struct(tsk);
1145 } 1206 }
1146 1207
1147 down(&callback_sem); 1208 mutex_lock(&callback_mutex);
1148 1209
1149 task_lock(tsk); 1210 task_lock(tsk);
1150 oldcs = tsk->cpuset; 1211 oldcs = tsk->cpuset;
1151 if (!oldcs) { 1212 if (!oldcs) {
1152 task_unlock(tsk); 1213 task_unlock(tsk);
1153 up(&callback_sem); 1214 mutex_unlock(&callback_mutex);
1154 put_task_struct(tsk); 1215 put_task_struct(tsk);
1155 return -ESRCH; 1216 return -ESRCH;
1156 } 1217 }
@@ -1164,16 +1225,16 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1164 from = oldcs->mems_allowed; 1225 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed; 1226 to = cs->mems_allowed;
1166 1227
1167 up(&callback_sem); 1228 mutex_unlock(&callback_mutex);
1168 1229
1169 mm = get_task_mm(tsk); 1230 mm = get_task_mm(tsk);
1170 if (mm) { 1231 if (mm) {
1171 mpol_rebind_mm(mm, &to); 1232 mpol_rebind_mm(mm, &to);
1233 if (is_memory_migrate(cs))
1234 cpuset_migrate_mm(mm, &from, &to);
1172 mmput(mm); 1235 mmput(mm);
1173 } 1236 }
1174 1237
1175 if (is_memory_migrate(cs))
1176 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1177 put_task_struct(tsk); 1238 put_task_struct(tsk);
1178 synchronize_rcu(); 1239 synchronize_rcu();
1179 if (atomic_dec_and_test(&oldcs->count)) 1240 if (atomic_dec_and_test(&oldcs->count))
@@ -1194,6 +1255,8 @@ typedef enum {
1194 FILE_NOTIFY_ON_RELEASE, 1255 FILE_NOTIFY_ON_RELEASE,
1195 FILE_MEMORY_PRESSURE_ENABLED, 1256 FILE_MEMORY_PRESSURE_ENABLED,
1196 FILE_MEMORY_PRESSURE, 1257 FILE_MEMORY_PRESSURE,
1258 FILE_SPREAD_PAGE,
1259 FILE_SPREAD_SLAB,
1197 FILE_TASKLIST, 1260 FILE_TASKLIST,
1198} cpuset_filetype_t; 1261} cpuset_filetype_t;
1199 1262
@@ -1221,7 +1284,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1221 } 1284 }
1222 buffer[nbytes] = 0; /* nul-terminate */ 1285 buffer[nbytes] = 0; /* nul-terminate */
1223 1286
1224 down(&manage_sem); 1287 mutex_lock(&manage_mutex);
1225 1288
1226 if (is_removed(cs)) { 1289 if (is_removed(cs)) {
1227 retval = -ENODEV; 1290 retval = -ENODEV;
@@ -1253,6 +1316,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1253 case FILE_MEMORY_PRESSURE: 1316 case FILE_MEMORY_PRESSURE:
1254 retval = -EACCES; 1317 retval = -EACCES;
1255 break; 1318 break;
1319 case FILE_SPREAD_PAGE:
1320 retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
1321 cs->mems_generation = cpuset_mems_generation++;
1322 break;
1323 case FILE_SPREAD_SLAB:
1324 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1325 cs->mems_generation = cpuset_mems_generation++;
1326 break;
1256 case FILE_TASKLIST: 1327 case FILE_TASKLIST:
1257 retval = attach_task(cs, buffer, &pathbuf); 1328 retval = attach_task(cs, buffer, &pathbuf);
1258 break; 1329 break;
@@ -1264,7 +1335,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1264 if (retval == 0) 1335 if (retval == 0)
1265 retval = nbytes; 1336 retval = nbytes;
1266out2: 1337out2:
1267 up(&manage_sem); 1338 mutex_unlock(&manage_mutex);
1268 cpuset_release_agent(pathbuf); 1339 cpuset_release_agent(pathbuf);
1269out1: 1340out1:
1270 kfree(buffer); 1341 kfree(buffer);
@@ -1304,9 +1375,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1304{ 1375{
1305 cpumask_t mask; 1376 cpumask_t mask;
1306 1377
1307 down(&callback_sem); 1378 mutex_lock(&callback_mutex);
1308 mask = cs->cpus_allowed; 1379 mask = cs->cpus_allowed;
1309 up(&callback_sem); 1380 mutex_unlock(&callback_mutex);
1310 1381
1311 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1382 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312} 1383}
@@ -1315,9 +1386,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1315{ 1386{
1316 nodemask_t mask; 1387 nodemask_t mask;
1317 1388
1318 down(&callback_sem); 1389 mutex_lock(&callback_mutex);
1319 mask = cs->mems_allowed; 1390 mask = cs->mems_allowed;
1320 up(&callback_sem); 1391 mutex_unlock(&callback_mutex);
1321 1392
1322 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1393 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323} 1394}
@@ -1362,6 +1433,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1362 case FILE_MEMORY_PRESSURE: 1433 case FILE_MEMORY_PRESSURE:
1363 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); 1434 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1364 break; 1435 break;
1436 case FILE_SPREAD_PAGE:
1437 *s++ = is_spread_page(cs) ? '1' : '0';
1438 break;
1439 case FILE_SPREAD_SLAB:
1440 *s++ = is_spread_slab(cs) ? '1' : '0';
1441 break;
1365 default: 1442 default:
1366 retval = -EINVAL; 1443 retval = -EINVAL;
1367 goto out; 1444 goto out;
@@ -1598,7 +1675,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1598 * Handle an open on 'tasks' file. Prepare a buffer listing the 1675 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 * process id's of tasks currently attached to the cpuset being opened. 1676 * process id's of tasks currently attached to the cpuset being opened.
1600 * 1677 *
1601 * Does not require any specific cpuset semaphores, and does not take any. 1678 * Does not require any specific cpuset mutexes, and does not take any.
1602 */ 1679 */
1603static int cpuset_tasks_open(struct inode *unused, struct file *file) 1680static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604{ 1681{
@@ -1725,6 +1802,16 @@ static struct cftype cft_memory_pressure = {
1725 .private = FILE_MEMORY_PRESSURE, 1802 .private = FILE_MEMORY_PRESSURE,
1726}; 1803};
1727 1804
1805static struct cftype cft_spread_page = {
1806 .name = "memory_spread_page",
1807 .private = FILE_SPREAD_PAGE,
1808};
1809
1810static struct cftype cft_spread_slab = {
1811 .name = "memory_spread_slab",
1812 .private = FILE_SPREAD_SLAB,
1813};
1814
1728static int cpuset_populate_dir(struct dentry *cs_dentry) 1815static int cpuset_populate_dir(struct dentry *cs_dentry)
1729{ 1816{
1730 int err; 1817 int err;
@@ -1743,6 +1830,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1743 return err; 1830 return err;
1744 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1831 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1745 return err; 1832 return err;
1833 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
1834 return err;
1835 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
1836 return err;
1746 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1837 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1747 return err; 1838 return err;
1748 return 0; 1839 return 0;
@@ -1754,7 +1845,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1754 * name: name of the new cpuset. Will be strcpy'ed. 1845 * name: name of the new cpuset. Will be strcpy'ed.
1755 * mode: mode to set on new inode 1846 * mode: mode to set on new inode
1756 * 1847 *
1757 * Must be called with the semaphore on the parent inode held 1848 * Must be called with the mutex on the parent inode held
1758 */ 1849 */
1759 1850
1760static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1851static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,44 +1857,47 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1766 if (!cs) 1857 if (!cs)
1767 return -ENOMEM; 1858 return -ENOMEM;
1768 1859
1769 down(&manage_sem); 1860 mutex_lock(&manage_mutex);
1770 cpuset_update_task_memory_state(); 1861 cpuset_update_task_memory_state();
1771 cs->flags = 0; 1862 cs->flags = 0;
1772 if (notify_on_release(parent)) 1863 if (notify_on_release(parent))
1773 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1864 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1865 if (is_spread_page(parent))
1866 set_bit(CS_SPREAD_PAGE, &cs->flags);
1867 if (is_spread_slab(parent))
1868 set_bit(CS_SPREAD_SLAB, &cs->flags);
1774 cs->cpus_allowed = CPU_MASK_NONE; 1869 cs->cpus_allowed = CPU_MASK_NONE;
1775 cs->mems_allowed = NODE_MASK_NONE; 1870 cs->mems_allowed = NODE_MASK_NONE;
1776 atomic_set(&cs->count, 0); 1871 atomic_set(&cs->count, 0);
1777 INIT_LIST_HEAD(&cs->sibling); 1872 INIT_LIST_HEAD(&cs->sibling);
1778 INIT_LIST_HEAD(&cs->children); 1873 INIT_LIST_HEAD(&cs->children);
1779 atomic_inc(&cpuset_mems_generation); 1874 cs->mems_generation = cpuset_mems_generation++;
1780 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1781 fmeter_init(&cs->fmeter); 1875 fmeter_init(&cs->fmeter);
1782 1876
1783 cs->parent = parent; 1877 cs->parent = parent;
1784 1878
1785 down(&callback_sem); 1879 mutex_lock(&callback_mutex);
1786 list_add(&cs->sibling, &cs->parent->children); 1880 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++; 1881 number_of_cpusets++;
1788 up(&callback_sem); 1882 mutex_unlock(&callback_mutex);
1789 1883
1790 err = cpuset_create_dir(cs, name, mode); 1884 err = cpuset_create_dir(cs, name, mode);
1791 if (err < 0) 1885 if (err < 0)
1792 goto err; 1886 goto err;
1793 1887
1794 /* 1888 /*
1795 * Release manage_sem before cpuset_populate_dir() because it 1889 * Release manage_mutex before cpuset_populate_dir() because it
1796 * will down() this new directory's i_mutex and if we race with 1890 * will down() this new directory's i_mutex and if we race with
1797 * another mkdir, we might deadlock. 1891 * another mkdir, we might deadlock.
1798 */ 1892 */
1799 up(&manage_sem); 1893 mutex_unlock(&manage_mutex);
1800 1894
1801 err = cpuset_populate_dir(cs->dentry); 1895 err = cpuset_populate_dir(cs->dentry);
1802 /* If err < 0, we have a half-filled directory - oh well ;) */ 1896 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 return 0; 1897 return 0;
1804err: 1898err:
1805 list_del(&cs->sibling); 1899 list_del(&cs->sibling);
1806 up(&manage_sem); 1900 mutex_unlock(&manage_mutex);
1807 kfree(cs); 1901 kfree(cs);
1808 return err; 1902 return err;
1809} 1903}
@@ -1825,18 +1919,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1825 1919
1826 /* the vfs holds both inode->i_mutex already */ 1920 /* the vfs holds both inode->i_mutex already */
1827 1921
1828 down(&manage_sem); 1922 mutex_lock(&manage_mutex);
1829 cpuset_update_task_memory_state(); 1923 cpuset_update_task_memory_state();
1830 if (atomic_read(&cs->count) > 0) { 1924 if (atomic_read(&cs->count) > 0) {
1831 up(&manage_sem); 1925 mutex_unlock(&manage_mutex);
1832 return -EBUSY; 1926 return -EBUSY;
1833 } 1927 }
1834 if (!list_empty(&cs->children)) { 1928 if (!list_empty(&cs->children)) {
1835 up(&manage_sem); 1929 mutex_unlock(&manage_mutex);
1836 return -EBUSY; 1930 return -EBUSY;
1837 } 1931 }
1838 parent = cs->parent; 1932 parent = cs->parent;
1839 down(&callback_sem); 1933 mutex_lock(&callback_mutex);
1840 set_bit(CS_REMOVED, &cs->flags); 1934 set_bit(CS_REMOVED, &cs->flags);
1841 if (is_cpu_exclusive(cs)) 1935 if (is_cpu_exclusive(cs))
1842 update_cpu_domains(cs); 1936 update_cpu_domains(cs);
@@ -1848,10 +1942,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1848 cpuset_d_remove_dir(d); 1942 cpuset_d_remove_dir(d);
1849 dput(d); 1943 dput(d);
1850 number_of_cpusets--; 1944 number_of_cpusets--;
1851 up(&callback_sem); 1945 mutex_unlock(&callback_mutex);
1852 if (list_empty(&parent->children)) 1946 if (list_empty(&parent->children))
1853 check_for_release(parent, &pathbuf); 1947 check_for_release(parent, &pathbuf);
1854 up(&manage_sem); 1948 mutex_unlock(&manage_mutex);
1855 cpuset_release_agent(pathbuf); 1949 cpuset_release_agent(pathbuf);
1856 return 0; 1950 return 0;
1857} 1951}
@@ -1867,7 +1961,7 @@ int __init cpuset_init_early(void)
1867 struct task_struct *tsk = current; 1961 struct task_struct *tsk = current;
1868 1962
1869 tsk->cpuset = &top_cpuset; 1963 tsk->cpuset = &top_cpuset;
1870 tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); 1964 tsk->cpuset->mems_generation = cpuset_mems_generation++;
1871 return 0; 1965 return 0;
1872} 1966}
1873 1967
@@ -1886,8 +1980,7 @@ int __init cpuset_init(void)
1886 top_cpuset.mems_allowed = NODE_MASK_ALL; 1980 top_cpuset.mems_allowed = NODE_MASK_ALL;
1887 1981
1888 fmeter_init(&top_cpuset.fmeter); 1982 fmeter_init(&top_cpuset.fmeter);
1889 atomic_inc(&cpuset_mems_generation); 1983 top_cpuset.mems_generation = cpuset_mems_generation++;
1890 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1891 1984
1892 init_task.cpuset = &top_cpuset; 1985 init_task.cpuset = &top_cpuset;
1893 1986
@@ -1960,23 +2053,56 @@ void cpuset_fork(struct task_struct *child)
1960 * Description: Detach cpuset from @tsk and release it. 2053 * Description: Detach cpuset from @tsk and release it.
1961 * 2054 *
1962 * Note that cpusets marked notify_on_release force every task in 2055 * Note that cpusets marked notify_on_release force every task in
1963 * them to take the global manage_sem semaphore when exiting. 2056 * them to take the global manage_mutex mutex when exiting.
1964 * This could impact scaling on very large systems. Be reluctant to 2057 * This could impact scaling on very large systems. Be reluctant to
1965 * use notify_on_release cpusets where very high task exit scaling 2058 * use notify_on_release cpusets where very high task exit scaling
1966 * is required on large systems. 2059 * is required on large systems.
1967 * 2060 *
1968 * Don't even think about derefencing 'cs' after the cpuset use count 2061 * Don't even think about derefencing 'cs' after the cpuset use count
1969 * goes to zero, except inside a critical section guarded by manage_sem 2062 * goes to zero, except inside a critical section guarded by manage_mutex
1970 * or callback_sem. Otherwise a zero cpuset use count is a license to 2063 * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 * any other task to nuke the cpuset immediately, via cpuset_rmdir(). 2064 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 * 2065 *
1973 * This routine has to take manage_sem, not callback_sem, because 2066 * This routine has to take manage_mutex, not callback_mutex, because
1974 * it is holding that semaphore while calling check_for_release(), 2067 * it is holding that mutex while calling check_for_release(),
1975 * which calls kmalloc(), so can't be called holding callback__sem(). 2068 * which calls kmalloc(), so can't be called holding callback_mutex().
1976 * 2069 *
1977 * We don't need to task_lock() this reference to tsk->cpuset, 2070 * We don't need to task_lock() this reference to tsk->cpuset,
1978 * because tsk is already marked PF_EXITING, so attach_task() won't 2071 * because tsk is already marked PF_EXITING, so attach_task() won't
1979 * mess with it, or task is a failed fork, never visible to attach_task. 2072 * mess with it, or task is a failed fork, never visible to attach_task.
2073 *
2074 * the_top_cpuset_hack:
2075 *
2076 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
2077 *
2078 * Don't leave a task unable to allocate memory, as that is an
2079 * accident waiting to happen should someone add a callout in
2080 * do_exit() after the cpuset_exit() call that might allocate.
2081 * If a task tries to allocate memory with an invalid cpuset,
2082 * it will oops in cpuset_update_task_memory_state().
2083 *
2084 * We call cpuset_exit() while the task is still competent to
2085 * handle notify_on_release(), then leave the task attached to
2086 * the root cpuset (top_cpuset) for the remainder of its exit.
2087 *
2088 * To do this properly, we would increment the reference count on
2089 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
2090 * code we would add a second cpuset function call, to drop that
2091 * reference. This would just create an unnecessary hot spot on
2092 * the top_cpuset reference count, to no avail.
2093 *
2094 * Normally, holding a reference to a cpuset without bumping its
2095 * count is unsafe. The cpuset could go away, or someone could
2096 * attach us to a different cpuset, decrementing the count on
2097 * the first cpuset that we never incremented. But in this case,
2098 * top_cpuset isn't going away, and either task has PF_EXITING set,
2099 * which wards off any attach_task() attempts, or task is a failed
2100 * fork, never visible to attach_task.
2101 *
2102 * Another way to do this would be to set the cpuset pointer
2103 * to NULL here, and check in cpuset_update_task_memory_state()
2104 * for a NULL pointer. This hack avoids that NULL check, for no
2105 * cost (other than this way too long comment ;).
1980 **/ 2106 **/
1981 2107
1982void cpuset_exit(struct task_struct *tsk) 2108void cpuset_exit(struct task_struct *tsk)
@@ -1984,15 +2110,15 @@ void cpuset_exit(struct task_struct *tsk)
1984 struct cpuset *cs; 2110 struct cpuset *cs;
1985 2111
1986 cs = tsk->cpuset; 2112 cs = tsk->cpuset;
1987 tsk->cpuset = NULL; 2113 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
1988 2114
1989 if (notify_on_release(cs)) { 2115 if (notify_on_release(cs)) {
1990 char *pathbuf = NULL; 2116 char *pathbuf = NULL;
1991 2117
1992 down(&manage_sem); 2118 mutex_lock(&manage_mutex);
1993 if (atomic_dec_and_test(&cs->count)) 2119 if (atomic_dec_and_test(&cs->count))
1994 check_for_release(cs, &pathbuf); 2120 check_for_release(cs, &pathbuf);
1995 up(&manage_sem); 2121 mutex_unlock(&manage_mutex);
1996 cpuset_release_agent(pathbuf); 2122 cpuset_release_agent(pathbuf);
1997 } else { 2123 } else {
1998 atomic_dec(&cs->count); 2124 atomic_dec(&cs->count);
@@ -2013,11 +2139,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2013{ 2139{
2014 cpumask_t mask; 2140 cpumask_t mask;
2015 2141
2016 down(&callback_sem); 2142 mutex_lock(&callback_mutex);
2017 task_lock(tsk); 2143 task_lock(tsk);
2018 guarantee_online_cpus(tsk->cpuset, &mask); 2144 guarantee_online_cpus(tsk->cpuset, &mask);
2019 task_unlock(tsk); 2145 task_unlock(tsk);
2020 up(&callback_sem); 2146 mutex_unlock(&callback_mutex);
2021 2147
2022 return mask; 2148 return mask;
2023} 2149}
@@ -2041,11 +2167,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2041{ 2167{
2042 nodemask_t mask; 2168 nodemask_t mask;
2043 2169
2044 down(&callback_sem); 2170 mutex_lock(&callback_mutex);
2045 task_lock(tsk); 2171 task_lock(tsk);
2046 guarantee_online_mems(tsk->cpuset, &mask); 2172 guarantee_online_mems(tsk->cpuset, &mask);
2047 task_unlock(tsk); 2173 task_unlock(tsk);
2048 up(&callback_sem); 2174 mutex_unlock(&callback_mutex);
2049 2175
2050 return mask; 2176 return mask;
2051} 2177}
@@ -2071,7 +2197,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2071 2197
2072/* 2198/*
2073 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 2199 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2074 * ancestor to the specified cpuset. Call holding callback_sem. 2200 * ancestor to the specified cpuset. Call holding callback_mutex.
2075 * If no ancestor is mem_exclusive (an unusual configuration), then 2201 * If no ancestor is mem_exclusive (an unusual configuration), then
2076 * returns the root cpuset. 2202 * returns the root cpuset.
2077 */ 2203 */
@@ -2098,37 +2224,44 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2098 * GFP_KERNEL allocations are not so marked, so can escape to the 2224 * GFP_KERNEL allocations are not so marked, so can escape to the
2099 * nearest mem_exclusive ancestor cpuset. 2225 * nearest mem_exclusive ancestor cpuset.
2100 * 2226 *
2101 * Scanning up parent cpusets requires callback_sem. The __alloc_pages() 2227 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2102 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2228 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2103 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2229 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2104 * mems_allowed came up empty on the first pass over the zonelist. 2230 * mems_allowed came up empty on the first pass over the zonelist.
2105 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2231 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2106 * short of memory, might require taking the callback_sem semaphore. 2232 * short of memory, might require taking the callback_mutex mutex.
2107 * 2233 *
2108 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2234 * The first call here from mm/page_alloc:get_page_from_freelist()
2109 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2235 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
2110 * hardwall cpusets - no allocation on a node outside the cpuset is 2236 * no allocation on a node outside the cpuset is allowed (unless in
2111 * allowed (unless in interrupt, of course). 2237 * interrupt, of course).
2112 * 2238 *
2113 * The second loop doesn't even call here for GFP_ATOMIC requests 2239 * The second pass through get_page_from_freelist() doesn't even call
2114 * (if the __alloc_pages() local variable 'wait' is set). That check 2240 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
2115 * and the checks below have the combined affect in the second loop of 2241 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2116 * the __alloc_pages() routine that: 2242 * in alloc_flags. That logic and the checks below have the combined
2243 * effect that:
2117 * in_interrupt - any node ok (current task context irrelevant) 2244 * in_interrupt - any node ok (current task context irrelevant)
2118 * GFP_ATOMIC - any node ok 2245 * GFP_ATOMIC - any node ok
2119 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2246 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
2120 * GFP_USER - only nodes in current tasks mems allowed ok. 2247 * GFP_USER - only nodes in current tasks mems allowed ok.
2248 *
2249 * Rule:
2250 * Don't call cpuset_zone_allowed() if you can't sleep, unless you
2251 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2252 * the code that might scan up ancestor cpusets and sleep.
2121 **/ 2253 **/
2122 2254
2123int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2255int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2124{ 2256{
2125 int node; /* node that zone z is on */ 2257 int node; /* node that zone z is on */
2126 const struct cpuset *cs; /* current cpuset ancestors */ 2258 const struct cpuset *cs; /* current cpuset ancestors */
2127 int allowed = 1; /* is allocation in zone z allowed? */ 2259 int allowed; /* is allocation in zone z allowed? */
2128 2260
2129 if (in_interrupt()) 2261 if (in_interrupt())
2130 return 1; 2262 return 1;
2131 node = z->zone_pgdat->node_id; 2263 node = z->zone_pgdat->node_id;
2264 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2132 if (node_isset(node, current->mems_allowed)) 2265 if (node_isset(node, current->mems_allowed))
2133 return 1; 2266 return 1;
2134 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 2267 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
@@ -2138,31 +2271,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2138 return 1; 2271 return 1;
2139 2272
2140 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2273 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2141 down(&callback_sem); 2274 mutex_lock(&callback_mutex);
2142 2275
2143 task_lock(current); 2276 task_lock(current);
2144 cs = nearest_exclusive_ancestor(current->cpuset); 2277 cs = nearest_exclusive_ancestor(current->cpuset);
2145 task_unlock(current); 2278 task_unlock(current);
2146 2279
2147 allowed = node_isset(node, cs->mems_allowed); 2280 allowed = node_isset(node, cs->mems_allowed);
2148 up(&callback_sem); 2281 mutex_unlock(&callback_mutex);
2149 return allowed; 2282 return allowed;
2150} 2283}
2151 2284
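The new might_sleep_if() check enforces the Rule stated in the comment above. A hedged sketch of the caller-side view, using the cpuset_zone_allowed() wrapper from include/linux/cpuset.h; try_this_zone() is a made-up stand-in for the allocator's per-zone attempt, and this is illustrative rather than the actual page allocator logic:

static struct page *pick_zone_page(struct zone *zone, unsigned int order,
                                   gfp_t gfp_mask)
{
        /* hardwall check: never sleeps, safe in any context */
        if (cpuset_zone_allowed(zone, gfp_mask | __GFP_HARDWALL))
                return try_this_zone(zone, order);

        /* only sleepable (__GFP_WAIT) callers may use the full check,
         * which can take callback_mutex and scan ancestor cpusets */
        if ((gfp_mask & __GFP_WAIT) && cpuset_zone_allowed(zone, gfp_mask))
                return try_this_zone(zone, order);

        return NULL;
}

This mirrors the table above: GFP_ATOMIC (no __GFP_WAIT) never reaches the sleeping path, while GFP_KERNEL may escape to the nearest mem_exclusive ancestor.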
2152/** 2285/**
2153 * cpuset_lock - lock out any changes to cpuset structures 2286 * cpuset_lock - lock out any changes to cpuset structures
2154 * 2287 *
2155 * The out of memory (oom) code needs to lock down cpusets 2288 * The out of memory (oom) code needs to keep cpusets
2156 * from being changed while it scans the tasklist looking for a 2289 * from being changed while it scans the tasklist looking for a
2157 * task in an overlapping cpuset. Expose callback_sem via this 2290 * task in an overlapping cpuset. Expose callback_mutex via this
2158 * cpuset_lock() routine, so the oom code can lock it, before 2291 * cpuset_lock() routine, so the oom code can lock it, before
2159 * locking the task list. The tasklist_lock is a spinlock, so 2292 * locking the task list. The tasklist_lock is a spinlock, so
2160 * must be taken inside callback_sem. 2293 * must be taken inside callback_mutex.
2161 */ 2294 */
2162 2295
2163void cpuset_lock(void) 2296void cpuset_lock(void)
2164{ 2297{
2165 down(&callback_sem); 2298 mutex_lock(&callback_mutex);
2166} 2299}
2167 2300
2168/** 2301/**
@@ -2173,10 +2306,48 @@ void cpuset_lock(void)
2173 2306
2174void cpuset_unlock(void) 2307void cpuset_unlock(void)
2175{ 2308{
2176 up(&callback_sem); 2309 mutex_unlock(&callback_mutex);
2177} 2310}
2178 2311
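The ordering spelled out above (callback_mutex first, then the tasklist_lock spinlock inside it) is easiest to see from the caller's side. A rough sketch of how an OOM-style scan is expected to nest the two, based on the comment rather than lifted from mm/oom_kill.c; consider_victim() is a made-up helper:

/* illustrative nesting only */
static void scan_overlapping_tasks(void)
{
        struct task_struct *p;

        cpuset_lock();                  /* callback_mutex: may sleep */
        read_lock(&tasklist_lock);      /* spinlock: nests inside the mutex */

        for_each_process(p) {
                if (cpuset_excl_nodes_overlap(p))
                        consider_victim(p);
        }

        read_unlock(&tasklist_lock);
        cpuset_unlock();
}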
2179/** 2312/**
2313 * cpuset_mem_spread_node() - On which node to begin search for a page
2314 *
2315 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2316 * tasks in a cpuset with is_spread_page or is_spread_slab set),
2317 * and if the memory allocation used cpuset_mem_spread_node()
2318 * to determine on which node to start looking, as it will for
2319 * certain page cache or slab cache pages such as used for file
2320 * system buffers and inode caches, then instead of starting on the
2321 * local node to look for a free page, rather spread the starting
2322 * node around the tasks mems_allowed nodes.
2323 *
2324 * We don't have to worry about the returned node being offline
2325 * because "it can't happen", and even if it did, it would be ok.
2326 *
2327 * The routines calling guarantee_online_mems() are careful to
2328 * only set nodes in task->mems_allowed that are online. So it
2329 * should not be possible for the following code to return an
2330 * offline node. But if it did, that would be ok, as this routine
2331 * is not returning the node where the allocation must be, only
2332 * the node where the search should start. The zonelist passed to
2333 * __alloc_pages() will include all nodes. If the slab allocator
2334 * is passed an offline node, it will fall back to the local node.
2335 * See kmem_cache_alloc_node().
2336 */
2337
2338int cpuset_mem_spread_node(void)
2339{
2340 int node;
2341
2342 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2343 if (node == MAX_NUMNODES)
2344 node = first_node(current->mems_allowed);
2345 current->cpuset_mem_spread_rotor = node;
2346 return node;
2347}
2348EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2349
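The rotor walk above is easy to check with a small userspace model: a plain bitmask stands in for nodemask_t and the next_node()/first_node() pair is open-coded. Assuming mems_allowed covers nodes 0, 2 and 3, successive calls start the search on nodes 2, 3, 0, 2, and so on:

#include <stdio.h>

#define MAX_NODES 8

/* next allowed node strictly after 'prev', wrapping around, i.e. the
 * next_node()/first_node() pair from cpuset_mem_spread_node() above */
static int next_allowed(unsigned int mask, int prev)
{
        for (int n = prev + 1; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        return n;
        for (int n = 0; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        return n;
        return -1;
}

int main(void)
{
        unsigned int mems_allowed = 0x0d;   /* nodes 0, 2, 3 */
        int rotor = 0;

        for (int i = 0; i < 6; i++) {
                rotor = next_allowed(mems_allowed, rotor);
                printf("allocation %d starts on node %d\n", i, rotor);
        }
        return 0;                           /* prints nodes 2 3 0 2 3 0 */
}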
2350/**
2180 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2351 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
2181 * @p: pointer to task_struct of some other task. 2352 * @p: pointer to task_struct of some other task.
2182 * 2353 *
@@ -2185,7 +2356,7 @@ void cpuset_unlock(void)
2185 * determine if task @p's memory usage might impact the memory 2356 * determine if task @p's memory usage might impact the memory
2186 * available to the current task. 2357 * available to the current task.
2187 * 2358 *
2188 * Call while holding callback_sem. 2359 * Call while holding callback_mutex.
2189 **/ 2360 **/
2190 2361
2191int cpuset_excl_nodes_overlap(const struct task_struct *p) 2362int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2256,13 +2427,13 @@ void __cpuset_memory_pressure_bump(void)
2256 * - Used for /proc/<pid>/cpuset. 2427 * - Used for /proc/<pid>/cpuset.
2257 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2428 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2258 * doesn't really matter if tsk->cpuset changes after we read it, 2429 * doesn't really matter if tsk->cpuset changes after we read it,
2259 * and we take manage_sem, keeping attach_task() from changing it 2430 * and we take manage_mutex, keeping attach_task() from changing it
2260 * anyway. 2431 * anyway. No need to check that tsk->cpuset != NULL, thanks to
2432 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2433 * cpuset to top_cpuset.
2261 */ 2434 */
2262
2263static int proc_cpuset_show(struct seq_file *m, void *v) 2435static int proc_cpuset_show(struct seq_file *m, void *v)
2264{ 2436{
2265 struct cpuset *cs;
2266 struct task_struct *tsk; 2437 struct task_struct *tsk;
2267 char *buf; 2438 char *buf;
2268 int retval = 0; 2439 int retval = 0;
@@ -2272,20 +2443,14 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2272 return -ENOMEM; 2443 return -ENOMEM;
2273 2444
2274 tsk = m->private; 2445 tsk = m->private;
2275 down(&manage_sem); 2446 mutex_lock(&manage_mutex);
2276 cs = tsk->cpuset; 2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2277 if (!cs) {
2278 retval = -EINVAL;
2279 goto out;
2280 }
2281
2282 retval = cpuset_path(cs, buf, PAGE_SIZE);
2283 if (retval < 0) 2448 if (retval < 0)
2284 goto out; 2449 goto out;
2285 seq_puts(m, buf); 2450 seq_puts(m, buf);
2286 seq_putc(m, '\n'); 2451 seq_putc(m, '\n');
2287out: 2452out:
2288 up(&manage_sem); 2453 mutex_unlock(&manage_mutex);
2289 kfree(buf); 2454 kfree(buf);
2290 return retval; 2455 return retval;
2291} 2456}
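Since proc_cpuset_show() backs /proc/<pid>/cpuset, its output can be checked from userspace with a trivial reader; this assumes CONFIG_CPUSETS and that the file prints a single cpuset path, as the code above does:

#include <stdio.h>

int main(int argc, char **argv)
{
        char path[64], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%s/cpuset",
                 argc > 1 ? argv[1] : "self");
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                printf("cpuset path: %s", line);   /* e.g. "/" for top_cpuset */
        fclose(f);
        return 0;
}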
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 867d6dbeb5..c01cead2cf 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -140,6 +140,7 @@ __set_personality(u_long personality)
140 ep = lookup_exec_domain(personality); 140 ep = lookup_exec_domain(personality);
141 if (ep == current_thread_info()->exec_domain) { 141 if (ep == current_thread_info()->exec_domain) {
142 current->personality = personality; 142 current->personality = personality;
143 module_put(ep->module);
143 return 0; 144 return 0;
144 } 145 }
145 146
diff --git a/kernel/exit.c b/kernel/exit.c
index 93cee36713..e95b932822 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,8 +29,13 @@
29#include <linux/cpuset.h> 29#include <linux/cpuset.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/posix-timers.h>
32#include <linux/cn_proc.h> 33#include <linux/cn_proc.h>
33#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/futex.h>
36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */
34 39
35#include <asm/uaccess.h> 40#include <asm/uaccess.h>
36#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -48,15 +53,85 @@ static void __unhash_process(struct task_struct *p)
48{ 53{
49 nr_threads--; 54 nr_threads--;
50 detach_pid(p, PIDTYPE_PID); 55 detach_pid(p, PIDTYPE_PID);
51 detach_pid(p, PIDTYPE_TGID);
52 if (thread_group_leader(p)) { 56 if (thread_group_leader(p)) {
53 detach_pid(p, PIDTYPE_PGID); 57 detach_pid(p, PIDTYPE_PGID);
54 detach_pid(p, PIDTYPE_SID); 58 detach_pid(p, PIDTYPE_SID);
55 if (p->pid) 59
56 __get_cpu_var(process_counts)--; 60 list_del_rcu(&p->tasks);
61 __get_cpu_var(process_counts)--;
62 }
63 list_del_rcu(&p->thread_group);
64 remove_parent(p);
65}
66
67/*
68 * This function expects the tasklist_lock write-locked.
69 */
70static void __exit_signal(struct task_struct *tsk)
71{
72 struct signal_struct *sig = tsk->signal;
73 struct sighand_struct *sighand;
74
75 BUG_ON(!sig);
76 BUG_ON(!atomic_read(&sig->count));
77
78 rcu_read_lock();
79 sighand = rcu_dereference(tsk->sighand);
80 spin_lock(&sighand->siglock);
81
82 posix_cpu_timers_exit(tsk);
83 if (atomic_dec_and_test(&sig->count))
84 posix_cpu_timers_exit_group(tsk);
85 else {
86 /*
87 * If there is any task waiting for the group exit
88 * then notify it:
89 */
90 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
91 wake_up_process(sig->group_exit_task);
92 sig->group_exit_task = NULL;
93 }
94 if (tsk == sig->curr_target)
95 sig->curr_target = next_thread(tsk);
96 /*
97 * Accumulate here the counters for all threads but the
98 * group leader as they die, so they can be added into
99 * the process-wide totals when those are taken.
100 * The group leader stays around as a zombie as long
101 * as there are other threads. When it gets reaped,
102 * the exit.c code will add its counts into these totals.
103 * We won't ever get here for the group leader, since it
104 * will have been the last reference on the signal_struct.
105 */
106 sig->utime = cputime_add(sig->utime, tsk->utime);
107 sig->stime = cputime_add(sig->stime, tsk->stime);
108 sig->min_flt += tsk->min_flt;
109 sig->maj_flt += tsk->maj_flt;
110 sig->nvcsw += tsk->nvcsw;
111 sig->nivcsw += tsk->nivcsw;
112 sig->sched_time += tsk->sched_time;
113 sig = NULL; /* Marker for below. */
57 } 114 }
58 115
59 REMOVE_LINKS(p); 116 __unhash_process(tsk);
117
118 tsk->signal = NULL;
119 tsk->sighand = NULL;
120 spin_unlock(&sighand->siglock);
121 rcu_read_unlock();
122
123 __cleanup_sighand(sighand);
124 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
125 flush_sigqueue(&tsk->pending);
126 if (sig) {
127 flush_sigqueue(&sig->shared_pending);
128 __cleanup_signal(sig);
129 }
130}
131
132static void delayed_put_task_struct(struct rcu_head *rhp)
133{
134 put_task_struct(container_of(rhp, struct task_struct, rcu));
60} 135}
61 136
62void release_task(struct task_struct * p) 137void release_task(struct task_struct * p)
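delayed_put_task_struct() is an instance of the usual call_rcu() deferred-free idiom: embed a struct rcu_head, hand call_rcu() a callback, and recover the enclosing object with container_of() once a grace period has passed. A generic sketch of the same idiom with invented names (needs <linux/rcupdate.h> and <linux/slab.h>):

struct widget {
        int data;
        struct rcu_head rcu;
};

static void widget_free_rcu(struct rcu_head *rhp)
{
        /* runs only after every pre-existing RCU read-side section ends */
        kfree(container_of(rhp, struct widget, rcu));
}

static void widget_release(struct widget *w)
{
        /* readers that found w under rcu_read_lock() may keep using it
         * until their read-side critical section closes */
        call_rcu(&w->rcu, widget_free_rcu);
}

Here it lets tasklist walkers that reached the task under rcu_read_lock() (the lists are now manipulated with list_del_rcu/list_add_tail_rcu) keep dereferencing it briefly after release_task() has run.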
@@ -65,21 +140,14 @@ void release_task(struct task_struct * p)
65 task_t *leader; 140 task_t *leader;
66 struct dentry *proc_dentry; 141 struct dentry *proc_dentry;
67 142
68repeat: 143repeat:
69 atomic_dec(&p->user->processes); 144 atomic_dec(&p->user->processes);
70 spin_lock(&p->proc_lock); 145 spin_lock(&p->proc_lock);
71 proc_dentry = proc_pid_unhash(p); 146 proc_dentry = proc_pid_unhash(p);
72 write_lock_irq(&tasklist_lock); 147 write_lock_irq(&tasklist_lock);
73 if (unlikely(p->ptrace)) 148 ptrace_unlink(p);
74 __ptrace_unlink(p);
75 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
76 __exit_signal(p); 150 __exit_signal(p);
77 /*
78 * Note that the fastpath in sys_times depends on __exit_signal having
79 * updated the counters before a task is removed from the tasklist of
80 * the process by __unhash_process.
81 */
82 __unhash_process(p);
83 151
84 /* 152 /*
85 * If we are the last non-leader member of the thread 153 * If we are the last non-leader member of the thread
@@ -107,28 +175,13 @@ repeat:
107 spin_unlock(&p->proc_lock); 175 spin_unlock(&p->proc_lock);
108 proc_pid_flush(proc_dentry); 176 proc_pid_flush(proc_dentry);
109 release_thread(p); 177 release_thread(p);
110 put_task_struct(p); 178 call_rcu(&p->rcu, delayed_put_task_struct);
111 179
112 p = leader; 180 p = leader;
113 if (unlikely(zap_leader)) 181 if (unlikely(zap_leader))
114 goto repeat; 182 goto repeat;
115} 183}
116 184
117/* we are using it only for SMP init */
118
119void unhash_process(struct task_struct *p)
120{
121 struct dentry *proc_dentry;
122
123 spin_lock(&p->proc_lock);
124 proc_dentry = proc_pid_unhash(p);
125 write_lock_irq(&tasklist_lock);
126 __unhash_process(p);
127 write_unlock_irq(&tasklist_lock);
128 spin_unlock(&p->proc_lock);
129 proc_pid_flush(proc_dentry);
130}
131
132/* 185/*
133 * This checks not only the pgrp, but falls back on the pid if no 186 * This checks not only the pgrp, but falls back on the pid if no
134 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly 187 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
@@ -236,10 +289,10 @@ static void reparent_to_init(void)
236 289
237 ptrace_unlink(current); 290 ptrace_unlink(current);
238 /* Reparent to init */ 291 /* Reparent to init */
239 REMOVE_LINKS(current); 292 remove_parent(current);
240 current->parent = child_reaper; 293 current->parent = child_reaper;
241 current->real_parent = child_reaper; 294 current->real_parent = child_reaper;
242 SET_LINKS(current); 295 add_parent(current);
243 296
244 /* Set the exit signal to SIGCHLD so we signal init on exit */ 297 /* Set the exit signal to SIGCHLD so we signal init on exit */
245 current->exit_signal = SIGCHLD; 298 current->exit_signal = SIGCHLD;
@@ -345,9 +398,9 @@ void daemonize(const char *name, ...)
345 exit_mm(current); 398 exit_mm(current);
346 399
347 set_special_pids(1, 1); 400 set_special_pids(1, 1);
348 down(&tty_sem); 401 mutex_lock(&tty_mutex);
349 current->signal->tty = NULL; 402 current->signal->tty = NULL;
350 up(&tty_sem); 403 mutex_unlock(&tty_mutex);
351 404
352 /* Block and flush all signals */ 405 /* Block and flush all signals */
353 sigfillset(&blocked); 406 sigfillset(&blocked);
@@ -360,6 +413,9 @@ void daemonize(const char *name, ...)
360 fs = init_task.fs; 413 fs = init_task.fs;
361 current->fs = fs; 414 current->fs = fs;
362 atomic_inc(&fs->count); 415 atomic_inc(&fs->count);
416 exit_namespace(current);
417 current->namespace = init_task.namespace;
418 get_namespace(current->namespace);
363 exit_files(current); 419 exit_files(current);
364 current->files = init_task.files; 420 current->files = init_task.files;
365 atomic_inc(&current->files->count); 421 atomic_inc(&current->files->count);
@@ -533,13 +589,13 @@ static void exit_mm(struct task_struct * tsk)
533 mmput(mm); 589 mmput(mm);
534} 590}
535 591
536static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) 592static inline void choose_new_parent(task_t *p, task_t *reaper)
537{ 593{
538 /* 594 /*
539 * Make sure we're not reparenting to ourselves and that 595 * Make sure we're not reparenting to ourselves and that
540 * the parent is not a zombie. 596 * the parent is not a zombie.
541 */ 597 */
542 BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); 598 BUG_ON(p == reaper || reaper->exit_state);
543 p->real_parent = reaper; 599 p->real_parent = reaper;
544} 600}
545 601
@@ -564,9 +620,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
564 * anyway, so let go of it. 620 * anyway, so let go of it.
565 */ 621 */
566 p->ptrace = 0; 622 p->ptrace = 0;
567 list_del_init(&p->sibling); 623 remove_parent(p);
568 p->parent = p->real_parent; 624 p->parent = p->real_parent;
569 list_add_tail(&p->sibling, &p->parent->children); 625 add_parent(p);
570 626
571 /* If we'd notified the old parent about this child's death, 627 /* If we'd notified the old parent about this child's death,
572 * also notify the new parent. 628 * also notify the new parent.
@@ -640,7 +696,7 @@ static void forget_original_parent(struct task_struct * father,
640 696
641 if (father == p->real_parent) { 697 if (father == p->real_parent) {
642 /* reparent with a reaper, real father it's us */ 698 /* reparent with a reaper, real father it's us */
643 choose_new_parent(p, reaper, child_reaper); 699 choose_new_parent(p, reaper);
644 reparent_thread(p, father, 0); 700 reparent_thread(p, father, 0);
645 } else { 701 } else {
646 /* reparent ptraced task to its real parent */ 702 /* reparent ptraced task to its real parent */
@@ -661,7 +717,7 @@ static void forget_original_parent(struct task_struct * father,
661 } 717 }
662 list_for_each_safe(_p, _n, &father->ptrace_children) { 718 list_for_each_safe(_p, _n, &father->ptrace_children) {
663 p = list_entry(_p,struct task_struct,ptrace_list); 719 p = list_entry(_p,struct task_struct,ptrace_list);
664 choose_new_parent(p, reaper, child_reaper); 720 choose_new_parent(p, reaper);
665 reparent_thread(p, father, 1); 721 reparent_thread(p, father, 1);
666 } 722 }
667} 723}
@@ -802,10 +858,8 @@ fastcall NORET_TYPE void do_exit(long code)
802 panic("Aiee, killing interrupt handler!"); 858 panic("Aiee, killing interrupt handler!");
803 if (unlikely(!tsk->pid)) 859 if (unlikely(!tsk->pid))
804 panic("Attempted to kill the idle task!"); 860 panic("Attempted to kill the idle task!");
805 if (unlikely(tsk->pid == 1)) 861 if (unlikely(tsk == child_reaper))
806 panic("Attempted to kill init!"); 862 panic("Attempted to kill init!");
807 if (tsk->io_context)
808 exit_io_context();
809 863
810 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 864 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
811 current->ptrace_message = code; 865 current->ptrace_message = code;
@@ -819,6 +873,8 @@ fastcall NORET_TYPE void do_exit(long code)
819 if (unlikely(tsk->flags & PF_EXITING)) { 873 if (unlikely(tsk->flags & PF_EXITING)) {
820 printk(KERN_ALERT 874 printk(KERN_ALERT
821 "Fixing recursive fault but reboot is needed!\n"); 875 "Fixing recursive fault but reboot is needed!\n");
876 if (tsk->io_context)
877 exit_io_context();
822 set_current_state(TASK_UNINTERRUPTIBLE); 878 set_current_state(TASK_UNINTERRUPTIBLE);
823 schedule(); 879 schedule();
824 } 880 }
@@ -849,6 +905,14 @@ fastcall NORET_TYPE void do_exit(long code)
849 exit_itimers(tsk->signal); 905 exit_itimers(tsk->signal);
850 acct_process(code); 906 acct_process(code);
851 } 907 }
908 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT
911 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk);
913#endif
914 if (unlikely(tsk->audit_context))
915 audit_free(tsk);
852 exit_mm(tsk); 916 exit_mm(tsk);
853 917
854 exit_sem(tsk); 918 exit_sem(tsk);
@@ -878,6 +942,12 @@ fastcall NORET_TYPE void do_exit(long code)
878 */ 942 */
879 mutex_debug_check_no_locks_held(tsk); 943 mutex_debug_check_no_locks_held(tsk);
880 944
945 if (tsk->io_context)
946 exit_io_context();
947
948 if (tsk->splice_pipe)
949 __free_pipe_info(tsk->splice_pipe);
950
881 /* PF_DEAD causes final put_task_struct after we schedule. */ 951 /* PF_DEAD causes final put_task_struct after we schedule. */
882 preempt_disable(); 952 preempt_disable();
883 BUG_ON(tsk->flags & PF_DEAD); 953 BUG_ON(tsk->flags & PF_DEAD);
@@ -906,13 +976,6 @@ asmlinkage long sys_exit(int error_code)
906 do_exit((error_code&0xff)<<8); 976 do_exit((error_code&0xff)<<8);
907} 977}
908 978
909task_t fastcall *next_thread(const task_t *p)
910{
911 return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
912}
913
914EXPORT_SYMBOL(next_thread);
915
916/* 979/*
917 * Take down every thread in the group. This is called by fatal signals 980 * Take down every thread in the group. This is called by fatal signals
918 * as well as by sys_exit_group (below). 981 * as well as by sys_exit_group (below).
@@ -927,7 +990,6 @@ do_group_exit(int exit_code)
927 else if (!thread_group_empty(current)) { 990 else if (!thread_group_empty(current)) {
928 struct signal_struct *const sig = current->signal; 991 struct signal_struct *const sig = current->signal;
929 struct sighand_struct *const sighand = current->sighand; 992 struct sighand_struct *const sighand = current->sighand;
930 read_lock(&tasklist_lock);
931 spin_lock_irq(&sighand->siglock); 993 spin_lock_irq(&sighand->siglock);
932 if (sig->flags & SIGNAL_GROUP_EXIT) 994 if (sig->flags & SIGNAL_GROUP_EXIT)
933 /* Another thread got here before we took the lock. */ 995 /* Another thread got here before we took the lock. */
@@ -937,7 +999,6 @@ do_group_exit(int exit_code)
937 zap_other_threads(current); 999 zap_other_threads(current);
938 } 1000 }
939 spin_unlock_irq(&sighand->siglock); 1001 spin_unlock_irq(&sighand->siglock);
940 read_unlock(&tasklist_lock);
941 } 1002 }
942 1003
943 do_exit(exit_code); 1004 do_exit(exit_code);
@@ -1267,7 +1328,7 @@ bail_ref:
1267 1328
1268 /* move to end of parent's list to avoid starvation */ 1329 /* move to end of parent's list to avoid starvation */
1269 remove_parent(p); 1330 remove_parent(p);
1270 add_parent(p, p->parent); 1331 add_parent(p);
1271 1332
1272 write_unlock_irq(&tasklist_lock); 1333 write_unlock_irq(&tasklist_lock);
1273 1334
diff --git a/kernel/extable.c b/kernel/extable.c
index 7501b531ce..7fe2628553 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 40 return e;
41} 41}
42 42
43static int core_kernel_text(unsigned long addr) 43int core_kernel_text(unsigned long addr)
44{ 44{
45 if (addr >= (unsigned long)_stext && 45 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e88b374ce..ac8100e308 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
84#endif 84#endif
85 85
86/* SLAB cache for signal_struct structures (tsk->signal) */ 86/* SLAB cache for signal_struct structures (tsk->signal) */
87kmem_cache_t *signal_cachep; 87static kmem_cache_t *signal_cachep;
88 88
89/* SLAB cache for sighand_struct structures (tsk->sighand) */ 89/* SLAB cache for sighand_struct structures (tsk->sighand) */
90kmem_cache_t *sighand_cachep; 90kmem_cache_t *sighand_cachep;
@@ -114,8 +114,6 @@ void __put_task_struct(struct task_struct *tsk)
114 WARN_ON(atomic_read(&tsk->usage)); 114 WARN_ON(atomic_read(&tsk->usage));
115 WARN_ON(tsk == current); 115 WARN_ON(tsk == current);
116 116
117 if (unlikely(tsk->audit_context))
118 audit_free(tsk);
119 security_task_free(tsk); 117 security_task_free(tsk);
120 free_uid(tsk->user); 118 free_uid(tsk->user);
121 put_group_info(tsk->group_info); 119 put_group_info(tsk->group_info);
@@ -179,6 +177,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
179 /* One for us, one for whoever does the "release_task()" (usually parent) */ 177 /* One for us, one for whoever does the "release_task()" (usually parent) */
180 atomic_set(&tsk->usage,2); 178 atomic_set(&tsk->usage,2);
181 atomic_set(&tsk->fs_excl, 0); 179 atomic_set(&tsk->fs_excl, 0);
180 tsk->btrace_seq = 0;
181 tsk->splice_pipe = NULL;
182 return tsk; 182 return tsk;
183} 183}
184 184
@@ -605,12 +605,12 @@ static struct files_struct *alloc_files(void)
605 atomic_set(&newf->count, 1); 605 atomic_set(&newf->count, 1);
606 606
607 spin_lock_init(&newf->file_lock); 607 spin_lock_init(&newf->file_lock);
608 newf->next_fd = 0;
608 fdt = &newf->fdtab; 609 fdt = &newf->fdtab;
609 fdt->next_fd = 0;
610 fdt->max_fds = NR_OPEN_DEFAULT; 610 fdt->max_fds = NR_OPEN_DEFAULT;
611 fdt->max_fdset = __FD_SETSIZE; 611 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
612 fdt->close_on_exec = &newf->close_on_exec_init; 612 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
613 fdt->open_fds = &newf->open_fds_init; 613 fdt->open_fds = (fd_set *)&newf->open_fds_init;
614 fdt->fd = &newf->fd_array[0]; 614 fdt->fd = &newf->fd_array[0];
615 INIT_RCU_HEAD(&fdt->rcu); 615 INIT_RCU_HEAD(&fdt->rcu);
616 fdt->free_files = NULL; 616 fdt->free_files = NULL;
@@ -718,7 +718,7 @@ out_release:
718 free_fdset (new_fdt->open_fds, new_fdt->max_fdset); 718 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
719 free_fd_array(new_fdt->fd, new_fdt->max_fds); 719 free_fd_array(new_fdt->fd, new_fdt->max_fds);
720 kmem_cache_free(files_cachep, newf); 720 kmem_cache_free(files_cachep, newf);
721 goto out; 721 return NULL;
722} 722}
723 723
724static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 724static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -766,8 +766,7 @@ int unshare_files(void)
766 struct files_struct *files = current->files; 766 struct files_struct *files = current->files;
767 int rc; 767 int rc;
768 768
769 if(!files) 769 BUG_ON(!files);
770 BUG();
771 770
772 /* This can race but the race causes us to copy when we don't 771 /* This can race but the race causes us to copy when we don't
773 need to and drop the copy */ 772 need to and drop the copy */
@@ -784,14 +783,6 @@ int unshare_files(void)
784 783
785EXPORT_SYMBOL(unshare_files); 784EXPORT_SYMBOL(unshare_files);
786 785
787void sighand_free_cb(struct rcu_head *rhp)
788{
789 struct sighand_struct *sp;
790
791 sp = container_of(rhp, struct sighand_struct, rcu);
792 kmem_cache_free(sighand_cachep, sp);
793}
794
795static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 786static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
796{ 787{
797 struct sighand_struct *sig; 788 struct sighand_struct *sig;
@@ -804,12 +795,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
804 rcu_assign_pointer(tsk->sighand, sig); 795 rcu_assign_pointer(tsk->sighand, sig);
805 if (!sig) 796 if (!sig)
806 return -ENOMEM; 797 return -ENOMEM;
807 spin_lock_init(&sig->siglock);
808 atomic_set(&sig->count, 1); 798 atomic_set(&sig->count, 1);
809 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 799 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
810 return 0; 800 return 0;
811} 801}
812 802
803void __cleanup_sighand(struct sighand_struct *sighand)
804{
805 if (atomic_dec_and_test(&sighand->count))
806 kmem_cache_free(sighand_cachep, sighand);
807}
808
813static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 809static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
814{ 810{
815 struct signal_struct *sig; 811 struct signal_struct *sig;
@@ -845,7 +841,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
845 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 841 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
846 sig->it_real_incr.tv64 = 0; 842 sig->it_real_incr.tv64 = 0;
847 sig->real_timer.function = it_real_fn; 843 sig->real_timer.function = it_real_fn;
848 sig->real_timer.data = tsk; 844 sig->tsk = tsk;
849 845
850 sig->it_virt_expires = cputime_zero; 846 sig->it_virt_expires = cputime_zero;
851 sig->it_virt_incr = cputime_zero; 847 sig->it_virt_incr = cputime_zero;
@@ -879,6 +875,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
879 return 0; 875 return 0;
880} 876}
881 877
878void __cleanup_signal(struct signal_struct *sig)
879{
880 exit_thread_group_keys(sig);
881 kmem_cache_free(signal_cachep, sig);
882}
883
884static inline void cleanup_signal(struct task_struct *tsk)
885{
886 struct signal_struct *sig = tsk->signal;
887
888 atomic_dec(&sig->live);
889
890 if (atomic_dec_and_test(&sig->count))
891 __cleanup_signal(sig);
892}
893
882static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 894static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
883{ 895{
884 unsigned long new_flags = p->flags; 896 unsigned long new_flags = p->flags;
@@ -1018,6 +1030,7 @@ static task_t *copy_process(unsigned long clone_flags,
1018 p->mempolicy = NULL; 1030 p->mempolicy = NULL;
1019 goto bad_fork_cleanup_cpuset; 1031 goto bad_fork_cleanup_cpuset;
1020 } 1032 }
1033 mpol_fix_fork_child_flag(p);
1021#endif 1034#endif
1022 1035
1023#ifdef CONFIG_DEBUG_MUTEXES 1036#ifdef CONFIG_DEBUG_MUTEXES
@@ -1058,6 +1071,15 @@ static task_t *copy_process(unsigned long clone_flags,
1058 * Clear TID on mm_release()? 1071 * Clear TID on mm_release()?
1059 */ 1072 */
1060 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1073 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1074 p->robust_list = NULL;
1075#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL;
1077#endif
1078 /*
1079 * sigaltstack should be cleared when sharing the same VM
1080 */
1081 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
1082 p->sas_ss_sp = p->sas_ss_size = 0;
1061 1083
1062 /* 1084 /*
1063 * Syscall tracing should be turned off in the child regardless 1085 * Syscall tracing should be turned off in the child regardless
@@ -1083,6 +1105,7 @@ static task_t *copy_process(unsigned long clone_flags,
1083 * We dont wake it up yet. 1105 * We dont wake it up yet.
1084 */ 1106 */
1085 p->group_leader = p; 1107 p->group_leader = p;
1108 INIT_LIST_HEAD(&p->thread_group);
1086 INIT_LIST_HEAD(&p->ptrace_children); 1109 INIT_LIST_HEAD(&p->ptrace_children);
1087 INIT_LIST_HEAD(&p->ptrace_list); 1110 INIT_LIST_HEAD(&p->ptrace_list);
1088 1111
@@ -1106,16 +1129,6 @@ static task_t *copy_process(unsigned long clone_flags,
1106 !cpu_online(task_cpu(p)))) 1129 !cpu_online(task_cpu(p))))
1107 set_task_cpu(p, smp_processor_id()); 1130 set_task_cpu(p, smp_processor_id());
1108 1131
1109 /*
1110 * Check for pending SIGKILL! The new thread should not be allowed
1111 * to slip out of an OOM kill. (or normal SIGKILL.)
1112 */
1113 if (sigismember(&current->pending.signal, SIGKILL)) {
1114 write_unlock_irq(&tasklist_lock);
1115 retval = -EINTR;
1116 goto bad_fork_cleanup_namespace;
1117 }
1118
1119 /* CLONE_PARENT re-uses the old parent */ 1132 /* CLONE_PARENT re-uses the old parent */
1120 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1133 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1121 p->real_parent = current->real_parent; 1134 p->real_parent = current->real_parent;
@@ -1123,8 +1136,25 @@ static task_t *copy_process(unsigned long clone_flags,
1123 p->real_parent = current; 1136 p->real_parent = current;
1124 p->parent = p->real_parent; 1137 p->parent = p->real_parent;
1125 1138
1139 spin_lock(&current->sighand->siglock);
1140
1141 /*
1142 * Process group and session signals need to be delivered to just the
1143 * parent before the fork or both the parent and the child after the
1144 * fork. Restart if a signal comes in before we add the new process to
1145 * its process group.
1146 * A fatal signal pending means that current will exit, so the new
1147 * thread can't slip out of an OOM kill (or normal SIGKILL).
1148 */
1149 recalc_sigpending();
1150 if (signal_pending(current)) {
1151 spin_unlock(&current->sighand->siglock);
1152 write_unlock_irq(&tasklist_lock);
1153 retval = -ERESTARTNOINTR;
1154 goto bad_fork_cleanup_namespace;
1155 }
1156
1126 if (clone_flags & CLONE_THREAD) { 1157 if (clone_flags & CLONE_THREAD) {
1127 spin_lock(&current->sighand->siglock);
1128 /* 1158 /*
1129 * Important: if an exit-all has been started then 1159 * Important: if an exit-all has been started then
1130 * do not create this new thread - the whole thread 1160 * do not create this new thread - the whole thread
@@ -1136,17 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
1136 retval = -EAGAIN; 1166 retval = -EAGAIN;
1137 goto bad_fork_cleanup_namespace; 1167 goto bad_fork_cleanup_namespace;
1138 } 1168 }
1139 p->group_leader = current->group_leader;
1140 1169
1141 if (current->signal->group_stop_count > 0) { 1170 p->group_leader = current->group_leader;
1142 /* 1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1143 * There is an all-stop in progress for the group.
1144 * We ourselves will stop as soon as we check signals.
1145 * Make the new thread part of that group stop too.
1146 */
1147 current->signal->group_stop_count++;
1148 set_tsk_thread_flag(p, TIF_SIGPENDING);
1149 }
1150 1172
1151 if (!cputime_eq(current->signal->it_virt_expires, 1173 if (!cputime_eq(current->signal->it_virt_expires,
1152 cputime_zero) || 1174 cputime_zero) ||
@@ -1162,8 +1184,6 @@ static task_t *copy_process(unsigned long clone_flags,
1162 */ 1184 */
1163 p->it_prof_expires = jiffies_to_cputime(1); 1185 p->it_prof_expires = jiffies_to_cputime(1);
1164 } 1186 }
1165
1166 spin_unlock(&current->sighand->siglock);
1167 } 1187 }
1168 1188
1169 /* 1189 /*
@@ -1171,24 +1191,27 @@ static task_t *copy_process(unsigned long clone_flags,
1171 */ 1191 */
1172 p->ioprio = current->ioprio; 1192 p->ioprio = current->ioprio;
1173 1193
1174 SET_LINKS(p); 1194 if (likely(p->pid)) {
1175 if (unlikely(p->ptrace & PT_PTRACED)) 1195 add_parent(p);
1176 __ptrace_link(p, current->parent); 1196 if (unlikely(p->ptrace & PT_PTRACED))
1177 1197 __ptrace_link(p, current->parent);
1178 attach_pid(p, PIDTYPE_PID, p->pid); 1198
1179 attach_pid(p, PIDTYPE_TGID, p->tgid); 1199 if (thread_group_leader(p)) {
1180 if (thread_group_leader(p)) { 1200 p->signal->tty = current->signal->tty;
1181 p->signal->tty = current->signal->tty; 1201 p->signal->pgrp = process_group(current);
1182 p->signal->pgrp = process_group(current); 1202 p->signal->session = current->signal->session;
1183 p->signal->session = current->signal->session; 1203 attach_pid(p, PIDTYPE_PGID, process_group(p));
1184 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1204 attach_pid(p, PIDTYPE_SID, p->signal->session);
1185 attach_pid(p, PIDTYPE_SID, p->signal->session); 1205
1186 if (p->pid) 1206 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1187 __get_cpu_var(process_counts)++; 1207 __get_cpu_var(process_counts)++;
1208 }
1209 attach_pid(p, PIDTYPE_PID, p->pid);
1210 nr_threads++;
1188 } 1211 }
1189 1212
1190 nr_threads++;
1191 total_forks++; 1213 total_forks++;
1214 spin_unlock(&current->sighand->siglock);
1192 write_unlock_irq(&tasklist_lock); 1215 write_unlock_irq(&tasklist_lock);
1193 proc_fork_connector(p); 1216 proc_fork_connector(p);
1194 return p; 1217 return p;
@@ -1201,9 +1224,9 @@ bad_fork_cleanup_mm:
1201 if (p->mm) 1224 if (p->mm)
1202 mmput(p->mm); 1225 mmput(p->mm);
1203bad_fork_cleanup_signal: 1226bad_fork_cleanup_signal:
1204 exit_signal(p); 1227 cleanup_signal(p);
1205bad_fork_cleanup_sighand: 1228bad_fork_cleanup_sighand:
1206 exit_sighand(p); 1229 __cleanup_sighand(p->sighand);
1207bad_fork_cleanup_fs: 1230bad_fork_cleanup_fs:
1208 exit_fs(p); /* blocking */ 1231 exit_fs(p); /* blocking */
1209bad_fork_cleanup_files: 1232bad_fork_cleanup_files:
@@ -1250,7 +1273,7 @@ task_t * __devinit fork_idle(int cpu)
1250 if (!task) 1273 if (!task)
1251 return ERR_PTR(-ENOMEM); 1274 return ERR_PTR(-ENOMEM);
1252 init_idle(task, cpu); 1275 init_idle(task, cpu);
1253 unhash_process(task); 1276
1254 return task; 1277 return task;
1255} 1278}
1256 1279
@@ -1285,17 +1308,19 @@ long do_fork(unsigned long clone_flags,
1285{ 1308{
1286 struct task_struct *p; 1309 struct task_struct *p;
1287 int trace = 0; 1310 int trace = 0;
1288 long pid = alloc_pidmap(); 1311 struct pid *pid = alloc_pid();
1312 long nr;
1289 1313
1290 if (pid < 0) 1314 if (!pid)
1291 return -EAGAIN; 1315 return -EAGAIN;
1316 nr = pid->nr;
1292 if (unlikely(current->ptrace)) { 1317 if (unlikely(current->ptrace)) {
1293 trace = fork_traceflag (clone_flags); 1318 trace = fork_traceflag (clone_flags);
1294 if (trace) 1319 if (trace)
1295 clone_flags |= CLONE_PTRACE; 1320 clone_flags |= CLONE_PTRACE;
1296 } 1321 }
1297 1322
1298 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1323 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
1299 /* 1324 /*
1300 * Do this prior waking up the new thread - the thread pointer 1325 * Do this prior waking up the new thread - the thread pointer
1301 * might get invalid after that point, if the thread exits quickly. 1326 * might get invalid after that point, if the thread exits quickly.
@@ -1322,7 +1347,7 @@ long do_fork(unsigned long clone_flags,
1322 p->state = TASK_STOPPED; 1347 p->state = TASK_STOPPED;
1323 1348
1324 if (unlikely (trace)) { 1349 if (unlikely (trace)) {
1325 current->ptrace_message = pid; 1350 current->ptrace_message = nr;
1326 ptrace_notify ((trace << 8) | SIGTRAP); 1351 ptrace_notify ((trace << 8) | SIGTRAP);
1327 } 1352 }
1328 1353
@@ -1332,21 +1357,31 @@ long do_fork(unsigned long clone_flags,
1332 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1357 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1333 } 1358 }
1334 } else { 1359 } else {
1335 free_pidmap(pid); 1360 free_pid(pid);
1336 pid = PTR_ERR(p); 1361 nr = PTR_ERR(p);
1337 } 1362 }
1338 return pid; 1363 return nr;
1339} 1364}
1340 1365
1341#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1366#ifndef ARCH_MIN_MMSTRUCT_ALIGN
1342#define ARCH_MIN_MMSTRUCT_ALIGN 0 1367#define ARCH_MIN_MMSTRUCT_ALIGN 0
1343#endif 1368#endif
1344 1369
1370static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
1371{
1372 struct sighand_struct *sighand = data;
1373
1374 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1375 SLAB_CTOR_CONSTRUCTOR)
1376 spin_lock_init(&sighand->siglock);
1377}
1378
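Because the cache is created with SLAB_DESTROY_BY_RCU, a freed sighand_struct may be reused for another thread before a grace period ends; only the backing page is RCU-deferred, which is why siglock is initialized once here in the ctor rather than per allocation. Readers therefore lock and then re-check identity. A condensed sketch of that revalidation pattern (irq disabling elided; the exact helpers used in the signal code may differ):

static struct sighand_struct *lock_sighand_sketch(struct task_struct *tsk)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (!sighand)
                        break;                   /* task already tore it down */
                spin_lock(&sighand->siglock);    /* valid even if object reused */
                if (likely(sighand == tsk->sighand))
                        break;                   /* still ours: return locked */
                spin_unlock(&sighand->siglock);  /* recycled under us: retry */
        }
        rcu_read_unlock();
        return sighand;
}

The caller drops siglock when done; a NULL return means the task has already passed through __exit_signal().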
1345void __init proc_caches_init(void) 1379void __init proc_caches_init(void)
1346{ 1380{
1347 sighand_cachep = kmem_cache_create("sighand_cache", 1381 sighand_cachep = kmem_cache_create("sighand_cache",
1348 sizeof(struct sighand_struct), 0, 1382 sizeof(struct sighand_struct), 0,
1349 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1383 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1384 sighand_ctor, NULL);
1350 signal_cachep = kmem_cache_create("signal_cache", 1385 signal_cachep = kmem_cache_create("signal_cache",
1351 sizeof(struct signal_struct), 0, 1386 sizeof(struct signal_struct), 0,
1352 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1387 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
@@ -1471,9 +1506,7 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1471 1506
1472 if ((unshare_flags & CLONE_VM) && 1507 if ((unshare_flags & CLONE_VM) &&
1473 (mm && atomic_read(&mm->mm_users) > 1)) { 1508 (mm && atomic_read(&mm->mm_users) > 1)) {
1474 *new_mmp = dup_mm(current); 1509 return -EINVAL;
1475 if (!*new_mmp)
1476 return -ENOMEM;
1477 } 1510 }
1478 1511
1479 return 0; 1512 return 0;
@@ -1529,6 +1562,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1529 1562
1530 check_unshare_flags(&unshare_flags); 1563 check_unshare_flags(&unshare_flags);
1531 1564
1565 /* Return -EINVAL for all unsupported flags */
1566 err = -EINVAL;
1567 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1568 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
1569 goto bad_unshare_out;
1570
1532 if ((err = unshare_thread(unshare_flags))) 1571 if ((err = unshare_thread(unshare_flags)))
1533 goto bad_unshare_out; 1572 goto bad_unshare_out;
1534 if ((err = unshare_fs(unshare_flags, &new_fs))) 1573 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1562,7 +1601,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1562 1601
1563 if (new_sigh) { 1602 if (new_sigh) {
1564 sigh = current->sighand; 1603 sigh = current->sighand;
1565 current->sighand = new_sigh; 1604 rcu_assign_pointer(current->sighand, new_sigh);
1566 new_sigh = sigh; 1605 new_sigh = sigh;
1567 } 1606 }
1568 1607
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f9780..5699c51205 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,172 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
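The comment above describes the userspace half of the contract. A minimal registration sketch, assuming the struct robust_list_head layout from <linux/futex.h> and a libc that defines SYS_set_robust_list (there is no glibc wrapper); a threads library will normally register its own head per thread, so this is purely illustrative:

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* empty list: head->list.next points back at the head itself, which is
 * exactly the termination condition exit_robust_list() checks for */
static struct robust_list_head robust_head = {
        .list            = { &robust_head.list },
        .futex_offset    = 0,     /* lock word offset inside each entry */
        .list_op_pending = NULL,
};

int main(void)
{
        if (syscall(SYS_set_robust_list, &robust_head,
                    sizeof(robust_head)) != 0) {
                perror("set_robust_list");
                return 1;
        }
        printf("robust futex list registered for this thread\n");
        return 0;
}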
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{
918 u32 uval;
919
920retry:
921 if (get_user(uval, uaddr))
922 return -1;
923
924 if ((uval & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval)
937 goto retry;
938
939 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1);
941 }
942 return 0;
943}
944
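handle_futex_death() only acts on lock words whose TID field matches the dying thread. A small userspace model of how a robust lock word is interpreted, using the FUTEX_TID_MASK / FUTEX_WAITERS / FUTEX_OWNER_DIED definitions from <linux/futex.h> (the actual lock/unlock protocol is not shown):

#include <linux/futex.h>
#include <stdio.h>

/* one robust lock word: owner TID in the low bits, FUTEX_WAITERS and
 * FUTEX_OWNER_DIED flags in the top bits */
static void describe(unsigned int uval, unsigned int dead_tid)
{
        if ((uval & FUTEX_TID_MASK) != dead_tid) {
                printf("0x%08x: not owned by dying tid %u, left untouched\n",
                       uval, dead_tid);
                return;
        }
        printf("0x%08x: owned by dying tid %u -> set FUTEX_OWNER_DIED%s\n",
               uval, dead_tid,
               (uval & FUTEX_WAITERS) ? ", wake one waiter" : "");
}

int main(void)
{
        describe(1234, 1234);                  /* held, no waiters */
        describe(1234 | FUTEX_WAITERS, 1234);  /* held, with waiters */
        describe(5678, 1234);                  /* owned by someone else */
        return 0;
}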
945/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters.
948 *
949 * We silently return on any sign of list-walking problem.
950 */
951void exit_robust_list(struct task_struct *curr)
952{
953 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT;
956 unsigned long futex_offset;
957
958 /*
959 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()):
961 */
962 if (get_user(entry, &head->list.next))
963 return;
964 /*
965 * Fetch the relative futex offset:
966 */
967 if (get_user(futex_offset, &head->futex_offset))
968 return;
969 /*
970 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists:
972 */
973 if (get_user(pending, &head->list_op_pending))
974 return;
975 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr);
977
978 while (entry != &head->list) {
979 /*
980 * A pending lock might already be on the list, so
981 * don't process it twice:
982 */
983 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset,
985 curr))
986 return;
987 /*
988 * Fetch the next entry in the list:
989 */
990 if (get_user(entry, &entry->next))
991 return;
992 /*
993 * Avoid excessively long or circular lists:
994 */
995 if (!--limit)
996 break;
997
998 cond_resched();
999 }
1000}
1001
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1003 unsigned long uaddr2, int val2, int val3)
834{ 1004{
@@ -869,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
869 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
870 int val2 = 0; 1040 int val2 = 0;
871 1041
872 if ((op == FUTEX_WAIT) && utime) { 1042 if (utime && (op == FUTEX_WAIT)) {
873 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1043 if (copy_from_user(&t, utime, sizeof(t)) != 0)
874 return -EFAULT; 1044 return -EFAULT;
1045 if (!timespec_valid(&t))
1046 return -EINVAL;
875 timeout = timespec_to_jiffies(&t) + 1; 1047 timeout = timespec_to_jiffies(&t) + 1;
876 } 1048 }
877 /* 1049 /*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 0000000000..1ab6a0ea3d
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,144 @@
1/*
2 * linux/kernel/futex_compat.c
3 *
4 * Futex compatibility routines.
5 *
6 * Copyright 2006, Red Hat, Inc., Ingo Molnar
7 */
8
9#include <linux/linkage.h>
10#include <linux/compat.h>
11#include <linux/futex.h>
12
13#include <asm/uaccess.h>
14
15/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters.
18 *
19 * We silently return on any sign of list-walking problem.
20 */
21void compat_exit_robust_list(struct task_struct *curr)
22{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending;
25 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset;
28
29 /*
30 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()):
32 */
33 if (get_user(uentry, &head->list.next))
34 return;
35 entry = compat_ptr(uentry);
36 /*
37 * Fetch the relative futex offset:
38 */
39 if (get_user(futex_offset, &head->futex_offset))
40 return;
41 /*
42 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists:
44 */
45 if (get_user(upending, &head->list_op_pending))
46 return;
47 pending = compat_ptr(upending);
48 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr);
50
51 while (compat_ptr(uentry) != &head->list) {
52 /*
53 * A pending lock might already be on the list, so
54 * don't process it twice:
55 */
56 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset,
58 curr))
59 return;
60
61 /*
62 * Fetch the next entry in the list:
63 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next))
65 return;
66 entry = compat_ptr(uentry);
67 /*
68 * Avoid excessively long or circular lists:
69 */
70 if (!--limit)
71 break;
72
73 cond_resched();
74 }
75}
76
77asmlinkage long
78compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
79 compat_size_t len)
80{
81 if (unlikely(len != sizeof(*head)))
82 return -EINVAL;
83
84 current->compat_robust_list = head;
85
86 return 0;
87}
88
89asmlinkage long
90compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
91 compat_size_t __user *len_ptr)
92{
93 struct compat_robust_list_head *head;
94 unsigned long ret;
95
96 if (!pid)
97 head = current->compat_robust_list;
98 else {
99 struct task_struct *p;
100
101 ret = -ESRCH;
102 read_lock(&tasklist_lock);
103 p = find_task_by_pid(pid);
104 if (!p)
105 goto err_unlock;
106 ret = -EPERM;
107 if ((current->euid != p->euid) && (current->euid != p->uid) &&
108 !capable(CAP_SYS_PTRACE))
109 goto err_unlock;
110 head = p->compat_robust_list;
111 read_unlock(&tasklist_lock);
112 }
113
114 if (put_user(sizeof(*head), len_ptr))
115 return -EFAULT;
116 return put_user(ptr_to_compat(head), head_ptr);
117
118err_unlock:
119 read_unlock(&tasklist_lock);
120
121 return ret;
122}
123
124asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
125 struct compat_timespec __user *utime, u32 __user *uaddr2,
126 u32 val3)
127{
128 struct timespec t;
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0;
131
132 if (utime && (op == FUTEX_WAIT)) {
133 if (get_compat_timespec(&t, utime))
134 return -EFAULT;
135 if (!timespec_valid(&t))
136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1;
138 }
139 if (op >= FUTEX_REQUEUE)
140 val2 = (int) (unsigned long) utime;
141
142 return do_futex((unsigned long)uaddr, op, val, timeout,
143 (unsigned long)uaddr2, val2, val3);
144}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b6e1757ae..01fa2ae98a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts)
123EXPORT_SYMBOL_GPL(ktime_get_ts); 123EXPORT_SYMBOL_GPL(ktime_get_ts);
124 124
125/* 125/*
126 * Get the coarse grained time at the softirq based on xtime and
127 * wall_to_monotonic.
128 */
129static void hrtimer_get_softirq_time(struct hrtimer_base *base)
130{
131 ktime_t xtim, tomono;
132 unsigned long seq;
133
134 do {
135 seq = read_seqbegin(&xtime_lock);
136 xtim = timespec_to_ktime(xtime);
137 tomono = timespec_to_ktime(wall_to_monotonic);
138
139 } while (read_seqretry(&xtime_lock, seq));
140
141 base[CLOCK_REALTIME].softirq_time = xtim;
142 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
143}
144
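The do/while above is the standard seqlock read loop: sample the sequence, read the protected values, retry if a writer intervened. The same pattern in a generic, hedged form (invented names; needs <linux/seqlock.h>):

static seqlock_t sample_lock = SEQLOCK_UNLOCKED;
static u64 sample_a, sample_b;

static void read_samples(u64 *a, u64 *b)
{
        unsigned long seq;

        do {
                seq = read_seqbegin(&sample_lock);   /* snapshot sequence */
                *a = sample_a;                       /* read a consistent pair */
                *b = sample_b;
        } while (read_seqretry(&sample_lock, seq));  /* writer ran: retry */
}

Writers bracket their updates with write_seqlock()/write_sequnlock(), just as the timekeeping code does for xtime_lock.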
145/*
126 * Functions and macros which are different for UP/SMP systems are kept in a 146 * Functions and macros which are different for UP/SMP systems are kept in a
127 * single place 147 * single place
128 */ 148 */
@@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
246/* 266/*
247 * Divide a ktime value by a nanosecond value 267 * Divide a ktime value by a nanosecond value
248 */ 268 */
249static unsigned long ktime_divns(const ktime_t kt, nsec_t div) 269static unsigned long ktime_divns(const ktime_t kt, s64 div)
250{ 270{
251 u64 dclc, inc, dns; 271 u64 dclc, inc, dns;
252 int sft = 0; 272 int sft = 0;
@@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
281 * hrtimer_forward - forward the timer expiry 301 * hrtimer_forward - forward the timer expiry
282 * 302 *
283 * @timer: hrtimer to forward 303 * @timer: hrtimer to forward
304 * @now: forward past this time
284 * @interval: the interval to forward 305 * @interval: the interval to forward
285 * 306 *
286 * Forward the timer expiry so it will expire in the future. 307 * Forward the timer expiry so it will expire in the future.
287 * Returns the number of overruns. 308 * Returns the number of overruns.
288 */ 309 */
289unsigned long 310unsigned long
290hrtimer_forward(struct hrtimer *timer, ktime_t interval) 311hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
291{ 312{
292 unsigned long orun = 1; 313 unsigned long orun = 1;
293 ktime_t delta, now; 314 ktime_t delta;
294
295 now = timer->base->get_time();
296 315
297 delta = ktime_sub(now, timer->expires); 316 delta = ktime_sub(now, timer->expires);
298 317
@@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval)
303 interval.tv64 = timer->base->resolution.tv64; 322 interval.tv64 = timer->base->resolution.tv64;
304 323
305 if (unlikely(delta.tv64 >= interval.tv64)) { 324 if (unlikely(delta.tv64 >= interval.tv64)) {
306 nsec_t incr = ktime_to_ns(interval); 325 s64 incr = ktime_to_ns(interval);
307 326
308 orun = ktime_divns(delta, incr); 327 orun = ktime_divns(delta, incr);
309 timer->expires = ktime_add_ns(timer->expires, incr * orun); 328 timer->expires = ktime_add_ns(timer->expires, incr * orun);
@@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
355 rb_link_node(&timer->node, parent, link); 374 rb_link_node(&timer->node, parent, link);
356 rb_insert_color(&timer->node, &base->active); 375 rb_insert_color(&timer->node, &base->active);
357 376
358 timer->state = HRTIMER_PENDING;
359
360 if (!base->first || timer->expires.tv64 < 377 if (!base->first || timer->expires.tv64 <
361 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 378 rb_entry(base->first, struct hrtimer, node)->expires.tv64)
362 base->first = &timer->node; 379 base->first = &timer->node;
@@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
376 if (base->first == &timer->node) 393 if (base->first == &timer->node)
377 base->first = rb_next(&timer->node); 394 base->first = rb_next(&timer->node);
378 rb_erase(&timer->node, &base->active); 395 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE;
379} 397}
380 398
381/* 399/*
@@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
386{ 404{
387 if (hrtimer_active(timer)) { 405 if (hrtimer_active(timer)) {
388 __remove_hrtimer(timer, base); 406 __remove_hrtimer(timer, base);
389 timer->state = HRTIMER_INACTIVE;
390 return 1; 407 return 1;
391 } 408 }
392 return 0; 409 return 0;
@@ -418,8 +435,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
418 /* Switch the timer base, if necessary: */ 435 /* Switch the timer base, if necessary: */
419 new_base = switch_hrtimer_base(timer, base); 436 new_base = switch_hrtimer_base(timer, base);
420 437
421 if (mode == HRTIMER_REL) 438 if (mode == HRTIMER_REL) {
422 tim = ktime_add(tim, new_base->get_time()); 439 tim = ktime_add(tim, new_base->get_time());
440 /*
441 * CONFIG_TIME_LOW_RES is a temporary way for architectures
442 * to signal that they simply return xtime in
443 * do_gettimeoffset(). In this case we want to round up by
444 * resolution when starting a relative timer, to avoid short
445 * timeouts. This will go away with the GTOD framework.
446 */
447#ifdef CONFIG_TIME_LOW_RES
448 tim = ktime_add(tim, base->resolution);
449#endif
450 }
423 timer->expires = tim; 451 timer->expires = tim;
424 452
425 enqueue_hrtimer(timer, new_base); 453 enqueue_hrtimer(timer, new_base);
@@ -428,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
428 456
429 return ret; 457 return ret;
430} 458}
459EXPORT_SYMBOL_GPL(hrtimer_start);
431 460
432/** 461/**
433 * hrtimer_try_to_cancel - try to deactivate a timer 462 * hrtimer_try_to_cancel - try to deactivate a timer
@@ -456,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
456 return ret; 485 return ret;
457 486
458} 487}
488EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
459 489
460/** 490/**
461 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 491 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
@@ -473,8 +503,10 @@ int hrtimer_cancel(struct hrtimer *timer)
473 503
474 if (ret >= 0) 504 if (ret >= 0)
475 return ret; 505 return ret;
506 cpu_relax();
476 } 507 }
477} 508}
509EXPORT_SYMBOL_GPL(hrtimer_cancel);
478 510
479/** 511/**
480 * hrtimer_get_remaining - get remaining time for the timer 512 * hrtimer_get_remaining - get remaining time for the timer
@@ -493,6 +525,42 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
493 525
494 return rem; 526 return rem;
495} 527}
528EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529
530#ifdef CONFIG_NO_IDLE_HZ
531/**
532 * hrtimer_get_next_event - get the time until next expiry event
533 *
534 * Returns the delta to the next expiry event or KTIME_MAX if no timer
535 * is pending.
536 */
537ktime_t hrtimer_get_next_event(void)
538{
539 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
540 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
541 unsigned long flags;
542 int i;
543
544 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
545 struct hrtimer *timer;
546
547 spin_lock_irqsave(&base->lock, flags);
548 if (!base->first) {
549 spin_unlock_irqrestore(&base->lock, flags);
550 continue;
551 }
552 timer = rb_entry(base->first, struct hrtimer, node);
553 delta.tv64 = timer->expires.tv64;
554 spin_unlock_irqrestore(&base->lock, flags);
555 delta = ktime_sub(delta, base->get_time());
556 if (delta.tv64 < mindelta.tv64)
557 mindelta.tv64 = delta.tv64;
558 }
559 if (mindelta.tv64 < 0)
560 mindelta.tv64 = 0;
561 return mindelta;
562}
563#endif
496 564
497/** 565/**
498 * hrtimer_init - initialize a timer to the given clock 566 * hrtimer_init - initialize a timer to the given clock
@@ -514,7 +582,9 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
514 clock_id = CLOCK_MONOTONIC; 582 clock_id = CLOCK_MONOTONIC;
515 583
516 timer->base = &bases[clock_id]; 584 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE;
517} 586}
587EXPORT_SYMBOL_GPL(hrtimer_init);
518 588
519/** 589/**
520 * hrtimer_get_res - get the timer resolution for a clock 590 * hrtimer_get_res - get the timer resolution for a clock
@@ -534,54 +604,45 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
534 604
535 return 0; 605 return 0;
536} 606}
607EXPORT_SYMBOL_GPL(hrtimer_get_res);
537 608
538/* 609/*
539 * Expire the per base hrtimer-queue: 610 * Expire the per base hrtimer-queue:
540 */ 611 */
541static inline void run_hrtimer_queue(struct hrtimer_base *base) 612static inline void run_hrtimer_queue(struct hrtimer_base *base)
542{ 613{
543 ktime_t now = base->get_time();
544 struct rb_node *node; 614 struct rb_node *node;
545 615
616 if (!base->first)
617 return;
618
619 if (base->get_softirq_time)
620 base->softirq_time = base->get_softirq_time();
621
546 spin_lock_irq(&base->lock); 622 spin_lock_irq(&base->lock);
547 623
548 while ((node = base->first)) { 624 while ((node = base->first)) {
549 struct hrtimer *timer; 625 struct hrtimer *timer;
550 int (*fn)(void *); 626 int (*fn)(struct hrtimer *);
551 int restart; 627 int restart;
552 void *data;
553 628
554 timer = rb_entry(node, struct hrtimer, node); 629 timer = rb_entry(node, struct hrtimer, node);
555 if (now.tv64 <= timer->expires.tv64) 630 if (base->softirq_time.tv64 <= timer->expires.tv64)
556 break; 631 break;
557 632
558 fn = timer->function; 633 fn = timer->function;
559 data = timer->data;
560 set_curr_timer(base, timer); 634 set_curr_timer(base, timer);
561 timer->state = HRTIMER_RUNNING;
562 __remove_hrtimer(timer, base); 635 __remove_hrtimer(timer, base);
563 spin_unlock_irq(&base->lock); 636 spin_unlock_irq(&base->lock);
564 637
565 /* 638 restart = fn(timer);
566 * fn == NULL is special case for the simplest timer
567 * variant - wake up process and do not restart:
568 */
569 if (!fn) {
570 wake_up_process(data);
571 restart = HRTIMER_NORESTART;
572 } else
573 restart = fn(data);
574 639
575 spin_lock_irq(&base->lock); 640 spin_lock_irq(&base->lock);
576 641
577 /* Another CPU has added back the timer */ 642 if (restart != HRTIMER_NORESTART) {
578 if (timer->state != HRTIMER_RUNNING) 643 BUG_ON(hrtimer_active(timer));
579 continue;
580
581 if (restart == HRTIMER_RESTART)
582 enqueue_hrtimer(timer, base); 644 enqueue_hrtimer(timer, base);
583 else 645 }
584 timer->state = HRTIMER_EXPIRED;
585 } 646 }
586 set_curr_timer(base, NULL); 647 set_curr_timer(base, NULL);
587 spin_unlock_irq(&base->lock); 648 spin_unlock_irq(&base->lock);
@@ -595,6 +656,8 @@ void hrtimer_run_queues(void)
595 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 656 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
596 int i; 657 int i;
597 658
659 hrtimer_get_softirq_time(base);
660
598 for (i = 0; i < MAX_HRTIMER_BASES; i++) 661 for (i = 0; i < MAX_HRTIMER_BASES; i++)
599 run_hrtimer_queue(&base[i]); 662 run_hrtimer_queue(&base[i]);
600} 663}
@@ -602,80 +665,69 @@ void hrtimer_run_queues(void)
602/* 665/*
603 * Sleep related functions: 666 * Sleep related functions:
604 */ 667 */
605 668static int hrtimer_wakeup(struct hrtimer *timer)
606/**
607 * schedule_hrtimer - sleep until timeout
608 *
609 * @timer: hrtimer variable initialized with the correct clock base
610 * @mode: timeout value is abs/rel
611 *
612 * Make the current task sleep until @timeout is
613 * elapsed.
614 *
615 * You can set the task state as follows -
616 *
617 * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
618 * pass before the routine returns. The routine will return 0
619 *
620 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
621 * delivered to the current task. In this case the remaining time
622 * will be returned
623 *
624 * The current task state is guaranteed to be TASK_RUNNING when this
625 * routine returns.
626 */
627static ktime_t __sched
628schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
629{ 669{
630 /* fn stays NULL, meaning single-shot wakeup: */ 670 struct hrtimer_sleeper *t =
631 timer->data = current; 671 container_of(timer, struct hrtimer_sleeper, timer);
672 struct task_struct *task = t->task;
632 673
633 hrtimer_start(timer, timer->expires, mode); 674 t->task = NULL;
675 if (task)
676 wake_up_process(task);
634 677
635 schedule(); 678 return HRTIMER_NORESTART;
636 hrtimer_cancel(timer); 679}
637 680
638 /* Return the remaining time: */ 681void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
639 if (timer->state != HRTIMER_EXPIRED) 682{
640 return ktime_sub(timer->expires, timer->base->get_time()); 683 sl->timer.function = hrtimer_wakeup;
641 else 684 sl->task = task;
642 return (ktime_t) {.tv64 = 0 };
643} 685}
644 686
645static inline ktime_t __sched 687static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
646schedule_hrtimer_interruptible(struct hrtimer *timer,
647 const enum hrtimer_mode mode)
648{ 688{
649 set_current_state(TASK_INTERRUPTIBLE); 689 hrtimer_init_sleeper(t, current);
690
691 do {
692 set_current_state(TASK_INTERRUPTIBLE);
693 hrtimer_start(&t->timer, t->timer.expires, mode);
694
695 schedule();
696
697 hrtimer_cancel(&t->timer);
698 mode = HRTIMER_ABS;
650 699
651 return schedule_hrtimer(timer, mode); 700 } while (t->task && !signal_pending(current));
701
702 return t->task == NULL;
652} 703}
653 704
654static long __sched nanosleep_restart(struct restart_block *restart) 705static long __sched nanosleep_restart(struct restart_block *restart)
655{ 706{
707 struct hrtimer_sleeper t;
656 struct timespec __user *rmtp; 708 struct timespec __user *rmtp;
657 struct timespec tu; 709 struct timespec tu;
658 void *rfn_save = restart->fn; 710 ktime_t time;
659 struct hrtimer timer;
660 ktime_t rem;
661 711
662 restart->fn = do_no_restart_syscall; 712 restart->fn = do_no_restart_syscall;
663 713
664 hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); 714 hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
665 715 t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
666 timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
667
668 rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
669 716
670 if (rem.tv64 <= 0) 717 if (do_nanosleep(&t, HRTIMER_ABS))
671 return 0; 718 return 0;
672 719
673 rmtp = (struct timespec __user *) restart->arg2; 720 rmtp = (struct timespec __user *) restart->arg2;
674 tu = ktime_to_timespec(rem); 721 if (rmtp) {
675 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 722 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
676 return -EFAULT; 723 if (time.tv64 <= 0)
724 return 0;
725 tu = ktime_to_timespec(time);
726 if (copy_to_user(rmtp, &tu, sizeof(tu)))
727 return -EFAULT;
728 }
677 729
678 restart->fn = rfn_save; 730 restart->fn = nanosleep_restart;
679 731
680 /* The other values in restart are already filled in */ 732 /* The other values in restart are already filled in */
681 return -ERESTART_RESTARTBLOCK; 733 return -ERESTART_RESTARTBLOCK;
@@ -685,33 +737,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
685 const enum hrtimer_mode mode, const clockid_t clockid) 737 const enum hrtimer_mode mode, const clockid_t clockid)
686{ 738{
687 struct restart_block *restart; 739 struct restart_block *restart;
688 struct hrtimer timer; 740 struct hrtimer_sleeper t;
689 struct timespec tu; 741 struct timespec tu;
690 ktime_t rem; 742 ktime_t rem;
691 743
692 hrtimer_init(&timer, clockid, mode); 744 hrtimer_init(&t.timer, clockid, mode);
693 745 t.timer.expires = timespec_to_ktime(*rqtp);
694 timer.expires = timespec_to_ktime(*rqtp); 746 if (do_nanosleep(&t, mode))
695
696 rem = schedule_hrtimer_interruptible(&timer, mode);
697 if (rem.tv64 <= 0)
698 return 0; 747 return 0;
699 748
700 /* Absolute timers do not update the rmtp value and restart: */ 749 /* Absolute timers do not update the rmtp value and restart: */
701 if (mode == HRTIMER_ABS) 750 if (mode == HRTIMER_ABS)
702 return -ERESTARTNOHAND; 751 return -ERESTARTNOHAND;
703 752
704 tu = ktime_to_timespec(rem); 753 if (rmtp) {
705 754 rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
706 if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) 755 if (rem.tv64 <= 0)
707 return -EFAULT; 756 return 0;
757 tu = ktime_to_timespec(rem);
758 if (copy_to_user(rmtp, &tu, sizeof(tu)))
759 return -EFAULT;
760 }
708 761
709 restart = &current_thread_info()->restart_block; 762 restart = &current_thread_info()->restart_block;
710 restart->fn = nanosleep_restart; 763 restart->fn = nanosleep_restart;
711 restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; 764 restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
712 restart->arg1 = timer.expires.tv64 >> 32; 765 restart->arg1 = t.timer.expires.tv64 >> 32;
713 restart->arg2 = (unsigned long) rmtp; 766 restart->arg2 = (unsigned long) rmtp;
714 restart->arg3 = (unsigned long) timer.base->index; 767 restart->arg3 = (unsigned long) t.timer.base->index;
715 768
716 return -ERESTART_RESTARTBLOCK; 769 return -ERESTART_RESTARTBLOCK;
717} 770}
@@ -789,7 +842,7 @@ static void migrate_hrtimers(int cpu)
789} 842}
790#endif /* CONFIG_HOTPLUG_CPU */ 843#endif /* CONFIG_HOTPLUG_CPU */
791 844
792static int __devinit hrtimer_cpu_notify(struct notifier_block *self, 845static int hrtimer_cpu_notify(struct notifier_block *self,
793 unsigned long action, void *hcpu) 846 unsigned long action, void *hcpu)
794{ 847{
795 long cpu = (long)hcpu; 848 long cpu = (long)hcpu;
@@ -813,7 +866,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
813 return NOTIFY_OK; 866 return NOTIFY_OK;
814} 867}
815 868
816static struct notifier_block __devinitdata hrtimers_nb = { 869static struct notifier_block hrtimers_nb = {
817 .notifier_call = hrtimer_cpu_notify, 870 .notifier_call = hrtimer_cpu_notify,
818}; 871};
819 872
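
Both hrtimer_nanosleep() and nanosleep_restart() above squeeze the 64-bit ktime expiry into the two 32-bit arg slots of the restart block and reassemble it on restart. A self-contained sketch of that packing, assuming nothing beyond standard C:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t expires = 0x123456789abcdef0ULL;       /* ktime expiry in ns */

    /* store: two 32-bit restart_block slots */
    unsigned long arg0 = expires & 0xFFFFFFFFUL;    /* low half  */
    unsigned long arg1 = expires >> 32;             /* high half */

    /* restart: put the halves back together */
    uint64_t again = ((uint64_t)arg1 << 32) | (uint64_t)arg0;

    printf("%s\n", again == expires ? "expiry round-trips" : "broken");
    return 0;
}
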
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 49378738ff..9f77f50d81 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,4 +2,4 @@
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 97d5559997..1279e34995 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
204 p = &desc->action; 204 p = &desc->action;
205 if ((old = *p) != NULL) { 205 if ((old = *p) != NULL) {
206 /* Can't share interrupts unless both agree to */ 206 /* Can't share interrupts unless both agree to */
207 if (!(old->flags & new->flags & SA_SHIRQ)) { 207 if (!(old->flags & new->flags & SA_SHIRQ))
208 spin_unlock_irqrestore(&desc->lock,flags); 208 goto mismatch;
209 return -EBUSY; 209
210 } 210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
211 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
213 goto mismatch;
214#endif
211 215
212 /* add new interrupt at end of irq queue */ 216 /* add new interrupt at end of irq queue */
213 do { 217 do {
@@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
218 } 222 }
219 223
220 *p = new; 224 *p = new;
221 225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
226 if (new->flags & SA_PERCPU_IRQ)
227 desc->status |= IRQ_PER_CPU;
228#endif
222 if (!shared) { 229 if (!shared) {
223 desc->depth = 0; 230 desc->depth = 0;
224 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -236,6 +243,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
236 register_handler_proc(irq, new); 243 register_handler_proc(irq, new);
237 244
238 return 0; 245 return 0;
246
247mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags);
249 if (!(new->flags & SA_PROBEIRQ)) {
250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
251 dump_stack();
252 }
253 return -EBUSY;
239} 254}
240 255
241/** 256/**
@@ -258,6 +273,7 @@ void free_irq(unsigned int irq, void *dev_id)
258 struct irqaction **p; 273 struct irqaction **p;
259 unsigned long flags; 274 unsigned long flags;
260 275
276 WARN_ON(in_interrupt());
261 if (irq >= NR_IRQS) 277 if (irq >= NR_IRQS)
262 return; 278 return;
263 279
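
The new mismatch path formalizes the rule that handlers sharing an IRQ line must agree on their flags: every one of them asks for SA_SHIRQ, and all of them agree on per-CPU-ness. A standalone sketch of that agreement check; the flag bit values below are placeholders, not the kernel's:

#include <stdio.h>

#define SA_SHIRQ    0x04000000u     /* placeholder bit values */
#define IRQ_PER_CPU 0x00000100u

static int flags_compatible(unsigned int oldf, unsigned int newf)
{
    if (!(oldf & newf & SA_SHIRQ))  /* both must agree to share */
        return 0;
    /* and both must agree on per-CPU-ness */
    if ((oldf & IRQ_PER_CPU) != (newf & IRQ_PER_CPU))
        return 0;
    return 1;
}

int main(void)
{
    printf("%d\n", flags_compatible(SA_SHIRQ, SA_SHIRQ));               /* 1 */
    printf("%d\n", flags_compatible(SA_SHIRQ, SA_SHIRQ | IRQ_PER_CPU)); /* 0 */
    return 0;
}
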
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 0000000000..134f9f2e0e
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,62 @@
1
2#include <linux/irq.h>
3
4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{
6 irq_desc_t *desc = irq_desc + irq;
7 unsigned long flags;
8
9 spin_lock_irqsave(&desc->lock, flags);
10 desc->move_irq = 1;
11 pending_irq_cpumask[irq] = mask;
12 spin_unlock_irqrestore(&desc->lock, flags);
13}
14
15void move_native_irq(int irq)
16{
17 cpumask_t tmp;
18 irq_desc_t *desc = irq_descp(irq);
19
20 if (likely(!desc->move_irq))
21 return;
22
23 /*
24 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
25 */
26 if (CHECK_IRQ_PER_CPU(desc->status)) {
27 WARN_ON(1);
28 return;
29 }
30
31 desc->move_irq = 0;
32
33 if (likely(cpus_empty(pending_irq_cpumask[irq])))
34 return;
35
36 if (!desc->handler->set_affinity)
37 return;
38
39 assert_spin_locked(&desc->lock);
40
41 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
42
43 /*
44 * If there was a valid mask to work with, please
45 * do the disable, re-program, enable sequence.
46 * This is *not* particularly important for level-triggered
47 * sources, but in an edge-triggered case we might be setting the
48 * RTE while an active trigger is coming in. This could
49 * cause some IO-APICs to malfunction.
50 * Being paranoid, I guess!
51 */
52 if (unlikely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq);
55
56 desc->handler->set_affinity(irq,tmp);
57
58 if (likely(!(desc->status & IRQ_DISABLED)))
59 desc->handler->enable(irq);
60 }
61 cpus_clear(pending_irq_cpumask[irq]);
62}
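
move_native_irq() only reprograms the interrupt when the pending mask intersected with the online CPUs is non-empty, and it brackets set_affinity() with disable/enable to stay safe for edge-triggered sources. A userspace sketch of that decision, with plain bitmaps standing in for cpumask_t:

#include <stdio.h>

static void migrate_if_possible(unsigned long pending, unsigned long online)
{
    unsigned long target = pending & online;    /* cpus_and() */

    if (!target) {
        puts("no online CPU in the pending mask; leave the IRQ alone");
        return;
    }
    /* disable -> set_affinity -> enable: the edge-trigger-safe order */
    printf("disable irq, set affinity to 0x%lx, re-enable irq\n", target);
}

int main(void)
{
    migrate_if_possible(0x6, 0x5);  /* CPUs {1,2} requested, {0,2} online */
    migrate_if_possible(0x8, 0x7);  /* requested CPU is offline: skipped */
    return 0;
}
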
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 379be2f8c8..204ed7939e 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,21 +128,75 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(void *data) 131int it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct task_struct *tsk = (struct task_struct *) data; 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer);
134 135
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
136
137 if (tsk->signal->it_real_incr.tv64 != 0) {
138 hrtimer_forward(&tsk->signal->real_timer,
139 tsk->signal->it_real_incr);
140 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART; 141 return HRTIMER_RESTART;
142 } 142 }
143 return HRTIMER_NORESTART; 143 return HRTIMER_NORESTART;
144} 144}
145 145
146/*
147 * We do not care about correctness. We just sanitize the values so
148 * the ktime_t operations which expect normalized values do not
149 * break. This converts negative values to long timeouts similar to
150 * the code in kernel versions < 2.6.16
151 *
152 * Print a limited number of warning messages when an invalid timeval
153 * is detected.
154 */
155static void fixup_timeval(struct timeval *tv, int interval)
156{
157 static int warnlimit = 10;
158 unsigned long tmp;
159
160 if (warnlimit > 0) {
161 warnlimit--;
162 printk(KERN_WARNING
163 "setitimer: %s (pid = %d) provided "
164 "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
165 current->comm, current->pid,
166 interval ? "it_interval" : "it_value",
167 tv->tv_sec, (long) tv->tv_usec);
168 }
169
170 tmp = tv->tv_usec;
171 if (tmp >= USEC_PER_SEC) {
172 tv->tv_usec = tmp % USEC_PER_SEC;
173 tv->tv_sec += tmp / USEC_PER_SEC;
174 }
175
176 tmp = tv->tv_sec;
177 if (tmp > LONG_MAX)
178 tv->tv_sec = LONG_MAX;
179}
180
181/*
182 * Returns true if the timeval is in canonical form
183 */
184#define timeval_valid(t) \
185 (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
186
187/*
188 * Check for invalid timevals, sanitize them and print a limited
189 * number of warnings.
190 */
191static void check_itimerval(struct itimerval *value) {
192
193 if (unlikely(!timeval_valid(&value->it_value)))
194 fixup_timeval(&value->it_value, 0);
195
196 if (unlikely(!timeval_valid(&value->it_interval)))
197 fixup_timeval(&value->it_interval, 1);
198}
199
146int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 200int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
147{ 201{
148 struct task_struct *tsk = current; 202 struct task_struct *tsk = current;
@@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
150 ktime_t expires; 204 ktime_t expires;
151 cputime_t cval, cinterval, nval, ninterval; 205 cputime_t cval, cinterval, nval, ninterval;
152 206
207 /*
208 * Validate the timevals in value.
209 *
210 * Note: Although the spec requires that invalid values shall
211 * return -EINVAL, we just fixup the value and print a limited
212 * number of warnings in order not to break users of this
213 * historical misfeature.
214 *
215 * Scheduled for replacement in March 2007
216 */
217 check_itimerval(value);
218
153 switch (which) { 219 switch (which) {
154 case ITIMER_REAL: 220 case ITIMER_REAL:
155again: 221again:
@@ -226,6 +292,43 @@ again:
226 return 0; 292 return 0;
227} 293}
228 294
295/**
296 * alarm_setitimer - set alarm in seconds
297 *
298 * @seconds: number of seconds until alarm
299 * 0 disables the alarm
300 *
301 * Returns the remaining time in seconds of a pending timer or 0 when
302 * the timer is not active.
303 *
304 * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
305 * negative timeval settings which would cause immediate expiry.
306 */
307unsigned int alarm_setitimer(unsigned int seconds)
308{
309 struct itimerval it_new, it_old;
310
311#if BITS_PER_LONG < 64
312 if (seconds > INT_MAX)
313 seconds = INT_MAX;
314#endif
315 it_new.it_value.tv_sec = seconds;
316 it_new.it_value.tv_usec = 0;
317 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
318
319 do_setitimer(ITIMER_REAL, &it_new, &it_old);
320
321 /*
322 * We can't return 0 if we have an alarm pending ... And we'd
323 * better return too much rather than too little anyway
324 */
325 if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
326 it_old.it_value.tv_usec >= 500000)
327 it_old.it_value.tv_sec++;
328
329 return it_old.it_value.tv_sec;
330}
331
229asmlinkage long sys_setitimer(int which, 332asmlinkage long sys_setitimer(int which,
230 struct itimerval __user *value, 333 struct itimerval __user *value,
231 struct itimerval __user *ovalue) 334 struct itimerval __user *ovalue)
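
check_itimerval() keeps the historical behaviour of accepting denormalized timevals: excess microseconds are folded into the seconds field and absurd values are clamped rather than rejected with -EINVAL. A userspace sketch of that sanitizing step (USEC_PER_SEC and the LONG_MAX clamp mirror the patch; the rest is illustrative):

#include <limits.h>
#include <stdio.h>
#include <sys/time.h>

#define USEC_PER_SEC 1000000L

static int timeval_valid(const struct timeval *t)
{
    return t->tv_sec >= 0 && (unsigned long)t->tv_usec < USEC_PER_SEC;
}

static void fixup_timeval(struct timeval *tv)
{
    unsigned long tmp = tv->tv_usec;

    if (tmp >= USEC_PER_SEC) {              /* carry whole seconds over */
        tv->tv_usec = tmp % USEC_PER_SEC;
        tv->tv_sec += tmp / USEC_PER_SEC;
    }
    if ((unsigned long)tv->tv_sec > LONG_MAX)
        tv->tv_sec = LONG_MAX;              /* clamp negative/huge values */
}

int main(void)
{
    struct timeval tv = { .tv_sec = 1, .tv_usec = 2500000 };  /* 2.5s of usec */

    if (!timeval_valid(&tv))
        fixup_timeval(&tv);
    printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);  /* 3.500000 */
    return 0;
}
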
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063a..20a997c73c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
170 sa.sa.sa_handler = SIG_IGN; 170 sa.sa.sa_handler = SIG_IGN;
171 sa.sa.sa_flags = 0; 171 sa.sa.sa_flags = 0;
172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); 172 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
173 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); 173 do_sigaction(SIGCHLD, &sa, NULL);
174 allow_signal(SIGCHLD); 174 allow_signal(SIGCHLD);
175 175
176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 176 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73..1fbf466a29 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
323} 323}
324 324
325/* 325/*
326 * This function is called from exit_thread or flush_thread when task tk's 326 * This function is called from finish_task_switch when task tk becomes dead,
327 * stack is being recycled so that we can recycle any function-return probe 327 * so that we can recycle any function-return probe instances associated
328 * instances associated with this task. These left over instances represent 328 * with this task. These left over instances represent probed functions
329 * probed functions that have been called but will never return. 329 * that have been called but will never return.
330 */ 330 */
331void __kprobes kprobe_flush_task(struct task_struct *tk) 331void __kprobes kprobe_flush_task(struct task_struct *tk)
332{ 332{
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
336 unsigned long flags = 0; 336 unsigned long flags = 0;
337 337
338 spin_lock_irqsave(&kretprobe_lock, flags); 338 spin_lock_irqsave(&kretprobe_lock, flags);
339 head = kretprobe_inst_table_head(current); 339 head = kretprobe_inst_table_head(tk);
340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
341 if (ri->task == tk) 341 if (ri->task == tk)
342 recycle_rp_inst(ri); 342 recycle_rp_inst(ri);
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 down(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479out: 479out:
480 up(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 down(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 up(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 up(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513valid_p: 513valid_p:
@@ -523,7 +523,7 @@ valid_p:
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 up(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
@@ -585,6 +585,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
585 int i; 585 int i;
586 586
587 rp->kp.pre_handler = pre_handler_kretprobe; 587 rp->kp.pre_handler = pre_handler_kretprobe;
588 rp->kp.post_handler = NULL;
589 rp->kp.fault_handler = NULL;
590 rp->kp.break_handler = NULL;
588 591
589 /* Pre-allocate memory for max kretprobe instances */ 592 /* Pre-allocate memory for max kretprobe instances */
590 if (rp->maxactive <= 0) { 593 if (rp->maxactive <= 0) {
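
The kprobe_flush_task() fix walks the bucket of the task being torn down (tk) rather than current, recycling every return-probe instance that belongs to it. A simplified sketch of that per-task flush, with a plain singly linked list standing in for the hlist and free() for recycle_rp_inst():

#include <stdio.h>
#include <stdlib.h>

struct rp_inst {
    int             task_id;    /* stands in for ri->task */
    struct rp_inst *next;
};

static struct rp_inst *flush_task(struct rp_inst *head, int tk)
{
    struct rp_inst **pp = &head, *ri;

    while ((ri = *pp)) {
        if (ri->task_id == tk) {    /* belongs to the dying task */
            *pp = ri->next;         /* unlink ... */
            free(ri);               /* ... and recycle it */
        } else {
            pp = &ri->next;
        }
    }
    return head;
}

int main(void)
{
    struct rp_inst *head = NULL;
    int left = 0;

    for (int pid = 1; pid <= 3; pid++) {    /* instances for tasks 1, 0, 1 */
        struct rp_inst *ri = malloc(sizeof(*ri));

        if (!ri)
            return 1;
        ri->task_id = pid % 2;
        ri->next = head;
        head = ri;
    }

    head = flush_task(head, 1);             /* task 1 is being torn down */

    for (struct rp_inst *ri = head; ri; ri = ri->next)
        left++;
    printf("%d instance(s) left\n", left);  /* only the task-0 instance */
    return 0;
}
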
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d5eeae0fa5..f119e098e6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,9 +15,6 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17 17
18u64 uevent_seqnum;
19char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
20
21#define KERNEL_ATTR_RO(_name) \ 18#define KERNEL_ATTR_RO(_name) \
22static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 19static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
23 20
@@ -25,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
25static struct subsys_attribute _name##_attr = \ 22static struct subsys_attribute _name##_attr = \
26 __ATTR(_name, 0644, _name##_show, _name##_store) 23 __ATTR(_name, 0644, _name##_show, _name##_store)
27 24
28#ifdef CONFIG_HOTPLUG 25#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
29/* current uevent sequence number */ 26/* current uevent sequence number */
30static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) 27static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
31{ 28{
@@ -55,7 +52,7 @@ decl_subsys(kernel, NULL, NULL);
55EXPORT_SYMBOL_GPL(kernel_subsys); 52EXPORT_SYMBOL_GPL(kernel_subsys);
56 53
57static struct attribute * kernel_attrs[] = { 54static struct attribute * kernel_attrs[] = {
58#ifdef CONFIG_HOTPLUG 55#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
59 &uevent_seqnum_attr.attr, 56 &uevent_seqnum_attr.attr,
60 &uevent_helper_attr.attr, 57 &uevent_helper_attr.attr,
61#endif 58#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a109..c5f3c6613b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
12#include <linux/unistd.h> 12#include <linux/unistd.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h>
15#include <asm/semaphore.h> 16#include <asm/semaphore.h>
16 17
17/* 18/*
@@ -41,7 +42,7 @@ struct kthread_stop_info
41 42
42/* Thread stopping is done by setting this var: lock serializes 43 * multiple kthread_stop calls. */
43 * multiple kthread_stop calls. */ 44 * multiple kthread_stop calls. */
44static DECLARE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
45static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
46 47
47int kthread_should_stop(void) 48int kthread_should_stop(void)
@@ -114,7 +115,9 @@ static void keventd_create_kthread(void *_create)
114 create->result = ERR_PTR(pid); 115 create->result = ERR_PTR(pid);
115 } else { 116 } else {
116 wait_for_completion(&create->started); 117 wait_for_completion(&create->started);
118 read_lock(&tasklist_lock);
117 create->result = find_task_by_pid(pid); 119 create->result = find_task_by_pid(pid);
120 read_unlock(&tasklist_lock);
118 } 121 }
119 complete(&create->done); 122 complete(&create->done);
120} 123}
@@ -173,7 +176,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{ 176{
174 int ret; 177 int ret;
175 178
176 down(&kthread_stop_lock); 179 mutex_lock(&kthread_stop_lock);
177 180
178 /* It could exit after stop_info.k set, but before wake_up_process. */ 181 /* It could exit after stop_info.k set, but before wake_up_process. */
179 get_task_struct(k); 182 get_task_struct(k);
@@ -194,7 +197,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
194 wait_for_completion(&kthread_stop_info.done); 197 wait_for_completion(&kthread_stop_info.done);
195 kthread_stop_info.k = NULL; 198 kthread_stop_info.k = NULL;
196 ret = kthread_stop_info.err; 199 ret = kthread_stop_info.err;
197 up(&kthread_stop_lock); 200 mutex_unlock(&kthread_stop_lock);
198 201
199 return ret; 202 return ret;
200} 203}
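
keventd_create_kthread() now does the pid-to-task lookup under a read lock on tasklist_lock, so the freshly created thread cannot vanish between creation and lookup. A userspace analogue of that guarded lookup, using a pthread rwlock and a small array as stand-ins for the tasklist:

#include <pthread.h>
#include <stdio.h>

#define NTASKS 4

static pthread_rwlock_t tasklist_lock = PTHREAD_RWLOCK_INITIALIZER;
static int tasks[NTASKS] = { 100, 101, 102, 103 };   /* fake pid table */

static int *find_task_by_pid(int pid)
{
    for (int i = 0; i < NTASKS; i++)
        if (tasks[i] == pid)
            return &tasks[i];
    return NULL;
}

int main(void)
{
    int *p;

    pthread_rwlock_rdlock(&tasklist_lock);
    p = find_task_by_pid(102);               /* safe: writers are excluded */
    pthread_rwlock_unlock(&tasklist_lock);

    printf("found: %s\n", p ? "yes" : "no");
    return 0;
}
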
diff --git a/kernel/module.c b/kernel/module.c
index 5aad477ddc..bbe04862e1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
39#include <linux/device.h> 39#include <linux/device.h>
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/semaphore.h> 44#include <asm/semaphore.h>
44#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
@@ -60,29 +61,20 @@
60static DEFINE_SPINLOCK(modlist_lock); 61static DEFINE_SPINLOCK(modlist_lock);
61 62
62/* List of modules, protected by module_mutex AND modlist_lock */ 63/* List of modules, protected by module_mutex AND modlist_lock */
63static DECLARE_MUTEX(module_mutex); 64static DEFINE_MUTEX(module_mutex);
64static LIST_HEAD(modules); 65static LIST_HEAD(modules);
65 66
66static DECLARE_MUTEX(notify_mutex); 67static BLOCKING_NOTIFIER_HEAD(module_notify_list);
67static struct notifier_block * module_notify_list;
68 68
69int register_module_notifier(struct notifier_block * nb) 69int register_module_notifier(struct notifier_block * nb)
70{ 70{
71 int err; 71 return blocking_notifier_chain_register(&module_notify_list, nb);
72 down(&notify_mutex);
73 err = notifier_chain_register(&module_notify_list, nb);
74 up(&notify_mutex);
75 return err;
76} 72}
77EXPORT_SYMBOL(register_module_notifier); 73EXPORT_SYMBOL(register_module_notifier);
78 74
79int unregister_module_notifier(struct notifier_block * nb) 75int unregister_module_notifier(struct notifier_block * nb)
80{ 76{
81 int err; 77 return blocking_notifier_chain_unregister(&module_notify_list, nb);
82 down(&notify_mutex);
83 err = notifier_chain_unregister(&module_notify_list, nb);
84 up(&notify_mutex);
85 return err;
86} 78}
87EXPORT_SYMBOL(unregister_module_notifier); 79EXPORT_SYMBOL(unregister_module_notifier);
88 80
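
Switching to BLOCKING_NOTIFIER_HEAD() lets the notifier core own the locking that module.c previously did by hand with notify_mutex. A minimal sketch of what such a chain amounts to, a mutex-serialized list of callbacks, using pthreads rather than the kernel API:

#include <pthread.h>
#include <stdio.h>

struct notifier_block {
    int (*notifier_call)(unsigned long action, void *data);
    struct notifier_block *next;
};

static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;
static struct notifier_block *chain_head;

static void chain_register(struct notifier_block *nb)
{
    pthread_mutex_lock(&chain_lock);
    nb->next = chain_head;
    chain_head = nb;
    pthread_mutex_unlock(&chain_lock);
}

static void chain_call(unsigned long action, void *data)
{
    pthread_mutex_lock(&chain_lock);          /* callbacks may sleep */
    for (struct notifier_block *nb = chain_head; nb; nb = nb->next)
        nb->notifier_call(action, data);
    pthread_mutex_unlock(&chain_lock);
}

static int on_module_event(unsigned long action, void *data)
{
    printf("module event %lu for %s\n", action, (const char *)data);
    return 0;
}

int main(void)
{
    struct notifier_block nb = { .notifier_call = on_module_event };

    chain_register(&nb);
    chain_call(1, "example_mod");
    return 0;
}
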
@@ -126,15 +118,30 @@ extern const struct kernel_symbol __start___ksymtab[];
126extern const struct kernel_symbol __stop___ksymtab[]; 118extern const struct kernel_symbol __stop___ksymtab[];
127extern const struct kernel_symbol __start___ksymtab_gpl[]; 119extern const struct kernel_symbol __start___ksymtab_gpl[];
128extern const struct kernel_symbol __stop___ksymtab_gpl[]; 120extern const struct kernel_symbol __stop___ksymtab_gpl[];
121extern const struct kernel_symbol __start___ksymtab_gpl_future[];
122extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
129extern const unsigned long __start___kcrctab[]; 123extern const unsigned long __start___kcrctab[];
130extern const unsigned long __start___kcrctab_gpl[]; 124extern const unsigned long __start___kcrctab_gpl[];
125extern const unsigned long __start___kcrctab_gpl_future[];
131 126
132#ifndef CONFIG_MODVERSIONS 127#ifndef CONFIG_MODVERSIONS
133#define symversion(base, idx) NULL 128#define symversion(base, idx) NULL
134#else 129#else
135#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) 130#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
136#endif 131#endif
137 132
133/* lookup symbol in given range of kernel_symbols */
134static const struct kernel_symbol *lookup_symbol(const char *name,
135 const struct kernel_symbol *start,
136 const struct kernel_symbol *stop)
137{
138 const struct kernel_symbol *ks = start;
139 for (; ks < stop; ks++)
140 if (strcmp(ks->name, name) == 0)
141 return ks;
142 return NULL;
143}
144
138/* Find a symbol, return value, crc and module which owns it */ 145/* Find a symbol, return value, crc and module which owns it */
139static unsigned long __find_symbol(const char *name, 146static unsigned long __find_symbol(const char *name,
140 struct module **owner, 147 struct module **owner,
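
The new lookup_symbol() helper replaces four open-coded scans; the matching crc is then recovered by pointer subtraction, since the crc table is parallel to the symbol table. A self-contained sketch of both steps with simplified types:

#include <stdio.h>
#include <string.h>

struct kernel_symbol { unsigned long value; const char *name; };

static const struct kernel_symbol *
lookup_symbol(const char *name, const struct kernel_symbol *start,
              const struct kernel_symbol *stop)
{
    for (const struct kernel_symbol *ks = start; ks < stop; ks++)
        if (strcmp(ks->name, name) == 0)
            return ks;
    return NULL;
}

int main(void)
{
    static const struct kernel_symbol tab[] = {
        { 0x1000, "printk" }, { 0x2000, "kmalloc" },
    };
    static const unsigned long crcs[] = { 0xaaaa, 0xbbbb };

    const struct kernel_symbol *ks = lookup_symbol("kmalloc", tab, tab + 2);

    if (ks)    /* ks - tab indexes the parallel crc array */
        printf("value=0x%lx crc=0x%lx\n", ks->value, crcs[ks - tab]);
    return 0;
}
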
@@ -142,64 +149,81 @@ static unsigned long __find_symbol(const char *name,
142 int gplok) 149 int gplok)
143{ 150{
144 struct module *mod; 151 struct module *mod;
145 unsigned int i; 152 const struct kernel_symbol *ks;
146 153
147 /* Core kernel first. */ 154 /* Core kernel first. */
148 *owner = NULL; 155 *owner = NULL;
149 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { 156 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
150 if (strcmp(__start___ksymtab[i].name, name) == 0) { 157 if (ks) {
151 *crc = symversion(__start___kcrctab, i); 158 *crc = symversion(__start___kcrctab, (ks - __start___ksymtab));
152 return __start___ksymtab[i].value; 159 return ks->value;
153 }
154 } 160 }
155 if (gplok) { 161 if (gplok) {
156 for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) 162 ks = lookup_symbol(name, __start___ksymtab_gpl,
157 if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { 163 __stop___ksymtab_gpl);
158 *crc = symversion(__start___kcrctab_gpl, i); 164 if (ks) {
159 return __start___ksymtab_gpl[i].value; 165 *crc = symversion(__start___kcrctab_gpl,
160 } 166 (ks - __start___ksymtab_gpl));
167 return ks->value;
168 }
169 }
170 ks = lookup_symbol(name, __start___ksymtab_gpl_future,
171 __stop___ksymtab_gpl_future);
172 if (ks) {
173 if (!gplok) {
174 printk(KERN_WARNING "Symbol %s is being used "
175 "by a non-GPL module, which will not "
176 "be allowed in the future\n", name);
177 printk(KERN_WARNING "Please see the file "
178 "Documentation/feature-removal-schedule.txt "
179 "in the kernel source tree for more "
180 "details.\n");
181 }
182 *crc = symversion(__start___kcrctab_gpl_future,
183 (ks - __start___ksymtab_gpl_future));
184 return ks->value;
161 } 185 }
162 186
163 /* Now try modules. */ 187 /* Now try modules. */
164 list_for_each_entry(mod, &modules, list) { 188 list_for_each_entry(mod, &modules, list) {
165 *owner = mod; 189 *owner = mod;
166 for (i = 0; i < mod->num_syms; i++) 190 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
167 if (strcmp(mod->syms[i].name, name) == 0) { 191 if (ks) {
168 *crc = symversion(mod->crcs, i); 192 *crc = symversion(mod->crcs, (ks - mod->syms));
169 return mod->syms[i].value; 193 return ks->value;
170 } 194 }
171 195
172 if (gplok) { 196 if (gplok) {
173 for (i = 0; i < mod->num_gpl_syms; i++) { 197 ks = lookup_symbol(name, mod->gpl_syms,
174 if (strcmp(mod->gpl_syms[i].name, name) == 0) { 198 mod->gpl_syms + mod->num_gpl_syms);
175 *crc = symversion(mod->gpl_crcs, i); 199 if (ks) {
176 return mod->gpl_syms[i].value; 200 *crc = symversion(mod->gpl_crcs,
177 } 201 (ks - mod->gpl_syms));
202 return ks->value;
178 } 203 }
179 } 204 }
205 ks = lookup_symbol(name, mod->gpl_future_syms,
206 (mod->gpl_future_syms +
207 mod->num_gpl_future_syms));
208 if (ks) {
209 if (!gplok) {
210 printk(KERN_WARNING "Symbol %s is being used "
211 "by a non-GPL module, which will not "
212 "be allowed in the future\n", name);
213 printk(KERN_WARNING "Please see the file "
214 "Documentation/feature-removal-schedule.txt "
215 "in the kernel source tree for more "
216 "details.\n");
217 }
218 *crc = symversion(mod->gpl_future_crcs,
219 (ks - mod->gpl_future_syms));
220 return ks->value;
221 }
180 } 222 }
181 DEBUGP("Failed to find symbol %s\n", name); 223 DEBUGP("Failed to find symbol %s\n", name);
182 return 0; 224 return 0;
183} 225}
184 226
185/* Find a symbol in this elf symbol table */
186static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
187 unsigned int symindex,
188 const char *strtab,
189 const char *name)
190{
191 unsigned int i;
192 Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
193
194 /* Search (defined) internal symbols first. */
195 for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
196 if (sym[i].st_shndx != SHN_UNDEF
197 && strcmp(name, strtab + sym[i].st_name) == 0)
198 return sym[i].st_value;
199 }
200 return 0;
201}
202
203/* Search for module by name: must hold module_mutex. */ 227/* Search for module by name: must hold module_mutex. */
204static struct module *find_module(const char *name) 228static struct module *find_module(const char *name)
205{ 229{
@@ -379,7 +403,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
379} 403}
380#endif /* CONFIG_SMP */ 404#endif /* CONFIG_SMP */
381 405
382#ifdef CONFIG_MODULE_UNLOAD
383#define MODINFO_ATTR(field) \ 406#define MODINFO_ATTR(field) \
384static void setup_modinfo_##field(struct module *mod, const char *s) \ 407static void setup_modinfo_##field(struct module *mod, const char *s) \
385{ \ 408{ \
@@ -411,12 +434,7 @@ static struct module_attribute modinfo_##field = { \
411MODINFO_ATTR(version); 434MODINFO_ATTR(version);
412MODINFO_ATTR(srcversion); 435MODINFO_ATTR(srcversion);
413 436
414static struct module_attribute *modinfo_attrs[] = { 437#ifdef CONFIG_MODULE_UNLOAD
415 &modinfo_version,
416 &modinfo_srcversion,
417 NULL,
418};
419
420/* Init the unload section of the module. */ 438/* Init the unload section of the module. */
421static void module_unload_init(struct module *mod) 439static void module_unload_init(struct module *mod)
422{ 440{
@@ -557,7 +575,7 @@ static void free_module(struct module *mod);
557static void wait_for_zero_refcount(struct module *mod) 575static void wait_for_zero_refcount(struct module *mod)
558{ 576{
559 /* Since we might sleep for some time, drop the semaphore first */ 577 /* Since we might sleep for some time, drop the semaphore first */
560 up(&module_mutex); 578 mutex_unlock(&module_mutex);
561 for (;;) { 579 for (;;) {
562 DEBUGP("Looking at refcount...\n"); 580 DEBUGP("Looking at refcount...\n");
563 set_current_state(TASK_UNINTERRUPTIBLE); 581 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -566,7 +584,7 @@ static void wait_for_zero_refcount(struct module *mod)
566 schedule(); 584 schedule();
567 } 585 }
568 current->state = TASK_RUNNING; 586 current->state = TASK_RUNNING;
569 down(&module_mutex); 587 mutex_lock(&module_mutex);
570} 588}
571 589
572asmlinkage long 590asmlinkage long
@@ -583,7 +601,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
583 return -EFAULT; 601 return -EFAULT;
584 name[MODULE_NAME_LEN-1] = '\0'; 602 name[MODULE_NAME_LEN-1] = '\0';
585 603
586 if (down_interruptible(&module_mutex) != 0) 604 if (mutex_lock_interruptible(&module_mutex) != 0)
587 return -EINTR; 605 return -EINTR;
588 606
589 mod = find_module(name); 607 mod = find_module(name);
@@ -632,14 +650,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
632 650
633 /* Final destruction now no one is using it. */ 651
634 if (mod->exit != NULL) { 652 if (mod->exit != NULL) {
635 up(&module_mutex); 653 mutex_unlock(&module_mutex);
636 mod->exit(); 654 mod->exit();
637 down(&module_mutex); 655 mutex_lock(&module_mutex);
638 } 656 }
639 free_module(mod); 657 free_module(mod);
640 658
641 out: 659 out:
642 up(&module_mutex); 660 mutex_unlock(&module_mutex);
643 return ret; 661 return ret;
644} 662}
645 663
@@ -687,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put);
687 705
688void symbol_put_addr(void *addr) 706void symbol_put_addr(void *addr)
689{ 707{
690 unsigned long flags; 708 struct module *modaddr;
691 709
692 spin_lock_irqsave(&modlist_lock, flags); 710 if (core_kernel_text((unsigned long)addr))
693 if (!kernel_text_address((unsigned long)addr)) 711 return;
694 BUG();
695 712
696 module_put(module_text_address((unsigned long)addr)); 713 if (!(modaddr = module_text_address((unsigned long)addr)))
697 spin_unlock_irqrestore(&modlist_lock, flags); 714 BUG();
715 module_put(modaddr);
698} 716}
699EXPORT_SYMBOL_GPL(symbol_put_addr); 717EXPORT_SYMBOL_GPL(symbol_put_addr);
700 718
@@ -731,138 +749,14 @@ static inline void module_unload_init(struct module *mod)
731} 749}
732#endif /* CONFIG_MODULE_UNLOAD */ 750#endif /* CONFIG_MODULE_UNLOAD */
733 751
734#ifdef CONFIG_OBSOLETE_MODPARM 752static struct module_attribute *modinfo_attrs[] = {
735/* Bounds checking done below */ 753 &modinfo_version,
736static int obsparm_copy_string(const char *val, struct kernel_param *kp) 754 &modinfo_srcversion,
737{ 755#ifdef CONFIG_MODULE_UNLOAD
738 strcpy(kp->arg, val); 756 &refcnt,
739 return 0; 757#endif
740} 758 NULL,
741 759};
742static int set_obsolete(const char *val, struct kernel_param *kp)
743{
744 unsigned int min, max;
745 unsigned int size, maxsize;
746 int dummy;
747 char *endp;
748 const char *p;
749 struct obsolete_modparm *obsparm = kp->arg;
750
751 if (!val) {
752 printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
753 return -EINVAL;
754 }
755
756 /* type is: [min[-max]]{b,h,i,l,s} */
757 p = obsparm->type;
758 min = simple_strtol(p, &endp, 10);
759 if (endp == obsparm->type)
760 min = max = 1;
761 else if (*endp == '-') {
762 p = endp+1;
763 max = simple_strtol(p, &endp, 10);
764 } else
765 max = min;
766 switch (*endp) {
767 case 'b':
768 return param_array(kp->name, val, min, max, obsparm->addr,
769 1, param_set_byte, &dummy);
770 case 'h':
771 return param_array(kp->name, val, min, max, obsparm->addr,
772 sizeof(short), param_set_short, &dummy);
773 case 'i':
774 return param_array(kp->name, val, min, max, obsparm->addr,
775 sizeof(int), param_set_int, &dummy);
776 case 'l':
777 return param_array(kp->name, val, min, max, obsparm->addr,
778 sizeof(long), param_set_long, &dummy);
779 case 's':
780 return param_array(kp->name, val, min, max, obsparm->addr,
781 sizeof(char *), param_set_charp, &dummy);
782
783 case 'c':
784 /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
785 and the decl is "char xxx[5][50];" */
786 p = endp+1;
787 maxsize = simple_strtol(p, &endp, 10);
788 /* We check lengths here (yes, this is a hack). */
789 p = val;
790 while (p[size = strcspn(p, ",")]) {
791 if (size >= maxsize)
792 goto oversize;
793 p += size+1;
794 }
795 if (size >= maxsize)
796 goto oversize;
797 return param_array(kp->name, val, min, max, obsparm->addr,
798 maxsize, obsparm_copy_string, &dummy);
799 }
800 printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
801 return -EINVAL;
802 oversize:
803 printk(KERN_ERR
804 "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
805 return -EINVAL;
806}
807
808static int obsolete_params(const char *name,
809 char *args,
810 struct obsolete_modparm obsparm[],
811 unsigned int num,
812 Elf_Shdr *sechdrs,
813 unsigned int symindex,
814 const char *strtab)
815{
816 struct kernel_param *kp;
817 unsigned int i;
818 int ret;
819
820 kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
821 if (!kp)
822 return -ENOMEM;
823
824 for (i = 0; i < num; i++) {
825 char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
826
827 snprintf(sym_name, sizeof(sym_name), "%s%s",
828 MODULE_SYMBOL_PREFIX, obsparm[i].name);
829
830 kp[i].name = obsparm[i].name;
831 kp[i].perm = 000;
832 kp[i].set = set_obsolete;
833 kp[i].get = NULL;
834 obsparm[i].addr
835 = (void *)find_local_symbol(sechdrs, symindex, strtab,
836 sym_name);
837 if (!obsparm[i].addr) {
838 printk("%s: falsely claims to have parameter %s\n",
839 name, obsparm[i].name);
840 ret = -EINVAL;
841 goto out;
842 }
843 kp[i].arg = &obsparm[i];
844 }
845
846 ret = parse_args(name, args, kp, num, NULL);
847 out:
848 kfree(kp);
849 return ret;
850}
851#else
852static int obsolete_params(const char *name,
853 char *args,
854 struct obsolete_modparm obsparm[],
855 unsigned int num,
856 Elf_Shdr *sechdrs,
857 unsigned int symindex,
858 const char *strtab)
859{
860 if (num != 0)
861 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
862 name);
863 return 0;
864}
865#endif /* CONFIG_OBSOLETE_MODPARM */
866 760
867static const char vermagic[] = VERMAGIC_STRING; 761static const char vermagic[] = VERMAGIC_STRING;
868 762
@@ -1056,37 +950,28 @@ static inline void remove_sect_attrs(struct module *mod)
1056} 950}
1057#endif /* CONFIG_KALLSYMS */ 951#endif /* CONFIG_KALLSYMS */
1058 952
1059
1060#ifdef CONFIG_MODULE_UNLOAD
1061static inline int module_add_refcnt_attr(struct module *mod)
1062{
1063 return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
1064}
1065static void module_remove_refcnt_attr(struct module *mod)
1066{
1067 return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
1068}
1069#else
1070static inline int module_add_refcnt_attr(struct module *mod)
1071{
1072 return 0;
1073}
1074static void module_remove_refcnt_attr(struct module *mod)
1075{
1076}
1077#endif
1078
1079#ifdef CONFIG_MODULE_UNLOAD
1080static int module_add_modinfo_attrs(struct module *mod) 953static int module_add_modinfo_attrs(struct module *mod)
1081{ 954{
1082 struct module_attribute *attr; 955 struct module_attribute *attr;
956 struct module_attribute *temp_attr;
1083 int error = 0; 957 int error = 0;
1084 int i; 958 int i;
1085 959
960 mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
961 (ARRAY_SIZE(modinfo_attrs) + 1)),
962 GFP_KERNEL);
963 if (!mod->modinfo_attrs)
964 return -ENOMEM;
965
966 temp_attr = mod->modinfo_attrs;
1086 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { 967 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
1087 if (!attr->test || 968 if (!attr->test ||
1088 (attr->test && attr->test(mod))) 969 (attr->test && attr->test(mod))) {
1089 error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); 970 memcpy(temp_attr, attr, sizeof(*temp_attr));
971 temp_attr->attr.owner = mod;
972 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
973 ++temp_attr;
974 }
1090 } 975 }
1091 return error; 976 return error;
1092} 977}
@@ -1096,12 +981,16 @@ static void module_remove_modinfo_attrs(struct module *mod)
1096 struct module_attribute *attr; 981 struct module_attribute *attr;
1097 int i; 982 int i;
1098 983
1099 for (i = 0; (attr = modinfo_attrs[i]); i++) { 984 for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
985 /* pick a field to test for end of list */
986 if (!attr->attr.name)
987 break;
1100 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); 988 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
1101 attr->free(mod); 989 if (attr->free)
990 attr->free(mod);
1102 } 991 }
992 kfree(mod->modinfo_attrs);
1103} 993}
1104#endif
1105 994
1106static int mod_sysfs_setup(struct module *mod, 995static int mod_sysfs_setup(struct module *mod,
1107 struct kernel_param *kparam, 996 struct kernel_param *kparam,
@@ -1119,19 +1008,13 @@ static int mod_sysfs_setup(struct module *mod,
1119 if (err) 1008 if (err)
1120 goto out; 1009 goto out;
1121 1010
1122 err = module_add_refcnt_attr(mod);
1123 if (err)
1124 goto out_unreg;
1125
1126 err = module_param_sysfs_setup(mod, kparam, num_params); 1011 err = module_param_sysfs_setup(mod, kparam, num_params);
1127 if (err) 1012 if (err)
1128 goto out_unreg; 1013 goto out_unreg;
1129 1014
1130#ifdef CONFIG_MODULE_UNLOAD
1131 err = module_add_modinfo_attrs(mod); 1015 err = module_add_modinfo_attrs(mod);
1132 if (err) 1016 if (err)
1133 goto out_unreg; 1017 goto out_unreg;
1134#endif
1135 1018
1136 return 0; 1019 return 0;
1137 1020
@@ -1143,10 +1026,7 @@ out:
1143 1026
1144static void mod_kobject_remove(struct module *mod) 1027static void mod_kobject_remove(struct module *mod)
1145{ 1028{
1146#ifdef CONFIG_MODULE_UNLOAD
1147 module_remove_modinfo_attrs(mod); 1029 module_remove_modinfo_attrs(mod);
1148#endif
1149 module_remove_refcnt_attr(mod);
1150 module_param_sysfs_remove(mod); 1030 module_param_sysfs_remove(mod);
1151 1031
1152 kobject_unregister(&mod->mkobj.kobj); 1032 kobject_unregister(&mod->mkobj.kobj);
@@ -1374,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
1374 || strcmp(license, "GPL v2") == 0 1254 || strcmp(license, "GPL v2") == 0
1375 || strcmp(license, "GPL and additional rights") == 0 1255 || strcmp(license, "GPL and additional rights") == 0
1376 || strcmp(license, "Dual BSD/GPL") == 0 1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1377 || strcmp(license, "Dual MPL/GPL") == 0); 1258 || strcmp(license, "Dual MPL/GPL") == 0);
1378} 1259}
1379 1260
@@ -1424,7 +1305,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
1424 return NULL; 1305 return NULL;
1425} 1306}
1426 1307
1427#ifdef CONFIG_MODULE_UNLOAD
1428static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, 1308static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1429 unsigned int infoindex) 1309 unsigned int infoindex)
1430{ 1310{
@@ -1439,23 +1319,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1439 attr->attr.name)); 1319 attr->attr.name));
1440 } 1320 }
1441} 1321}
1442#endif
1443 1322
1444#ifdef CONFIG_KALLSYMS 1323#ifdef CONFIG_KALLSYMS
1445int is_exported(const char *name, const struct module *mod) 1324int is_exported(const char *name, const struct module *mod)
1446{ 1325{
1447 unsigned int i; 1326 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1448 1327 return 1;
1449 if (!mod) { 1328 else
1450 for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) 1329 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1451 if (strcmp(__start___ksymtab[i].name, name) == 0)
1452 return 1;
1453 return 0;
1454 }
1455 for (i = 0; i < mod->num_syms; i++)
1456 if (strcmp(mod->syms[i].name, name) == 0)
1457 return 1; 1330 return 1;
1458 return 0; 1331 else
1332 return 0;
1459} 1333}
1460 1334
1461/* As per nm */ 1335/* As per nm */
@@ -1537,8 +1411,8 @@ static struct module *load_module(void __user *umod,
1537 char *secstrings, *args, *modmagic, *strtab = NULL; 1411 char *secstrings, *args, *modmagic, *strtab = NULL;
1538 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1539 exportindex, modindex, obsparmindex, infoindex, gplindex, 1413 exportindex, modindex, obsparmindex, infoindex, gplindex,
1540 crcindex, gplcrcindex, versindex, pcpuindex; 1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
1541 long arglen; 1415 gplfuturecrcindex;
1542 struct module *mod; 1416 struct module *mod;
1543 long err = 0; 1417 long err = 0;
1544 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1618,8 +1492,10 @@ static struct module *load_module(void __user *umod,
1618 /* Optional sections */ 1492 /* Optional sections */
1619 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1493 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1620 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1494 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1495 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1621 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1496 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1622 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1497 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1498 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1623 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1499 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1624 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1500 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1625 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1501 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1655,23 +1531,11 @@ static struct module *load_module(void __user *umod,
1655 } 1531 }
1656 1532
1657 /* Now copy in args */ 1533 /* Now copy in args */
1658 arglen = strlen_user(uargs); 1534 args = strndup_user(uargs, ~0UL >> 1);
1659 if (!arglen) { 1535 if (IS_ERR(args)) {
1660 err = -EFAULT; 1536 err = PTR_ERR(args);
1661 goto free_hdr; 1537 goto free_hdr;
1662 } 1538 }
1663 args = kmalloc(arglen, GFP_KERNEL);
1664 if (!args) {
1665 err = -ENOMEM;
1666 goto free_hdr;
1667 }
1668 if (copy_from_user(args, uargs, arglen) != 0) {
1669 err = -EFAULT;
1670 goto free_mod;
1671 }
1672
1673 /* Userspace could have altered the string after the strlen_user() */
1674 args[arglen - 1] = '\0';
1675 1539
1676 if (find_module(mod->name)) { 1540 if (find_module(mod->name)) {
1677 err = -EEXIST; 1541 err = -EEXIST;
@@ -1755,10 +1619,8 @@ static struct module *load_module(void __user *umod,
1755 if (strcmp(mod->name, "driverloader") == 0) 1619 if (strcmp(mod->name, "driverloader") == 0)
1756 add_taint(TAINT_PROPRIETARY_MODULE); 1620 add_taint(TAINT_PROPRIETARY_MODULE);
1757 1621
1758#ifdef CONFIG_MODULE_UNLOAD
1759 /* Set up MODINFO_ATTR fields */ 1622 /* Set up MODINFO_ATTR fields */
1760 setup_modinfo(mod, sechdrs, infoindex); 1623 setup_modinfo(mod, sechdrs, infoindex);
1761#endif
1762 1624
1763 /* Fix up syms, so that st_value is a pointer to location. */ 1625 /* Fix up syms, so that st_value is a pointer to location. */
1764 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1626 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
@@ -1775,10 +1637,16 @@ static struct module *load_module(void __user *umod,
1775 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 1637 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
1776 if (gplcrcindex) 1638 if (gplcrcindex)
1777 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1639 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1640 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
1641 sizeof(*mod->gpl_future_syms);
1642 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
1643 if (gplfuturecrcindex)
1644 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
1778 1645
1779#ifdef CONFIG_MODVERSIONS 1646#ifdef CONFIG_MODVERSIONS
1780 if ((mod->num_syms && !crcindex) || 1647 if ((mod->num_syms && !crcindex) ||
1781 (mod->num_gpl_syms && !gplcrcindex)) { 1648 (mod->num_gpl_syms && !gplcrcindex) ||
1649 (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
1782 printk(KERN_WARNING "%s: No versions for exported symbols." 1650 printk(KERN_WARNING "%s: No versions for exported symbols."
1783 " Tainting kernel.\n", mod->name); 1651 " Tainting kernel.\n", mod->name);
1784 add_taint(TAINT_FORCED_MODULE); 1652 add_taint(TAINT_FORCED_MODULE);
@@ -1847,27 +1715,17 @@ static struct module *load_module(void __user *umod,
1847 set_fs(old_fs); 1715 set_fs(old_fs);
1848 1716
1849 mod->args = args; 1717 mod->args = args;
1850 if (obsparmindex) { 1718 if (obsparmindex)
1851 err = obsolete_params(mod->name, mod->args, 1719 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
1852 (struct obsolete_modparm *) 1720 mod->name);
1853 sechdrs[obsparmindex].sh_addr, 1721
1854 sechdrs[obsparmindex].sh_size 1722 /* Size of section 0 is 0, so this works well if no params */
1855 / sizeof(struct obsolete_modparm), 1723 err = parse_args(mod->name, mod->args,
1856 sechdrs, symindex, 1724 (struct kernel_param *)
1857 (char *)sechdrs[strindex].sh_addr); 1725 sechdrs[setupindex].sh_addr,
1858 if (setupindex) 1726 sechdrs[setupindex].sh_size
1859 printk(KERN_WARNING "%s: Ignoring new-style " 1727 / sizeof(struct kernel_param),
1860 "parameters in presence of obsolete ones\n", 1728 NULL);
1861 mod->name);
1862 } else {
1863 /* Size of section 0 is 0, so this works well if no params */
1864 err = parse_args(mod->name, mod->args,
1865 (struct kernel_param *)
1866 sechdrs[setupindex].sh_addr,
1867 sechdrs[setupindex].sh_size
1868 / sizeof(struct kernel_param),
1869 NULL);
1870 }
1871 if (err < 0) 1729 if (err < 0)
1872 goto arch_cleanup; 1730 goto arch_cleanup;
1873 1731
@@ -1933,13 +1791,13 @@ sys_init_module(void __user *umod,
1933 return -EPERM; 1791 return -EPERM;
1934 1792
1935 /* Only one module load at a time, please */ 1793 /* Only one module load at a time, please */
1936 if (down_interruptible(&module_mutex) != 0) 1794 if (mutex_lock_interruptible(&module_mutex) != 0)
1937 return -EINTR; 1795 return -EINTR;
1938 1796
1939 /* Do all the hard work */ 1797 /* Do all the hard work */
1940 mod = load_module(umod, len, uargs); 1798 mod = load_module(umod, len, uargs);
1941 if (IS_ERR(mod)) { 1799 if (IS_ERR(mod)) {
1942 up(&module_mutex); 1800 mutex_unlock(&module_mutex);
1943 return PTR_ERR(mod); 1801 return PTR_ERR(mod);
1944 } 1802 }
1945 1803
@@ -1948,11 +1806,10 @@ sys_init_module(void __user *umod,
1948 stop_machine_run(__link_module, mod, NR_CPUS); 1806 stop_machine_run(__link_module, mod, NR_CPUS);
1949 1807
1950 /* Drop lock so they can recurse */ 1808 /* Drop lock so they can recurse */
1951 up(&module_mutex); 1809 mutex_unlock(&module_mutex);
1952 1810
1953 down(&notify_mutex); 1811 blocking_notifier_call_chain(&module_notify_list,
1954 notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); 1812 MODULE_STATE_COMING, mod);
1955 up(&notify_mutex);
1956 1813
1957 /* Start the module */ 1814 /* Start the module */
1958 if (mod->init != NULL) 1815 if (mod->init != NULL)
@@ -1967,15 +1824,15 @@ sys_init_module(void __user *umod,
1967 mod->name); 1824 mod->name);
1968 else { 1825 else {
1969 module_put(mod); 1826 module_put(mod);
1970 down(&module_mutex); 1827 mutex_lock(&module_mutex);
1971 free_module(mod); 1828 free_module(mod);
1972 up(&module_mutex); 1829 mutex_unlock(&module_mutex);
1973 } 1830 }
1974 return ret; 1831 return ret;
1975 } 1832 }
1976 1833
1977 /* Now it's a first class citizen! */ 1834 /* Now it's a first class citizen! */
1978 down(&module_mutex); 1835 mutex_lock(&module_mutex);
1979 mod->state = MODULE_STATE_LIVE; 1836 mod->state = MODULE_STATE_LIVE;
1980 /* Drop initial reference. */ 1837 /* Drop initial reference. */
1981 module_put(mod); 1838 module_put(mod);
@@ -1983,7 +1840,7 @@ sys_init_module(void __user *umod,
1983 mod->module_init = NULL; 1840 mod->module_init = NULL;
1984 mod->init_size = 0; 1841 mod->init_size = 0;
1985 mod->init_text_size = 0; 1842 mod->init_text_size = 0;
1986 up(&module_mutex); 1843 mutex_unlock(&module_mutex);
1987 1844
1988 return 0; 1845 return 0;
1989} 1846}
@@ -2073,7 +1930,7 @@ struct module *module_get_kallsym(unsigned int symnum,
2073{ 1930{
2074 struct module *mod; 1931 struct module *mod;
2075 1932
2076 down(&module_mutex); 1933 mutex_lock(&module_mutex);
2077 list_for_each_entry(mod, &modules, list) { 1934 list_for_each_entry(mod, &modules, list) {
2078 if (symnum < mod->num_symtab) { 1935 if (symnum < mod->num_symtab) {
2079 *value = mod->symtab[symnum].st_value; 1936 *value = mod->symtab[symnum].st_value;
@@ -2081,12 +1938,12 @@ struct module *module_get_kallsym(unsigned int symnum,
2081 strncpy(namebuf, 1938 strncpy(namebuf,
2082 mod->strtab + mod->symtab[symnum].st_name, 1939 mod->strtab + mod->symtab[symnum].st_name,
2083 127); 1940 127);
2084 up(&module_mutex); 1941 mutex_unlock(&module_mutex);
2085 return mod; 1942 return mod;
2086 } 1943 }
2087 symnum -= mod->num_symtab; 1944 symnum -= mod->num_symtab;
2088 } 1945 }
2089 up(&module_mutex); 1946 mutex_unlock(&module_mutex);
2090 return NULL; 1947 return NULL;
2091} 1948}
2092 1949
@@ -2129,7 +1986,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
2129 struct list_head *i; 1986 struct list_head *i;
2130 loff_t n = 0; 1987 loff_t n = 0;
2131 1988
2132 down(&module_mutex); 1989 mutex_lock(&module_mutex);
2133 list_for_each(i, &modules) { 1990 list_for_each(i, &modules) {
2134 if (n++ == *pos) 1991 if (n++ == *pos)
2135 break; 1992 break;
@@ -2150,7 +2007,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2150 2007
2151static void m_stop(struct seq_file *m, void *p) 2008static void m_stop(struct seq_file *m, void *p)
2152{ 2009{
2153 up(&module_mutex); 2010 mutex_unlock(&module_mutex);
2154} 2011}
2155 2012
2156static int m_show(struct seq_file *m, void *p) 2013static int m_show(struct seq_file *m, void *p)
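The module.c hunks above make three related conversions: the args copy is collapsed into strndup_user(), the module_mutex semaphore calls (down/up) become mutex_lock/mutex_unlock (with mutex_lock_interruptible in sys_init_module), and the notify_mutex plus notifier_call_chain() pair becomes a single blocking_notifier_call_chain(). A minimal sketch of the same pattern outside module.c, using only the generic <linux/mutex.h>, <linux/err.h> and <linux/notifier.h> interfaces; the demo_* names and the PAGE_SIZE cap are illustrative, not taken from the patch.

```c
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/string.h>
#include <linux/err.h>
#include <linux/slab.h>

static DEFINE_MUTEX(demo_mutex);                 /* replaces DECLARE_MUTEX + down()/up() */
static BLOCKING_NOTIFIER_HEAD(demo_notify_list); /* replaces notify_mutex + notifier_call_chain() */

/* Hypothetical helper mirroring the converted sys_init_module() flow. */
static int demo_load(const char __user *uargs)
{
	char *args;
	int err;

	/* strndup_user() bundles strlen_user() + kmalloc() + copy_from_user()
	 * and returns an ERR_PTR() on failure (module.c caps at ~0UL >> 1). */
	args = strndup_user(uargs, PAGE_SIZE);
	if (IS_ERR(args))
		return PTR_ERR(args);

	/* Sleeping lock; bail out with -EINTR if a signal arrives while waiting. */
	if (mutex_lock_interruptible(&demo_mutex) != 0) {
		kfree(args);
		return -EINTR;
	}

	err = 0;	/* ... work that needs the lock ... */
	mutex_unlock(&demo_mutex);

	/* One call now provides the locking the old notify_mutex supplied. */
	blocking_notifier_call_chain(&demo_notify_list, 0, args);

	kfree(args);
	return err;
}
```

The nonzero return of mutex_lock_interruptible() maps to -EINTR here exactly as in the converted sys_init_module() hunk above.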
diff --git a/kernel/panic.c b/kernel/panic.c
index c5c4ab2558..cc2a4c9c36 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,13 +20,15 @@
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h> 21#include <linux/kexec.h>
22 22
23int panic_timeout;
24int panic_on_oops; 23int panic_on_oops;
25int tainted; 24int tainted;
25static int pause_on_oops;
26static int pause_on_oops_flag;
27static DEFINE_SPINLOCK(pause_on_oops_lock);
26 28
27EXPORT_SYMBOL(panic_timeout); 29int panic_timeout;
28 30
29struct notifier_block *panic_notifier_list; 31ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
30 32
31EXPORT_SYMBOL(panic_notifier_list); 33EXPORT_SYMBOL(panic_notifier_list);
32 34
@@ -94,7 +96,7 @@ NORET_TYPE void panic(const char * fmt, ...)
94 smp_send_stop(); 96 smp_send_stop();
95#endif 97#endif
96 98
97 notifier_call_chain(&panic_notifier_list, 0, buf); 99 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
98 100
99 if (!panic_blink) 101 if (!panic_blink)
100 panic_blink = no_blink; 102 panic_blink = no_blink;
@@ -130,6 +132,7 @@ NORET_TYPE void panic(const char * fmt, ...)
130#endif 132#endif
131 local_irq_enable(); 133 local_irq_enable();
132 for (i = 0;;) { 134 for (i = 0;;) {
135 touch_softlockup_watchdog();
133 i += panic_blink(i); 136 i += panic_blink(i);
134 mdelay(1); 137 mdelay(1);
135 i++; 138 i++;
@@ -173,3 +176,95 @@ void add_taint(unsigned flag)
173 tainted |= flag; 176 tainted |= flag;
174} 177}
175EXPORT_SYMBOL(add_taint); 178EXPORT_SYMBOL(add_taint);
179
180static int __init pause_on_oops_setup(char *str)
181{
182 pause_on_oops = simple_strtoul(str, NULL, 0);
183 return 1;
184}
185__setup("pause_on_oops=", pause_on_oops_setup);
186
187static void spin_msec(int msecs)
188{
189 int i;
190
191 for (i = 0; i < msecs; i++) {
192 touch_nmi_watchdog();
193 mdelay(1);
194 }
195}
196
197/*
198 * It just happens that oops_enter() and oops_exit() are identically
199 * implemented...
200 */
201static void do_oops_enter_exit(void)
202{
203 unsigned long flags;
204 static int spin_counter;
205
206 if (!pause_on_oops)
207 return;
208
209 spin_lock_irqsave(&pause_on_oops_lock, flags);
210 if (pause_on_oops_flag == 0) {
211 /* This CPU may now print the oops message */
212 pause_on_oops_flag = 1;
213 } else {
214 /* We need to stall this CPU */
215 if (!spin_counter) {
216 /* This CPU gets to do the counting */
217 spin_counter = pause_on_oops;
218 do {
219 spin_unlock(&pause_on_oops_lock);
220 spin_msec(MSEC_PER_SEC);
221 spin_lock(&pause_on_oops_lock);
222 } while (--spin_counter);
223 pause_on_oops_flag = 0;
224 } else {
225 /* This CPU waits for a different one */
226 while (spin_counter) {
227 spin_unlock(&pause_on_oops_lock);
228 spin_msec(1);
229 spin_lock(&pause_on_oops_lock);
230 }
231 }
232 }
233 spin_unlock_irqrestore(&pause_on_oops_lock, flags);
234}
235
236/*
237 * Return true if the calling CPU is allowed to print oops-related info. This
238 * is a bit racy..
239 */
240int oops_may_print(void)
241{
242 return pause_on_oops_flag == 0;
243}
244
245/*
246 * Called when the architecture enters its oops handler, before it prints
247 * anything. If this is the first CPU to oops, and it's oopsing the first time
248 * then let it proceed.
249 *
250 * This is all enabled by the pause_on_oops kernel boot option. We do all this
251 * to ensure that oopses don't scroll off the screen. It has the side-effect
252 * of preventing later-oopsing CPUs from mucking up the display, too.
253 *
254 * It turns out that the CPU which is allowed to print ends up pausing for the
255 * right duration, whereas all the other CPUs pause for twice as long: once in
256 * oops_enter(), once in oops_exit().
257 */
258void oops_enter(void)
259{
260 do_oops_enter_exit();
261}
262
263/*
264 * Called when the architecture exits its oops handler, after printing
265 * everything.
266 */
267void oops_exit(void)
268{
269 do_oops_enter_exit();
270}
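The panic.c hunk converts panic_notifier_list from a bare notifier_block pointer into an ATOMIC_NOTIFIER_HEAD, so panic() now calls atomic_notifier_call_chain() and users register through atomic_notifier_chain_register(); it also adds the pause_on_oops= boot option via __setup(). A hedged sketch of how a hook would attach to the chain after this change and parse an option in the same style; mydrv and its messages are invented, and only the symbols shown in the hunk (panic_notifier_list is exported there) are relied on.

```c
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

extern struct atomic_notifier_head panic_notifier_list;	/* exported by kernel/panic.c above */

static int mydrv_panic_event(struct notifier_block *nb,
			     unsigned long event, void *buf)
{
	/* buf is the formatted panic string passed by panic() */
	printk(KERN_EMERG "mydrv: panic: %s\n", (char *)buf);
	return NOTIFY_DONE;
}

static struct notifier_block mydrv_panic_nb = {
	.notifier_call = mydrv_panic_event,
};

/* Boot-option parsing in the same style as pause_on_oops_setup();
 * __setup() only takes effect for built-in code, like panic.c itself. */
static int mydrv_verbose;
static int __init mydrv_verbose_setup(char *str)
{
	mydrv_verbose = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("mydrv_verbose=", mydrv_verbose_setup);

static int __init mydrv_init(void)
{
	/* Atomic chains may be registered on and called from atomic context. */
	atomic_notifier_chain_register(&panic_notifier_list, &mydrv_panic_nb);
	return 0;
}

static void __exit mydrv_exit(void)
{
	atomic_notifier_chain_unregister(&panic_notifier_list, &mydrv_panic_nb);
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");
```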
diff --git a/kernel/params.c b/kernel/params.c
index c76ad25e6a..af43ecdc8d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
31#define DEBUGP(fmt, a...) 31#define DEBUGP(fmt, a...)
32#endif 32#endif
33 33
34static inline int dash2underscore(char c) 34static inline char dash2underscore(char c)
35{ 35{
36 if (c == '-') 36 if (c == '-')
37 return '_'; 37 return '_';
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
265} 265}
266 266
267/* We cheat here and temporarily mangle the string. */ 267/* We cheat here and temporarily mangle the string. */
268int param_array(const char *name, 268static int param_array(const char *name,
269 const char *val, 269 const char *val,
270 unsigned int min, unsigned int max, 270 unsigned int min, unsigned int max,
271 void *elem, int elemsize, 271 void *elem, int elemsize,
272 int (*set)(const char *, struct kernel_param *kp), 272 int (*set)(const char *, struct kernel_param *kp),
273 int *num) 273 int *num)
274{ 274{
275 int ret; 275 int ret;
276 struct kernel_param kp; 276 struct kernel_param kp;
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj,
638 if (!attribute->show) 638 if (!attribute->show)
639 return -EIO; 639 return -EIO;
640 640
641 if (!try_module_get(mk->mod))
642 return -ENODEV;
643
644 ret = attribute->show(attribute, mk->mod, buf); 641 ret = attribute->show(attribute, mk->mod, buf);
645 642
646 module_put(mk->mod);
647
648 return ret; 643 return ret;
649} 644}
650 645
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj,
662 if (!attribute->store) 657 if (!attribute->store)
663 return -EIO; 658 return -EIO;
664 659
665 if (!try_module_get(mk->mod))
666 return -ENODEV;
667
668 ret = attribute->store(attribute, mk->mod, buf, len); 660 ret = attribute->store(attribute, mk->mod, buf, len);
669 661
670 module_put(mk->mod);
671
672 return ret; 662 return ret;
673} 663}
674 664
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc072469..eeb836b65c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
28#include <linux/hash.h> 28#include <linux/hash.h>
29 29
30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 30#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
31static struct hlist_head *pid_hash[PIDTYPE_MAX]; 31static struct hlist_head *pid_hash;
32static int pidhash_shift; 32static int pidhash_shift;
33static kmem_cache_t *pid_cachep;
33 34
34int pid_max = PID_MAX_DEFAULT; 35int pid_max = PID_MAX_DEFAULT;
35int last_pid; 36int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
60static pidmap_t pidmap_array[PIDMAP_ENTRIES] = 61static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
61 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; 62 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
62 63
64/*
65 * Note: disable interrupts while the pidmap_lock is held as an
66 * interrupt might come in and do read_lock(&tasklist_lock).
67 *
68 * If we don't disable interrupts there is a nasty deadlock between
69 * detach_pid()->free_pid() and another cpu that does
70 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
71 * read_lock(&tasklist_lock);
72 *
73 * After we clean up the tasklist_lock and know there are no
74 * irq handlers that take it we can leave the interrupts enabled.
75 * For now it is easier to be safe than to prove it can't happen.
76 */
63static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 77static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
64 78
65fastcall void free_pidmap(int pid) 79static fastcall void free_pidmap(int pid)
66{ 80{
67 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; 81 pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
68 int offset = pid & BITS_PER_PAGE_MASK; 82 int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
71 atomic_inc(&map->nr_free); 85 atomic_inc(&map->nr_free);
72} 86}
73 87
74int alloc_pidmap(void) 88static int alloc_pidmap(void)
75{ 89{
76 int i, offset, max_scan, pid, last = last_pid; 90 int i, offset, max_scan, pid, last = last_pid;
77 pidmap_t *map; 91 pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
89 * Free the page if someone raced with us 103 * Free the page if someone raced with us
90 * installing it: 104 * installing it:
91 */ 105 */
92 spin_lock(&pidmap_lock); 106 spin_lock_irq(&pidmap_lock);
93 if (map->page) 107 if (map->page)
94 free_page(page); 108 free_page(page);
95 else 109 else
96 map->page = (void *)page; 110 map->page = (void *)page;
97 spin_unlock(&pidmap_lock); 111 spin_unlock_irq(&pidmap_lock);
98 if (unlikely(!map->page)) 112 if (unlikely(!map->page))
99 break; 113 break;
100 } 114 }
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
131 return -1; 145 return -1;
132} 146}
133 147
134struct pid * fastcall find_pid(enum pid_type type, int nr) 148fastcall void put_pid(struct pid *pid)
149{
150 if (!pid)
151 return;
152 if ((atomic_read(&pid->count) == 1) ||
153 atomic_dec_and_test(&pid->count))
154 kmem_cache_free(pid_cachep, pid);
155}
156
157static void delayed_put_pid(struct rcu_head *rhp)
158{
159 struct pid *pid = container_of(rhp, struct pid, rcu);
160 put_pid(pid);
161}
162
163fastcall void free_pid(struct pid *pid)
164{
165 /* We can be called with write_lock_irq(&tasklist_lock) held */
166 unsigned long flags;
167
168 spin_lock_irqsave(&pidmap_lock, flags);
169 hlist_del_rcu(&pid->pid_chain);
170 spin_unlock_irqrestore(&pidmap_lock, flags);
171
172 free_pidmap(pid->nr);
173 call_rcu(&pid->rcu, delayed_put_pid);
174}
175
176struct pid *alloc_pid(void)
177{
178 struct pid *pid;
179 enum pid_type type;
180 int nr = -1;
181
182 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
183 if (!pid)
184 goto out;
185
186 nr = alloc_pidmap();
187 if (nr < 0)
188 goto out_free;
189
190 atomic_set(&pid->count, 1);
191 pid->nr = nr;
192 for (type = 0; type < PIDTYPE_MAX; ++type)
193 INIT_HLIST_HEAD(&pid->tasks[type]);
194
195 spin_lock_irq(&pidmap_lock);
196 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
197 spin_unlock_irq(&pidmap_lock);
198
199out:
200 return pid;
201
202out_free:
203 kmem_cache_free(pid_cachep, pid);
204 pid = NULL;
205 goto out;
206}
207
208struct pid * fastcall find_pid(int nr)
135{ 209{
136 struct hlist_node *elem; 210 struct hlist_node *elem;
137 struct pid *pid; 211 struct pid *pid;
138 212
139 hlist_for_each_entry_rcu(pid, elem, 213 hlist_for_each_entry_rcu(pid, elem,
140 &pid_hash[type][pid_hashfn(nr)], pid_chain) { 214 &pid_hash[pid_hashfn(nr)], pid_chain) {
141 if (pid->nr == nr) 215 if (pid->nr == nr)
142 return pid; 216 return pid;
143 } 217 }
@@ -146,105 +220,80 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
146 220
147int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
148{ 222{
149 struct pid *pid, *task_pid; 223 struct pid_link *link;
150 224 struct pid *pid;
151 task_pid = &task->pids[type];
152 pid = find_pid(type, nr);
153 task_pid->nr = nr;
154 if (pid == NULL) {
155 INIT_LIST_HEAD(&task_pid->pid_list);
156 hlist_add_head_rcu(&task_pid->pid_chain,
157 &pid_hash[type][pid_hashfn(nr)]);
158 } else {
159 INIT_HLIST_NODE(&task_pid->pid_chain);
160 list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
161 }
162
163 return 0;
164}
165
166static fastcall int __detach_pid(task_t *task, enum pid_type type)
167{
168 struct pid *pid, *pid_next;
169 int nr = 0;
170
171 pid = &task->pids[type];
172 if (!hlist_unhashed(&pid->pid_chain)) {
173 225
174 if (list_empty(&pid->pid_list)) { 226 WARN_ON(!task->pid); /* to be removed soon */
175 nr = pid->nr; 227 WARN_ON(!nr); /* to be removed soon */
176 hlist_del_rcu(&pid->pid_chain);
177 } else {
178 pid_next = list_entry(pid->pid_list.next,
179 struct pid, pid_list);
180 /* insert next pid from pid_list to hash */
181 hlist_replace_rcu(&pid->pid_chain,
182 &pid_next->pid_chain);
183 }
184 }
185 228
186 list_del_rcu(&pid->pid_list); 229 link = &task->pids[type];
187 pid->nr = 0; 230 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
188 232
189 return nr; 233 return 0;
190} 234}
191 235
192void fastcall detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(task_t *task, enum pid_type type)
193{ 237{
194 int tmp, nr; 238 struct pid_link *link;
239 struct pid *pid;
240 int tmp;
195 241
196 nr = __detach_pid(task, type); 242 link = &task->pids[type];
197 if (!nr) 243 pid = link->pid;
198 return; 244
245 hlist_del_rcu(&link->node);
246 link->pid = NULL;
199 247
200 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 248 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
201 if (tmp != type && find_pid(tmp, nr)) 249 if (!hlist_empty(&pid->tasks[tmp]))
202 return; 250 return;
203 251
204 free_pidmap(nr); 252 free_pid(pid);
205} 253}
206 254
207task_t *find_task_by_pid_type(int type, int nr) 255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
208{ 256{
209 struct pid *pid; 257 struct task_struct *result = NULL;
210 258 if (pid) {
211 pid = find_pid(type, nr); 259 struct hlist_node *first;
212 if (!pid) 260 first = rcu_dereference(pid->tasks[type].first);
213 return NULL; 261 if (first)
262 result = hlist_entry(first, struct task_struct, pids[(type)].node);
263 }
264 return result;
265}
214 266
215 return pid_task(&pid->pid_list, type); 267/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */
270task_t *find_task_by_pid_type(int type, int nr)
271{
272 return pid_task(find_pid(nr), type);
216} 273}
217 274
218EXPORT_SYMBOL(find_task_by_pid_type); 275EXPORT_SYMBOL(find_task_by_pid_type);
219 276
220/* 277struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
221 * This function switches the PIDs if a non-leader thread calls 278{
222 * sys_execve() - this must be done without releasing the PID. 279 struct task_struct *result;
223 * (which a detach_pid() would eventually do.) 280 rcu_read_lock();
224 */ 281 result = pid_task(pid, type);
225void switch_exec_pids(task_t *leader, task_t *thread) 282 if (result)
283 get_task_struct(result);
284 rcu_read_unlock();
285 return result;
286}
287
288struct pid *find_get_pid(pid_t nr)
226{ 289{
227 __detach_pid(leader, PIDTYPE_PID); 290 struct pid *pid;
228 __detach_pid(leader, PIDTYPE_TGID); 291
229 __detach_pid(leader, PIDTYPE_PGID); 292 rcu_read_lock();
230 __detach_pid(leader, PIDTYPE_SID); 293 pid = get_pid(find_pid(nr));
231 294 rcu_read_unlock();
232 __detach_pid(thread, PIDTYPE_PID); 295
233 __detach_pid(thread, PIDTYPE_TGID); 296 return pid;
234
235 leader->pid = leader->tgid = thread->pid;
236 thread->pid = thread->tgid;
237
238 attach_pid(thread, PIDTYPE_PID, thread->pid);
239 attach_pid(thread, PIDTYPE_TGID, thread->tgid);
240 attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
241 attach_pid(thread, PIDTYPE_SID, thread->signal->session);
242 list_add_tail(&thread->tasks, &init_task.tasks);
243
244 attach_pid(leader, PIDTYPE_PID, leader->pid);
245 attach_pid(leader, PIDTYPE_TGID, leader->tgid);
246 attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
247 attach_pid(leader, PIDTYPE_SID, leader->signal->session);
248} 297}
249 298
250/* 299/*
@@ -254,7 +303,7 @@ void switch_exec_pids(task_t *leader, task_t *thread)
254 */ 303 */
255void __init pidhash_init(void) 304void __init pidhash_init(void)
256{ 305{
257 int i, j, pidhash_size; 306 int i, pidhash_size;
258 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); 307 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
259 308
260 pidhash_shift = max(4, fls(megabytes * 4)); 309 pidhash_shift = max(4, fls(megabytes * 4));
@@ -263,30 +312,23 @@ void __init pidhash_init(void)
263 312
264 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", 313 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
265 pidhash_size, pidhash_shift, 314 pidhash_size, pidhash_shift,
266 PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head)); 315 pidhash_size * sizeof(struct hlist_head));
267 316
268 for (i = 0; i < PIDTYPE_MAX; i++) { 317 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
269 pid_hash[i] = alloc_bootmem(pidhash_size * 318 if (!pid_hash)
270 sizeof(*(pid_hash[i]))); 319 panic("Could not alloc pidhash!\n");
271 if (!pid_hash[i]) 320 for (i = 0; i < pidhash_size; i++)
272 panic("Could not alloc pidhash!\n"); 321 INIT_HLIST_HEAD(&pid_hash[i]);
273 for (j = 0; j < pidhash_size; j++)
274 INIT_HLIST_HEAD(&pid_hash[i][j]);
275 }
276} 322}
277 323
278void __init pidmap_init(void) 324void __init pidmap_init(void)
279{ 325{
280 int i;
281
282 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); 326 pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
327 /* Reserve PID 0. We never call free_pidmap(0) */
283 set_bit(0, pidmap_array->page); 328 set_bit(0, pidmap_array->page);
284 atomic_dec(&pidmap_array->nr_free); 329 atomic_dec(&pidmap_array->nr_free);
285 330
286 /* 331 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
287 * Allocate PID 0, and hash it via all PID types: 332 __alignof__(struct pid),
288 */ 333 SLAB_PANIC, NULL, NULL);
289
290 for (i = 0; i < PIDTYPE_MAX; i++)
291 attach_pid(current, i, 0);
292} 334}
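The pid.c rewrite above replaces the per-type hash tables with a single refcounted struct pid that tasks attach to through pid_link, and it introduces the helpers visible in the hunks: find_pid(), find_get_pid(), pid_task(), get_pid_task() and put_pid(). A minimal sketch of resolving a numeric pid to a task with this API, using only functions defined in the diff plus put_task_struct(); the function name is hypothetical.

```c
#include <linux/pid.h>
#include <linux/sched.h>

/*
 * Resolve a numeric pid to the task using it as its PID, take a task
 * reference, and drop the struct pid reference taken for the lookup.
 * The caller must put_task_struct() the result when done with it.
 */
static struct task_struct *demo_task_from_nr(pid_t nr)
{
	struct pid *pid;
	struct task_struct *task;

	pid = find_get_pid(nr);		/* RCU lookup + get_pid(); may return NULL */
	if (!pid)
		return NULL;

	/* get_pid_task() does the rcu_read_lock()/get_task_struct() dance. */
	task = get_pid_task(pid, PIDTYPE_PID);

	put_pid(pid);			/* struct pid is freed once its last user is gone */
	return task;
}
```

The point of the refcount is visible in the removed code: callers no longer walk pid_list or re-lookup by number, and a held struct pid cannot disappear even if the numeric pid is recycled.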
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 216f574b5f..ac6dc87444 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
35#include <linux/interrupt.h> 35#include <linux/interrupt.h>
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/mutex.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40#include <asm/semaphore.h> 41#include <asm/semaphore.h>
@@ -144,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
144 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
145static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
146 147
147static int posix_timer_fn(void *data); 148static int posix_timer_fn(struct hrtimer *data);
148 149
149static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
150 151
@@ -250,15 +251,18 @@ __initcall(init_posix_timers);
250 251
251static void schedule_next_timer(struct k_itimer *timr) 252static void schedule_next_timer(struct k_itimer *timr)
252{ 253{
254 struct hrtimer *timer = &timr->it.real.timer;
255
253 if (timr->it.real.interval.tv64 == 0) 256 if (timr->it.real.interval.tv64 == 0)
254 return; 257 return;
255 258
256 timr->it_overrun += hrtimer_forward(&timr->it.real.timer, 259 timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
257 timr->it.real.interval); 260 timr->it.real.interval);
261
258 timr->it_overrun_last = timr->it_overrun; 262 timr->it_overrun_last = timr->it_overrun;
259 timr->it_overrun = -1; 263 timr->it_overrun = -1;
260 ++timr->it_requeue_pending; 264 ++timr->it_requeue_pending;
261 hrtimer_restart(&timr->it.real.timer); 265 hrtimer_restart(timer);
262} 266}
263 267
264/* 268/*
@@ -330,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
330 334
331 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
332 */ 336 */
333static int posix_timer_fn(void *data) 337static int posix_timer_fn(struct hrtimer *timer)
334{ 338{
335 struct k_itimer *timr = data; 339 struct k_itimer *timr;
336 unsigned long flags; 340 unsigned long flags;
337 int si_private = 0; 341 int si_private = 0;
338 int ret = HRTIMER_NORESTART; 342 int ret = HRTIMER_NORESTART;
339 343
344 timr = container_of(timer, struct k_itimer, it.real.timer);
340 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
341 346
342 if (timr->it.real.interval.tv64 != 0) 347 if (timr->it.real.interval.tv64 != 0)
@@ -350,9 +355,11 @@ static int posix_timer_fn(void *data)
350 */ 355 */
351 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
352 timr->it_overrun += 357 timr->it_overrun +=
353 hrtimer_forward(&timr->it.real.timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time,
354 timr->it.real.interval); 360 timr->it.real.interval);
355 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending;
356 } 363 }
357 } 364 }
358 365
@@ -601,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
601static void 608static void
602common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) 609common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
603{ 610{
604 ktime_t remaining; 611 ktime_t now, remaining, iv;
605 struct hrtimer *timer = &timr->it.real.timer; 612 struct hrtimer *timer = &timr->it.real.timer;
606 613
607 memset(cur_setting, 0, sizeof(struct itimerspec)); 614 memset(cur_setting, 0, sizeof(struct itimerspec));
608 remaining = hrtimer_get_remaining(timer);
609 615
610 /* Time left ? or timer pending */ 616 iv = timr->it.real.interval;
611 if (remaining.tv64 > 0 || hrtimer_active(timer)) 617
612 goto calci;
613 /* interval timer ? */ 618 /* interval timer ? */
614 if (timr->it.real.interval.tv64 == 0) 619 if (iv.tv64)
620 cur_setting->it_interval = ktime_to_timespec(iv);
621 else if (!hrtimer_active(timer) &&
622 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
615 return; 623 return;
624
625 now = timer->base->get_time();
626
616 /* 627 /*
617 * When a requeue is pending or this is a SIGEV_NONE timer 628 * When a requeue is pending or this is a SIGEV_NONE
618 * move the expiry time forward by intervals, so expiry is > 629 * timer move the expiry time forward by intervals, so
619 * now. 630 * expiry is > now.
620 */ 631 */
621 if (timr->it_requeue_pending & REQUEUE_PENDING || 632 if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
622 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { 633 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
623 timr->it_overrun += 634 timr->it_overrun += hrtimer_forward(timer, now, iv);
624 hrtimer_forward(timer, timr->it.real.interval); 635
625 remaining = hrtimer_get_remaining(timer); 636 remaining = ktime_sub(timer->expires, now);
626 }
627 calci:
628 /* interval timer ? */
629 if (timr->it.real.interval.tv64 != 0)
630 cur_setting->it_interval =
631 ktime_to_timespec(timr->it.real.interval);
632 /* Return 0 only, when the timer is expired and not pending */ 637 /* Return 0 only, when the timer is expired and not pending */
633 if (remaining.tv64 <= 0) 638 if (remaining.tv64 <= 0) {
634 cur_setting->it_value.tv_nsec = 1; 639 /*
635 else 640 * A single shot SIGEV_NONE timer must return 0, when
641 * it is expired !
642 */
643 if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
644 cur_setting->it_value.tv_nsec = 1;
645 } else
636 cur_setting->it_value = ktime_to_timespec(remaining); 646 cur_setting->it_value = ktime_to_timespec(remaining);
637} 647}
638 648
@@ -715,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
715 725
716 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 726 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
717 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 727 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
718 timr->it.real.timer.data = timr;
719 timr->it.real.timer.function = posix_timer_fn; 728 timr->it.real.timer.function = posix_timer_fn;
720 729
721 timer->expires = timespec_to_ktime(new_setting->it_value); 730 timer->expires = timespec_to_ktime(new_setting->it_value);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9fd8d4f035..ce0dfb8f4a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -41,7 +41,7 @@ config SOFTWARE_SUSPEND
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
42 ---help--- 42 ---help---
43 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
44 It doesn't need APM. 44 It doesn't need ACPI or APM.
45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>' 45 You may suspend your machine by 'swsusp' or 'shutdown -z <time>'
46 (patch for sysvinit needed). 46 (patch for sysvinit needed).
47 47
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96..8d0af3d37a 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
5 5
6obj-y := main.o process.o console.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11 11
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc9..81d4d982f3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
22#include "power.h" 22#include "power.h"
23 23
24 24
25extern suspend_disk_method_t pm_disk_mode;
26
27extern int swsusp_shrink_memory(void);
28extern int swsusp_suspend(void);
29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
30extern int swsusp_check(void);
31extern int swsusp_read(struct pbe **pblist_ptr);
32extern void swsusp_close(void);
33extern int swsusp_resume(void);
34
35
36static int noresume = 0; 25static int noresume = 0;
37char resume_file[256] = CONFIG_PM_STD_PARTITION; 26char resume_file[256] = CONFIG_PM_STD_PARTITION;
38dev_t swsusp_resume_device; 27dev_t swsusp_resume_device;
@@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode)
70 while(1); 59 while(1);
71} 60}
72 61
73
74static int in_suspend __nosavedata = 0;
75
76
77static inline void platform_finish(void) 62static inline void platform_finish(void)
78{ 63{
79 if (pm_disk_mode == PM_DISK_PLATFORM) { 64 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -87,7 +72,6 @@ static int prepare_processes(void)
87 int error; 72 int error;
88 73
89 pm_prepare_console(); 74 pm_prepare_console();
90 sys_sync();
91 disable_nonboot_cpus(); 75 disable_nonboot_cpus();
92 76
93 if (freeze_processes()) { 77 if (freeze_processes()) {
@@ -145,7 +129,7 @@ int pm_suspend_disk(void)
145 if (in_suspend) { 129 if (in_suspend) {
146 device_resume(); 130 device_resume();
147 pr_debug("PM: writing image.\n"); 131 pr_debug("PM: writing image.\n");
148 error = swsusp_write(pagedir_nosave, nr_copy_pages); 132 error = swsusp_write();
149 if (!error) 133 if (!error)
150 power_down(pm_disk_mode); 134 power_down(pm_disk_mode);
151 else { 135 else {
@@ -216,7 +200,7 @@ static int software_resume(void)
216 200
217 pr_debug("PM: Reading swsusp image.\n"); 201 pr_debug("PM: Reading swsusp image.\n");
218 202
219 if ((error = swsusp_read(&pagedir_nosave))) { 203 if ((error = swsusp_read())) {
220 swsusp_free(); 204 swsusp_free();
221 goto Thaw; 205 goto Thaw;
222 } 206 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4..a6d9ef4600 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
103} 103}
104 104
105 105
106static int suspend_enter(suspend_state_t state) 106int suspend_enter(suspend_state_t state)
107{ 107{
108 int error = 0; 108 int error = 0;
109 unsigned long flags; 109 unsigned long flags;
@@ -272,7 +272,7 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
272 if (*s && !strncmp(buf, *s, len)) 272 if (*s && !strncmp(buf, *s, len))
273 break; 273 break;
274 } 274 }
275 if (*s) 275 if (state < PM_SUSPEND_MAX && *s)
276 error = enter_state(state); 276 error = enter_state(state);
277 else 277 else
278 error = -EINVAL; 278 error = -EINVAL;
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857..84063ac8fc 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h> 26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h>
28 29
29int pm_active; 30int pm_active;
30 31
@@ -40,7 +41,7 @@ int pm_active;
40 * until a resume but that will be fine. 41 * until a resume but that will be fine.
41 */ 42 */
42 43
43static DECLARE_MUTEX(pm_devs_lock); 44static DEFINE_MUTEX(pm_devs_lock);
44static LIST_HEAD(pm_devs); 45static LIST_HEAD(pm_devs);
45 46
46/** 47/**
@@ -67,32 +68,13 @@ struct pm_dev *pm_register(pm_dev_t type,
67 dev->id = id; 68 dev->id = id;
68 dev->callback = callback; 69 dev->callback = callback;
69 70
70 down(&pm_devs_lock); 71 mutex_lock(&pm_devs_lock);
71 list_add(&dev->entry, &pm_devs); 72 list_add(&dev->entry, &pm_devs);
72 up(&pm_devs_lock); 73 mutex_unlock(&pm_devs_lock);
73 } 74 }
74 return dev; 75 return dev;
75} 76}
76 77
77/**
78 * pm_unregister - unregister a device with power management
79 * @dev: device to unregister
80 *
81 * Remove a device from the power management notification lists. The
82 * dev passed must be a handle previously returned by pm_register.
83 */
84
85void pm_unregister(struct pm_dev *dev)
86{
87 if (dev) {
88 down(&pm_devs_lock);
89 list_del(&dev->entry);
90 up(&pm_devs_lock);
91
92 kfree(dev);
93 }
94}
95
96static void __pm_unregister(struct pm_dev *dev) 78static void __pm_unregister(struct pm_dev *dev)
97{ 79{
98 if (dev) { 80 if (dev) {
@@ -118,7 +100,7 @@ void pm_unregister_all(pm_callback callback)
118 if (!callback) 100 if (!callback)
119 return; 101 return;
120 102
121 down(&pm_devs_lock); 103 mutex_lock(&pm_devs_lock);
122 entry = pm_devs.next; 104 entry = pm_devs.next;
123 while (entry != &pm_devs) { 105 while (entry != &pm_devs) {
124 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 106 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +108,7 @@ void pm_unregister_all(pm_callback callback)
126 if (dev->callback == callback) 108 if (dev->callback == callback)
127 __pm_unregister(dev); 109 __pm_unregister(dev);
128 } 110 }
129 up(&pm_devs_lock); 111 mutex_unlock(&pm_devs_lock);
130} 112}
131 113
132/** 114/**
@@ -234,7 +216,7 @@ int pm_send_all(pm_request_t rqst, void *data)
234{ 216{
235 struct list_head *entry; 217 struct list_head *entry;
236 218
237 down(&pm_devs_lock); 219 mutex_lock(&pm_devs_lock);
238 entry = pm_devs.next; 220 entry = pm_devs.next;
239 while (entry != &pm_devs) { 221 while (entry != &pm_devs) {
240 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); 222 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,18 +228,17 @@ int pm_send_all(pm_request_t rqst, void *data)
246 */ 228 */
247 if (rqst == PM_SUSPEND) 229 if (rqst == PM_SUSPEND)
248 pm_undo_all(dev); 230 pm_undo_all(dev);
249 up(&pm_devs_lock); 231 mutex_unlock(&pm_devs_lock);
250 return status; 232 return status;
251 } 233 }
252 } 234 }
253 entry = entry->next; 235 entry = entry->next;
254 } 236 }
255 up(&pm_devs_lock); 237 mutex_unlock(&pm_devs_lock);
256 return 0; 238 return 0;
257} 239}
258 240
259EXPORT_SYMBOL(pm_register); 241EXPORT_SYMBOL(pm_register);
260EXPORT_SYMBOL(pm_unregister);
261EXPORT_SYMBOL(pm_unregister_all); 242EXPORT_SYMBOL(pm_unregister_all);
262EXPORT_SYMBOL(pm_send_all); 243EXPORT_SYMBOL(pm_send_all);
263EXPORT_SYMBOL(pm_active); 244EXPORT_SYMBOL(pm_active);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba6808..f06f12f217 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
8 int cpus; 8 int cpus;
9 unsigned long image_pages; 9 unsigned long image_pages;
10 unsigned long pages; 10 unsigned long pages;
11 unsigned long size;
11} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
12 13
13 14
@@ -37,21 +38,79 @@ extern struct subsystem power_subsys;
37/* References to section boundaries */ 38/* References to section boundaries */
38extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
39 40
40extern unsigned int nr_copy_pages;
41extern struct pbe *pagedir_nosave; 41extern struct pbe *pagedir_nosave;
42 42
43/* Preferred image size in bytes (default 500 MB) */ 43/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 44extern unsigned long image_size;
45extern int in_suspend;
46extern dev_t swsusp_resume_device;
45 47
46extern asmlinkage int swsusp_arch_suspend(void); 48extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 49extern asmlinkage int swsusp_arch_resume(void);
48 50
49extern unsigned int count_data_pages(void); 51extern unsigned int count_data_pages(void);
50extern void free_pagedir(struct pbe *pblist); 52
51extern void release_eaten_pages(void); 53struct snapshot_handle {
52extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 54 loff_t offset;
55 unsigned int page;
56 unsigned int page_offset;
57 unsigned int prev;
58 struct pbe *pbe;
59 void *buffer;
60 unsigned int buf_offset;
61};
62
63#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle);
68
69#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
71#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
72#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
73#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
74#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
75#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
76#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
77#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
78#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
79#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
80#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
81#define SNAPSHOT_IOC_MAXNR 11
82
83/**
84 * The bitmap is used for tracing allocated swap pages
85 *
86 * The entire bitmap consists of a number of bitmap_page
87 * structures linked with the help of the .next member.
88 * Thus each page can be allocated individually, so we only
89 * need to make 0-order memory allocations to create
90 * the bitmap.
91 */
92
93#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
94#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
95#define BITS_PER_CHUNK (sizeof(long) * 8)
96#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
97
98struct bitmap_page {
99 unsigned long chunks[BITMAP_PAGE_CHUNKS];
100 struct bitmap_page *next;
101};
102
103extern void free_bitmap(struct bitmap_page *bitmap);
104extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
105extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
106extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
107
108extern int swsusp_check(void);
109extern int swsusp_shrink_memory(void);
53extern void swsusp_free(void); 110extern void swsusp_free(void);
54extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 111extern int swsusp_suspend(void);
55extern unsigned int snapshot_nr_pages(void); 112extern int swsusp_resume(void);
56extern struct pbe *snapshot_pblist(void); 113extern int swsusp_read(void);
57extern void snapshot_pblist_set(struct pbe *pblist); 114extern int swsusp_write(void);
115extern void swsusp_close(void);
116extern int suspend_enter(suspend_state_t state);
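power.h now exposes the snapshot streaming interface (struct snapshot_handle, snapshot_read_next(), snapshot_write_next(), data_of()) and the SNAPSHOT_* ioctls that the new kernel/power/user.c and swap.c are built on. A hedged sketch of how a consumer is expected to drain the image with snapshot_read_next(), following only the contract documented in the snapshot.c hunk below (positive return = bytes valid at data_of(), 0 = end of stream, negative = error); demo_sink() stands in for whatever actually stores the data, such as the swap writer or the user-space read path.

```c
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include "power.h"

/* Hypothetical sink; real callers push the bytes to swap or to user space. */
static int demo_sink(void *buf, size_t len)
{
	return 0;
}

static int demo_drain_image(void)
{
	struct snapshot_handle handle;
	int ret;

	memset(&handle, 0, sizeof(handle));	/* must start zeroed, per the API comment */

	while ((ret = snapshot_read_next(&handle, PAGE_SIZE)) > 0) {
		/* data_of() yields the current buffer position; ret bytes are valid
		 * and never cross a page boundary. */
		ret = demo_sink(data_of(handle), ret);
		if (ret)
			return ret;
	}
	return ret;	/* 0 once the whole image has been handed out, < 0 on error */
}
```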
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a..b2a5f671d6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h>
15 16
16/* 17/*
17 * Timeout for stopping processes 18 * Timeout for stopping processes
18 */ 19 */
19#define TIMEOUT (6 * HZ) 20#define TIMEOUT (20 * HZ)
20 21
21 22
22static inline int freezeable(struct task_struct * p) 23static inline int freezeable(struct task_struct * p)
@@ -25,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
25 (p->flags & PF_NOFREEZE) || 26 (p->flags & PF_NOFREEZE) ||
26 (p->exit_state == EXIT_ZOMBIE) || 27 (p->exit_state == EXIT_ZOMBIE) ||
27 (p->exit_state == EXIT_DEAD) || 28 (p->exit_state == EXIT_DEAD) ||
28 (p->state == TASK_STOPPED) || 29 (p->state == TASK_STOPPED))
29 (p->state == TASK_TRACED))
30 return 0; 30 return 0;
31 return 1; 31 return 1;
32} 32}
@@ -54,38 +54,62 @@ void refrigerator(void)
54 current->state = save; 54 current->state = save;
55} 55}
56 56
57static inline void freeze_process(struct task_struct *p)
58{
59 unsigned long flags;
60
61 if (!freezing(p)) {
62 freeze(p);
63 spin_lock_irqsave(&p->sighand->siglock, flags);
64 signal_wake_up(p, 0);
65 spin_unlock_irqrestore(&p->sighand->siglock, flags);
66 }
67}
68
57/* 0 = success, else # of processes that we failed to stop */ 69/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 70int freeze_processes(void)
59{ 71{
60 int todo; 72 int todo, nr_user, user_frozen;
61 unsigned long start_time; 73 unsigned long start_time;
62 struct task_struct *g, *p; 74 struct task_struct *g, *p;
63 unsigned long flags; 75 unsigned long flags;
64 76
65 printk( "Stopping tasks: " ); 77 printk( "Stopping tasks: " );
66 start_time = jiffies; 78 start_time = jiffies;
79 user_frozen = 0;
67 do { 80 do {
68 todo = 0; 81 nr_user = todo = 0;
69 read_lock(&tasklist_lock); 82 read_lock(&tasklist_lock);
70 do_each_thread(g, p) { 83 do_each_thread(g, p) {
71 if (!freezeable(p)) 84 if (!freezeable(p))
72 continue; 85 continue;
73 if (frozen(p)) 86 if (frozen(p))
74 continue; 87 continue;
75 88 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
76 freeze(p); 89 /* The task is a user-space one.
77 spin_lock_irqsave(&p->sighand->siglock, flags); 90 * Freeze it unless there's a vfork completion
78 signal_wake_up(p, 0); 91 * pending
79 spin_unlock_irqrestore(&p->sighand->siglock, flags); 92 */
80 todo++; 93 if (!p->vfork_done)
94 freeze_process(p);
95 nr_user++;
96 } else {
97 /* Freeze only if the user space is frozen */
98 if (user_frozen)
99 freeze_process(p);
100 todo++;
101 }
81 } while_each_thread(g, p); 102 } while_each_thread(g, p);
82 read_unlock(&tasklist_lock); 103 read_unlock(&tasklist_lock);
104 todo += nr_user;
105 if (!user_frozen && !nr_user) {
106 sys_sync();
107 start_time = jiffies;
108 }
109 user_frozen = !nr_user;
83 yield(); /* Yield is okay here */ 110 yield(); /* Yield is okay here */
84 if (todo && time_after(jiffies, start_time + TIMEOUT)) { 111 if (todo && time_after(jiffies, start_time + TIMEOUT))
85 printk( "\n" );
86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
87 break; 112 break;
88 }
89 } while(todo); 113 } while(todo);
90 114
91 /* This does not unfreeze processes that are already frozen 115 /* This does not unfreeze processes that are already frozen
@@ -94,8 +118,14 @@ int freeze_processes(void)
94 * but it cleans up leftover PF_FREEZE requests. 118 * but it cleans up leftover PF_FREEZE requests.
95 */ 119 */
96 if (todo) { 120 if (todo) {
121 printk( "\n" );
122 printk(KERN_ERR " stopping tasks timed out "
123 "after %d seconds (%d tasks remaining):\n",
124 TIMEOUT / HZ, todo);
97 read_lock(&tasklist_lock); 125 read_lock(&tasklist_lock);
98 do_each_thread(g, p) 126 do_each_thread(g, p) {
127 if (freezeable(p) && !frozen(p))
128 printk(KERN_ERR " %s\n", p->comm);
99 if (freezing(p)) { 129 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm); 130 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE; 131 p->flags &= ~PF_FREEZE;
@@ -103,7 +133,7 @@ int freeze_processes(void)
103 recalc_sigpending_tsk(p); 133 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags); 134 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 } 135 }
106 while_each_thread(g, p); 136 } while_each_thread(g, p);
107 read_unlock(&tasklist_lock); 137 read_unlock(&tasklist_lock);
108 return todo; 138 return todo;
109 } 139 }
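The process.c change makes freezing two-phase: user-space tasks are frozen first (skipping tasks with a vfork completion pending), sys_sync() runs once no unfrozen user tasks remain, and only then are kernel threads frozen; the timeout grows to 20 seconds and a failure now lists the tasks that would not stop. For this to work, a freezable kernel thread still has to poll the freezer itself. A minimal sketch of that cooperation, using only the freezing()/refrigerator() helpers referenced in the hunk; the thread body is hypothetical.

```c
#include <linux/sched.h>
#include <linux/kthread.h>

static int demo_kthread(void *unused)
{
	while (!kthread_should_stop()) {
		/* Park in the refrigerator whenever freeze_processes() asks;
		 * threads that must never be frozen set PF_NOFREEZE instead. */
		if (freezing(current))
			refrigerator();

		/* ... one unit of work, then sleep ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
```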
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 911fc62b82..5957312b2d 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void)
49 49
50 printk("Thawing cpus ...\n"); 50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) { 51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu); 52 error = cpu_up(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) { 53 if (!error) {
56 printk("CPU%d is up\n", cpu); 54 printk("CPU%d is up\n", cpu);
57 continue; 55 continue;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 41f66365f0..3eeedbb13b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12 12
13#include <linux/version.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15#include <linux/suspend.h> 16#include <linux/suspend.h>
@@ -34,7 +35,9 @@
34#include "power.h" 35#include "power.h"
35 36
36struct pbe *pagedir_nosave; 37struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages; 38static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages;
40static unsigned long *buffer;
38 41
39#ifdef CONFIG_HIGHMEM 42#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void) 43unsigned int count_highmem_pages(void)
@@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
80 void *kaddr; 83 void *kaddr;
81 unsigned long pfn = zone_pfn + zone->zone_start_pfn; 84 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
82 85
83 if (!(pfn%1000)) 86 if (!(pfn%10000))
84 printk("."); 87 printk(".");
85 if (!pfn_valid(pfn)) 88 if (!pfn_valid(pfn))
86 continue; 89 continue;
@@ -91,10 +94,8 @@ static int save_highmem_zone(struct zone *zone)
91 * corrected eventually when the cases giving rise to this 94 * corrected eventually when the cases giving rise to this
92 * are better understood. 95 * are better understood.
93 */ 96 */
94 if (PageReserved(page)) { 97 if (PageReserved(page))
95 printk("highmem reserved page?!\n");
96 continue; 98 continue;
97 }
98 BUG_ON(PageNosave(page)); 99 BUG_ON(PageNosave(page));
99 if (PageNosaveFree(page)) 100 if (PageNosaveFree(page))
100 continue; 101 continue;
@@ -121,13 +122,15 @@ int save_highmem(void)
121 struct zone *zone; 122 struct zone *zone;
122 int res = 0; 123 int res = 0;
123 124
124 pr_debug("swsusp: Saving Highmem\n"); 125 pr_debug("swsusp: Saving Highmem");
126 drain_local_pages();
125 for_each_zone (zone) { 127 for_each_zone (zone) {
126 if (is_highmem(zone)) 128 if (is_highmem(zone))
127 res = save_highmem_zone(zone); 129 res = save_highmem_zone(zone);
128 if (res) 130 if (res)
129 return res; 131 return res;
130 } 132 }
133 printk("\n");
131 return 0; 134 return 0;
132} 135}
133 136
@@ -237,14 +240,15 @@ static void copy_data_pages(struct pbe *pblist)
237 * free_pagedir - free pages allocated with alloc_pagedir() 240 * free_pagedir - free pages allocated with alloc_pagedir()
238 */ 241 */
239 242
240void free_pagedir(struct pbe *pblist) 243static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
241{ 244{
242 struct pbe *pbe; 245 struct pbe *pbe;
243 246
244 while (pblist) { 247 while (pblist) {
245 pbe = (pblist + PB_PAGE_SKIP)->next; 248 pbe = (pblist + PB_PAGE_SKIP)->next;
246 ClearPageNosave(virt_to_page(pblist)); 249 ClearPageNosave(virt_to_page(pblist));
247 ClearPageNosaveFree(virt_to_page(pblist)); 250 if (clear_nosave_free)
251 ClearPageNosaveFree(virt_to_page(pblist));
248 free_page((unsigned long)pblist); 252 free_page((unsigned long)pblist);
249 pblist = pbe; 253 pblist = pbe;
250 } 254 }
@@ -303,7 +307,7 @@ struct eaten_page {
303 307
304static struct eaten_page *eaten_pages = NULL; 308static struct eaten_page *eaten_pages = NULL;
305 309
306void release_eaten_pages(void) 310static void release_eaten_pages(void)
307{ 311{
308 struct eaten_page *p, *q; 312 struct eaten_page *p, *q;
309 313
@@ -378,7 +382,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
378 if (!nr_pages) 382 if (!nr_pages)
379 return NULL; 383 return NULL;
380 384
381 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
382 pblist = alloc_image_page(gfp_mask, safe_needed); 385 pblist = alloc_image_page(gfp_mask, safe_needed);
383 /* FIXME: rewrite this ugly loop */ 386 /* FIXME: rewrite this ugly loop */
384 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 387 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -387,10 +390,10 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
387 pbe->next = alloc_image_page(gfp_mask, safe_needed); 390 pbe->next = alloc_image_page(gfp_mask, safe_needed);
388 } 391 }
389 if (!pbe) { /* get_zeroed_page() failed */ 392 if (!pbe) { /* get_zeroed_page() failed */
390 free_pagedir(pblist); 393 free_pagedir(pblist, 1);
391 pblist = NULL; 394 pblist = NULL;
392 } else 395 } else
393 create_pbe_list(pblist, nr_pages); 396 create_pbe_list(pblist, nr_pages);
394 return pblist; 397 return pblist;
395} 398}
396 399
@@ -416,6 +419,10 @@ void swsusp_free(void)
416 } 419 }
417 } 420 }
418 } 421 }
422 nr_copy_pages = 0;
423 nr_meta_pages = 0;
424 pagedir_nosave = NULL;
425 buffer = NULL;
419} 426}
420 427
421 428
@@ -439,7 +446,7 @@ static int enough_free_mem(unsigned int nr_pages)
439 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 446 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
440} 447}
441 448
442int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 449static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
443{ 450{
444 struct pbe *p; 451 struct pbe *p;
445 452
@@ -506,7 +513,318 @@ asmlinkage int swsusp_save(void)
506 */ 513 */
507 514
508 nr_copy_pages = nr_pages; 515 nr_copy_pages = nr_pages;
516 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
509 517
510 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 518 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
511 return 0; 519 return 0;
512} 520}
521
522static void init_header(struct swsusp_info *info)
523{
524 memset(info, 0, sizeof(struct swsusp_info));
525 info->version_code = LINUX_VERSION_CODE;
526 info->num_physpages = num_physpages;
527 memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
528 info->cpus = num_online_cpus();
529 info->image_pages = nr_copy_pages;
530 info->pages = nr_copy_pages + nr_meta_pages + 1;
531 info->size = info->pages;
532 info->size <<= PAGE_SHIFT;
533}
534
535/**
536 * pack_orig_addresses - the .orig_address fields of the PBEs from the
537 * list starting at @pbe are stored in the array @buf[] (1 page)
538 */
539
540static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
541{
542 int j;
543
544 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
545 buf[j] = pbe->orig_address;
546 pbe = pbe->next;
547 }
548 if (!pbe)
549 for (; j < PAGE_SIZE / sizeof(long); j++)
550 buf[j] = 0;
551 return pbe;
552}
553
554/**
555 * snapshot_read_next - used for reading the system memory snapshot.
556 *
557 * On the first call to it @handle should point to a zeroed
558 * snapshot_handle structure. The structure gets updated and a pointer
559 * to it should be passed to this function every next time.
560 *
561 * The @count parameter should contain the number of bytes the caller
562 * wants to read from the snapshot. It must not be zero.
563 *
564 * On success the function returns a positive number. Then, the caller
565 * is allowed to read up to the returned number of bytes from the memory
566 * location computed by the data_of() macro. The number returned
567 * may be smaller than @count, but this only happens if the read would
568 * cross a page boundary otherwise.
569 *
570 * The function returns 0 to indicate the end of data stream condition,
571 * and a negative number is returned on error. In such cases the
572 * structure pointed to by @handle is not updated and should not be used
573 * any more.
574 */
575
576int snapshot_read_next(struct snapshot_handle *handle, size_t count)
577{
578 if (handle->page > nr_meta_pages + nr_copy_pages)
579 return 0;
580 if (!buffer) {
581 /* This makes the buffer be freed by swsusp_free() */
582 buffer = alloc_image_page(GFP_ATOMIC, 0);
583 if (!buffer)
584 return -ENOMEM;
585 }
586 if (!handle->offset) {
587 init_header((struct swsusp_info *)buffer);
588 handle->buffer = buffer;
589 handle->pbe = pagedir_nosave;
590 }
591 if (handle->prev < handle->page) {
592 if (handle->page <= nr_meta_pages) {
593 handle->pbe = pack_orig_addresses(buffer, handle->pbe);
594 if (!handle->pbe)
595 handle->pbe = pagedir_nosave;
596 } else {
597 handle->buffer = (void *)handle->pbe->address;
598 handle->pbe = handle->pbe->next;
599 }
600 handle->prev = handle->page;
601 }
602 handle->buf_offset = handle->page_offset;
603 if (handle->page_offset + count >= PAGE_SIZE) {
604 count = PAGE_SIZE - handle->page_offset;
605 handle->page_offset = 0;
606 handle->page++;
607 } else {
608 handle->page_offset += count;
609 }
610 handle->offset += count;
611 return count;
612}
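
The read side of this handle protocol is a simple loop: call snapshot_read_next() with the chunk size, consume up to the returned number of bytes at data_of(handle), and stop on 0 (end of image) or a negative value (error). A minimal consumer sketch, assuming a caller-supplied writer() callback and the declarations from kernel/power/power.h:

	/* Sketch only: writer() is a hypothetical callback that stores one chunk. */
	static int dump_snapshot(int (*writer)(void *buf, size_t len))
	{
		struct snapshot_handle handle;
		int ret;

		memset(&handle, 0, sizeof(handle));	/* first call needs a zeroed handle */
		do {
			ret = snapshot_read_next(&handle, PAGE_SIZE);
			if (ret > 0) {
				int error = writer(data_of(handle), ret);
				if (error)
					return error;
			}
		} while (ret > 0);
		return ret < 0 ? ret : 0;	/* 0: the whole image has been read */
	}

This is the same pattern save_image() in swap.c follows below, with swap_write_page() as the writer.
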
613
614/**
615 * mark_unsafe_pages - mark the pages that cannot be used for storing
616 * the image during resume, because they conflict with the pages that
617 * had been used before suspend
618 */
619
620static int mark_unsafe_pages(struct pbe *pblist)
621{
622 struct zone *zone;
623 unsigned long zone_pfn;
624 struct pbe *p;
625
626 if (!pblist) /* a sanity check */
627 return -EINVAL;
628
629 /* Clear page flags */
630 for_each_zone (zone) {
631 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
632 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
633 ClearPageNosaveFree(pfn_to_page(zone_pfn +
634 zone->zone_start_pfn));
635 }
636
637 /* Mark orig addresses */
638 for_each_pbe (p, pblist) {
639 if (virt_addr_valid(p->orig_address))
640 SetPageNosaveFree(virt_to_page(p->orig_address));
641 else
642 return -EFAULT;
643 }
644
645 return 0;
646}
647
648static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
649{
650 /* We assume both lists contain the same number of elements */
651 while (src) {
652 dst->orig_address = src->orig_address;
653 dst = dst->next;
654 src = src->next;
655 }
656}
657
658static int check_header(struct swsusp_info *info)
659{
660 char *reason = NULL;
661
662 if (info->version_code != LINUX_VERSION_CODE)
663 reason = "kernel version";
664 if (info->num_physpages != num_physpages)
665 reason = "memory size";
666 if (strcmp(info->uts.sysname,system_utsname.sysname))
667 reason = "system type";
668 if (strcmp(info->uts.release,system_utsname.release))
669 reason = "kernel release";
670 if (strcmp(info->uts.version,system_utsname.version))
671 reason = "version";
672 if (strcmp(info->uts.machine,system_utsname.machine))
673 reason = "machine";
674 if (reason) {
675 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
676 return -EPERM;
677 }
678 return 0;
679}
680
681/**
682 * load header - check the image header and copy data from it
683 */
684
685static int load_header(struct snapshot_handle *handle,
686 struct swsusp_info *info)
687{
688 int error;
689 struct pbe *pblist;
690
691 error = check_header(info);
692 if (!error) {
693 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
694 if (!pblist)
695 return -ENOMEM;
696 pagedir_nosave = pblist;
697 handle->pbe = pblist;
698 nr_copy_pages = info->image_pages;
699 nr_meta_pages = info->pages - info->image_pages - 1;
700 }
701 return error;
702}
703
704/**
705 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
706 * the PBEs in the list starting at @pbe
707 */
708
709static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
710 struct pbe *pbe)
711{
712 int j;
713
714 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
715 pbe->orig_address = buf[j];
716 pbe = pbe->next;
717 }
718 return pbe;
719}
720
721/**
722 * create_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for
726 * the image avoiding these pages
727 */
728
729static int create_image(struct snapshot_handle *handle)
730{
731 int error = 0;
732 struct pbe *p, *pblist;
733
734 p = pagedir_nosave;
735 error = mark_unsafe_pages(p);
736 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
738 if (pblist)
739 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0);
741 if (!pblist)
742 error = -ENOMEM;
743 }
744 if (!error)
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
746 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist;
749 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL;
752 nr_copy_pages = 0;
753 nr_meta_pages = 0;
754 }
755 return error;
756}
757
758/**
759 * snapshot_write_next - used for writing the system memory snapshot.
760 *
761 * On the first call to it @handle should point to a zeroed
762 * snapshot_handle structure. The structure gets updated and a pointer
763 * to it should be passed to this function every next time.
764 *
765 * The @count parameter should contain the number of bytes the caller
766 * wants to write to the image. It must not be zero.
767 *
768 * On success the function returns a positive number. Then, the caller
769 * is allowed to write up to the returned number of bytes to the memory
770 * location computed by the data_of() macro. The number returned
771 * may be smaller than @count, but this only happens if the write would
772 * cross a page boundary otherwise.
773 *
774 * The function returns 0 to indicate the "end of file" condition,
775 * and a negative number is returned on error. In such cases the
776 * structure pointed to by @handle is not updated and should not be used
777 * any more.
778 */
779
780int snapshot_write_next(struct snapshot_handle *handle, size_t count)
781{
782 int error = 0;
783
784 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
785 return 0;
786 if (!buffer) {
787 /* This makes the buffer be freed by swsusp_free() */
788 buffer = alloc_image_page(GFP_ATOMIC, 0);
789 if (!buffer)
790 return -ENOMEM;
791 }
792 if (!handle->offset)
793 handle->buffer = buffer;
794 if (handle->prev < handle->page) {
795 if (!handle->prev) {
796 error = load_header(handle, (struct swsusp_info *)buffer);
797 if (error)
798 return error;
799 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) {
802 error = create_image(handle);
803 if (error)
804 return error;
805 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address;
807 }
808 } else {
809 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address;
811 }
812 handle->prev = handle->page;
813 }
814 handle->buf_offset = handle->page_offset;
815 if (handle->page_offset + count >= PAGE_SIZE) {
816 count = PAGE_SIZE - handle->page_offset;
817 handle->page_offset = 0;
818 handle->page++;
819 } else {
820 handle->page_offset += count;
821 }
822 handle->offset += count;
823 return count;
824}
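
The write side mirrors this: snapshot_write_next() exposes a buffer through data_of(handle), the caller fills it with the next chunk of the image, and once the stream ends snapshot_image_loaded() tells whether a complete image was fed in. A minimal producer sketch, assuming a hypothetical reader() callback that fetches each chunk:

	/* Sketch only: reader() is a hypothetical callback that fills one chunk. */
	static int restore_snapshot(int (*reader)(void *buf, size_t len))
	{
		struct snapshot_handle handle;
		int ret;

		memset(&handle, 0, sizeof(handle));
		do {
			ret = snapshot_write_next(&handle, PAGE_SIZE);
			if (ret > 0) {
				int error = reader(data_of(handle), ret);
				if (error)
					return error;
			}
		} while (ret > 0);
		if (ret < 0)
			return ret;
		return snapshot_image_loaded(&handle) ? 0 : -ENODATA;
	}

load_image() in swap.c below uses the same loop with swap_read_page() as the reader.
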
825
826int snapshot_image_loaded(struct snapshot_handle *handle)
827{
828 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
829 handle->page <= nr_meta_pages + nr_copy_pages);
830}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 0000000000..044b8e0c10
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,545 @@
1/*
2 * linux/kernel/power/swap.c
3 *
4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition.
6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 *
10 * This file is released under the GPLv2.
11 *
12 */
13
14#include <linux/module.h>
15#include <linux/smp_lock.h>
16#include <linux/file.h>
17#include <linux/utsname.h>
18#include <linux/version.h>
19#include <linux/delay.h>
20#include <linux/bitops.h>
21#include <linux/genhd.h>
22#include <linux/device.h>
23#include <linux/buffer_head.h>
24#include <linux/bio.h>
25#include <linux/swap.h>
26#include <linux/swapops.h>
27#include <linux/pm.h>
28
29#include "power.h"
30
31extern char resume_file[];
32
33#define SWSUSP_SIG "S1SUSPEND"
34
35static struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
37 swp_entry_t image;
38 char orig_sig[10];
39 char sig[10];
40} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
41
42/*
43 * Saving part...
44 */
45
46static unsigned short root_swap = 0xffff;
47
48static int mark_swapfiles(swp_entry_t start)
49{
50 int error;
51
52 rw_swap_page_sync(READ,
53 swp_entry(root_swap, 0),
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE,
61 swp_entry(root_swap, 0),
62 virt_to_page((unsigned long)
63 &swsusp_header));
64 } else {
65 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV;
67 }
68 return error;
69}
70
71/**
72 * swsusp_swap_check - check if the resume device is a swap device
73 * and get its index (if so)
74 */
75
76static int swsusp_swap_check(void) /* This is called before saving image */
77{
78 int res = swap_type_of(swsusp_resume_device);
79
80 if (res >= 0) {
81 root_swap = res;
82 return 0;
83 }
84 return res;
85}
86
87/**
88 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to.
91 */
92
93static int write_page(void *buf, unsigned long offset)
94{
95 swp_entry_t entry;
96 int error = -ENOSPC;
97
98 if (offset) {
99 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
101 }
102 return error;
103}
104
105/*
106 * The swap map is a data structure used for keeping track of each page
107 * written to a swap partition. It consists of many swap_map_page
 108 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
109 * These structures are stored on the swap and linked together with the
110 * help of the .next_swap member.
111 *
112 * The swap map is created during suspend. The swap map pages are
113 * allocated and populated one at a time, so we only need one memory
114 * page to set up the entire structure.
115 *
116 * During resume we also only need to use one swap_map_page structure
117 * at a time.
118 */
119
120#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)
121
122struct swap_map_page {
123 unsigned long entries[MAP_PAGE_ENTRIES];
124 unsigned long next_swap;
125};
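
Each swap_map_page fills exactly one page: MAP_PAGE_ENTRIES data-page offsets plus the offset of the next map page. With 4 KB pages and 8-byte longs that is 4096/8 - 1 = 511 entries, so an image of nr_pages data pages needs an extra ceil(nr_pages/511) map pages; a 256 MB image (65536 pages) costs 129 of them. A small userspace sketch of that arithmetic, with the page and word sizes as stated assumptions:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL	/* assumed page size */
	#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1)

	int main(void)
	{
		unsigned long nr_pages = 65536;	/* e.g. a 256 MB image */
		unsigned long map_pages =
			(nr_pages + MAP_PAGE_ENTRIES - 1) / MAP_PAGE_ENTRIES;

		printf("%lu data pages need %lu swap map pages\n",
		       nr_pages, map_pages);
		return 0;
	}
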
126
127/**
128 * The swap_map_handle structure is used for handling swap in
129 * a file-alike way
130 */
131
132struct swap_map_handle {
133 struct swap_map_page *cur;
134 unsigned long cur_swap;
135 struct bitmap_page *bitmap;
136 unsigned int k;
137};
138
139static void release_swap_writer(struct swap_map_handle *handle)
140{
141 if (handle->cur)
142 free_page((unsigned long)handle->cur);
143 handle->cur = NULL;
144 if (handle->bitmap)
145 free_bitmap(handle->bitmap);
146 handle->bitmap = NULL;
147}
148
149static int get_swap_writer(struct swap_map_handle *handle)
150{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
152 if (!handle->cur)
153 return -ENOMEM;
154 handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
155 if (!handle->bitmap) {
156 release_swap_writer(handle);
157 return -ENOMEM;
158 }
159 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
160 if (!handle->cur_swap) {
161 release_swap_writer(handle);
162 return -ENOSPC;
163 }
164 handle->k = 0;
165 return 0;
166}
167
168static int swap_write_page(struct swap_map_handle *handle, void *buf)
169{
170 int error;
171 unsigned long offset;
172
173 if (!handle->cur)
174 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset);
177 if (error)
178 return error;
179 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) {
181 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset)
183 return -ENOSPC;
184 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap);
186 if (error)
187 return error;
188 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset;
190 handle->k = 0;
191 }
192 return 0;
193}
194
195static int flush_swap_writer(struct swap_map_handle *handle)
196{
197 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap);
199 else
200 return -EINVAL;
201}
202
203/**
204 * save_image - save the suspend image data
205 */
206
207static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot,
209 unsigned int nr_pages)
210{
211 unsigned int m;
212 int ret;
213 int error = 0;
214
215 printk("Saving image data pages (%u pages) ... ", nr_pages);
216 m = nr_pages / 100;
217 if (!m)
218 m = 1;
219 nr_pages = 0;
220 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot));
224 if (error)
225 break;
226 if (!(nr_pages % m))
227 printk("\b\b\b\b%3d%%", nr_pages / m);
228 nr_pages++;
229 }
230 } while (ret > 0);
231 if (!error)
232 printk("\b\b\b\bdone\n");
233 return error;
234}
235
236/**
237 * enough_swap - Make sure we have enough swap to save the image.
238 *
239 * Returns TRUE or FALSE after checking the total amount of swap
 240 * space available from the resume partition.
241 */
242
243static int enough_swap(unsigned int nr_pages)
244{
245 unsigned int free_swap = count_swap_pages(root_swap, 1);
246
247 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO +
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250}
251
252/**
253 * swsusp_write - Write entire image and metadata.
254 *
255 * It is important _NOT_ to umount filesystems at this point. We want
256 * them synced (in case something goes wrong) but we DO not want to mark
257 * filesystem clean: it is not. (And it does not matter, if we resume
258 * correctly, we'll mark system clean, anyway.)
259 */
260
261int swsusp_write(void)
262{
263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot;
265 struct swsusp_info *header;
266 unsigned long start;
267 int error;
268
269 if ((error = swsusp_swap_check())) {
270 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
271 return error;
272 }
273 memset(&snapshot, 0, sizeof(struct snapshot_handle));
274 error = snapshot_read_next(&snapshot, PAGE_SIZE);
275 if (error < PAGE_SIZE)
276 return error < 0 ? error : -EFAULT;
277 header = (struct swsusp_info *)data_of(snapshot);
278 if (!enough_swap(header->pages)) {
279 printk(KERN_ERR "swsusp: Not enough free swap\n");
280 return -ENOSPC;
281 }
282 error = get_swap_writer(&handle);
283 if (!error) {
284 start = handle.cur_swap;
285 error = swap_write_page(&handle, header);
286 }
287 if (!error)
288 error = save_image(&handle, &snapshot, header->pages - 1);
289 if (!error) {
290 flush_swap_writer(&handle);
291 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n");
294 }
295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap);
297 release_swap_writer(&handle);
298 return error;
299}
300
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
 304 * but it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 panic("I/O error reading memory image");
316 atomic_set(&io_done, 0);
317 return 0;
318}
319
320static struct block_device *resume_bdev;
321
322/**
323 * submit - submit BIO request.
324 * @rw: READ or WRITE.
325 * @off physical offset of page.
326 * @page: page we're reading or writing.
327 *
328 * Straight from the textbook - allocate and initialize the bio.
329 * If we're writing, make sure the page is marked as dirty.
330 * Then submit it and wait.
331 */
332
333static int submit(int rw, pgoff_t page_off, void *page)
334{
335 int error = 0;
336 struct bio *bio;
337
338 bio = bio_alloc(GFP_ATOMIC, 1);
339 if (!bio)
340 return -ENOMEM;
341 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
342 bio->bi_bdev = resume_bdev;
343 bio->bi_end_io = end_io;
344
345 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
346 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
347 error = -EFAULT;
348 goto Done;
349 }
350
351 atomic_set(&io_done, 1);
352 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
353 while (atomic_read(&io_done))
354 yield();
355 if (rw == READ)
356 bio_set_pages_dirty(bio);
357 Done:
358 bio_put(bio);
359 return error;
360}
361
362static int bio_read_page(pgoff_t page_off, void *page)
363{
364 return submit(READ, page_off, page);
365}
366
367static int bio_write_page(pgoff_t page_off, void *page)
368{
369 return submit(WRITE, page_off, page);
370}
371
372/**
373 * The following functions allow us to read data using a swap map
374 * in a file-alike way
375 */
376
377static void release_swap_reader(struct swap_map_handle *handle)
378{
379 if (handle->cur)
380 free_page((unsigned long)handle->cur);
381 handle->cur = NULL;
382}
383
384static int get_swap_reader(struct swap_map_handle *handle,
385 swp_entry_t start)
386{
387 int error;
388
389 if (!swp_offset(start))
390 return -EINVAL;
391 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
392 if (!handle->cur)
393 return -ENOMEM;
394 error = bio_read_page(swp_offset(start), handle->cur);
395 if (error) {
396 release_swap_reader(handle);
397 return error;
398 }
399 handle->k = 0;
400 return 0;
401}
402
403static int swap_read_page(struct swap_map_handle *handle, void *buf)
404{
405 unsigned long offset;
406 int error;
407
408 if (!handle->cur)
409 return -EINVAL;
410 offset = handle->cur->entries[handle->k];
411 if (!offset)
412 return -EFAULT;
413 error = bio_read_page(offset, buf);
414 if (error)
415 return error;
416 if (++handle->k >= MAP_PAGE_ENTRIES) {
417 handle->k = 0;
418 offset = handle->cur->next_swap;
419 if (!offset)
420 release_swap_reader(handle);
421 else
422 error = bio_read_page(offset, handle->cur);
423 }
424 return error;
425}
426
427/**
428 * load_image - load the image using the swap map handle
429 * @handle and the snapshot handle @snapshot
430 * (assume there are @nr_pages pages to load)
431 */
432
433static int load_image(struct swap_map_handle *handle,
434 struct snapshot_handle *snapshot,
435 unsigned int nr_pages)
436{
437 unsigned int m;
438 int ret;
439 int error = 0;
440
441 printk("Loading image data pages (%u pages) ... ", nr_pages);
442 m = nr_pages / 100;
443 if (!m)
444 m = 1;
445 nr_pages = 0;
446 do {
447 ret = snapshot_write_next(snapshot, PAGE_SIZE);
448 if (ret > 0) {
449 error = swap_read_page(handle, data_of(*snapshot));
450 if (error)
451 break;
452 if (!(nr_pages % m))
453 printk("\b\b\b\b%3d%%", nr_pages / m);
454 nr_pages++;
455 }
456 } while (ret > 0);
457 if (!error) {
458 printk("\b\b\b\bdone\n");
459 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA;
461 }
462 return error;
463}
464
465int swsusp_read(void)
466{
467 int error;
468 struct swap_map_handle handle;
469 struct snapshot_handle snapshot;
470 struct swsusp_info *header;
471
472 if (IS_ERR(resume_bdev)) {
473 pr_debug("swsusp: block device not initialised\n");
474 return PTR_ERR(resume_bdev);
475 }
476
477 memset(&snapshot, 0, sizeof(struct snapshot_handle));
478 error = snapshot_write_next(&snapshot, PAGE_SIZE);
479 if (error < PAGE_SIZE)
480 return error < 0 ? error : -EFAULT;
481 header = (struct swsusp_info *)data_of(snapshot);
482 error = get_swap_reader(&handle, swsusp_header.image);
483 if (!error)
484 error = swap_read_page(&handle, header);
485 if (!error)
486 error = load_image(&handle, &snapshot, header->pages - 1);
487 release_swap_reader(&handle);
488
489 blkdev_put(resume_bdev);
490
491 if (!error)
492 pr_debug("swsusp: Reading resume file was successful\n");
493 else
494 pr_debug("swsusp: Error %d resuming\n", error);
495 return error;
496}
497
498/**
499 * swsusp_check - Check for swsusp signature in the resume device
500 */
501
502int swsusp_check(void)
503{
504 int error;
505
506 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
507 if (!IS_ERR(resume_bdev)) {
508 set_blocksize(resume_bdev, PAGE_SIZE);
509 memset(&swsusp_header, 0, sizeof(swsusp_header));
510 if ((error = bio_read_page(0, &swsusp_header)))
511 return error;
512 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
513 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
514 /* Reset swap signature now */
515 error = bio_write_page(0, &swsusp_header);
516 } else {
517 return -EINVAL;
518 }
519 if (error)
520 blkdev_put(resume_bdev);
521 else
522 pr_debug("swsusp: Signature found, resuming\n");
523 } else {
524 error = PTR_ERR(resume_bdev);
525 }
526
527 if (error)
528 pr_debug("swsusp: Error %d check for resume file\n", error);
529
530 return error;
531}
532
533/**
534 * swsusp_close - close swap device.
535 */
536
537void swsusp_close(void)
538{
539 if (IS_ERR(resume_bdev)) {
540 pr_debug("swsusp: block device not initialised\n");
541 return;
542 }
543
544 blkdev_put(resume_bdev);
545}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e..c4016cbbd3 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Rafael J. Wysocki <rjw@sisk.pl> 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added the swap map data structure and reworked the handling of swap 34 * Reworked the freeing of memory and the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */ 39 */
40 40
41#include <linux/module.h>
42#include <linux/mm.h> 41#include <linux/mm.h>
43#include <linux/suspend.h> 42#include <linux/suspend.h>
44#include <linux/smp_lock.h>
45#include <linux/file.h>
46#include <linux/utsname.h>
47#include <linux/version.h>
48#include <linux/delay.h>
49#include <linux/bitops.h>
50#include <linux/spinlock.h> 43#include <linux/spinlock.h>
51#include <linux/genhd.h>
52#include <linux/kernel.h> 44#include <linux/kernel.h>
53#include <linux/major.h> 45#include <linux/major.h>
54#include <linux/swap.h> 46#include <linux/swap.h>
55#include <linux/pm.h> 47#include <linux/pm.h>
56#include <linux/device.h>
57#include <linux/buffer_head.h>
58#include <linux/swapops.h> 48#include <linux/swapops.h>
59#include <linux/bootmem.h> 49#include <linux/bootmem.h>
60#include <linux/syscalls.h> 50#include <linux/syscalls.h>
61#include <linux/highmem.h> 51#include <linux/highmem.h>
62#include <linux/bio.h>
63
64#include <asm/uaccess.h>
65#include <asm/mmu_context.h>
66#include <asm/pgtable.h>
67#include <asm/tlbflush.h>
68#include <asm/io.h>
69 52
70#include "power.h" 53#include "power.h"
71 54
@@ -77,6 +60,8 @@
77 */ 60 */
78unsigned long image_size = 500 * 1024 * 1024; 61unsigned long image_size = 500 * 1024 * 1024;
79 62
63int in_suspend __nosavedata = 0;
64
80#ifdef CONFIG_HIGHMEM 65#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void); 66unsigned int count_highmem_pages(void);
82int save_highmem(void); 67int save_highmem(void);
@@ -87,473 +72,97 @@ static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; } 72static unsigned int count_highmem_pages(void) { return 0; }
88#endif 73#endif
89 74
90extern char resume_file[];
91
92#define SWSUSP_SIG "S1SUSPEND"
93
94static struct swsusp_header {
95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
96 swp_entry_t image;
97 char orig_sig[10];
98 char sig[10];
99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
100
101static struct swsusp_info swsusp_info;
102
103/*
104 * Saving part...
105 */
106
107static unsigned short root_swap = 0xffff;
108
109static int mark_swapfiles(swp_entry_t start)
110{
111 int error;
112
113 rw_swap_page_sync(READ,
114 swp_entry(root_swap, 0),
115 virt_to_page((unsigned long)&swsusp_header));
116 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
120 swsusp_header.image = start;
121 error = rw_swap_page_sync(WRITE,
122 swp_entry(root_swap, 0),
123 virt_to_page((unsigned long)
124 &swsusp_header));
125 } else {
126 pr_debug("swsusp: Partition is not swap space.\n");
127 error = -ENODEV;
128 }
129 return error;
130}
131
132/*
133 * Check whether the swap device is the specified resume
134 * device, irrespective of whether they are specified by
135 * identical names.
136 *
137 * (Thus, device inode aliasing is allowed. You can say /dev/hda4
138 * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
139 * and they'll be considered the same device. This is *necessary* for
140 * devfs, since the resume code can only recognize the form /dev/hda4,
141 * but the suspend code would see the long name.)
142 */
143static inline int is_resume_device(const struct swap_info_struct *swap_info)
144{
145 struct file *file = swap_info->swap_file;
146 struct inode *inode = file->f_dentry->d_inode;
147
148 return S_ISBLK(inode->i_mode) &&
149 swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
150}
151
152static int swsusp_swap_check(void) /* This is called before saving image */
153{
154 int i;
155
156 if (!swsusp_resume_device)
157 return -ENODEV;
158 spin_lock(&swap_lock);
159 for (i = 0; i < MAX_SWAPFILES; i++) {
160 if (!(swap_info[i].flags & SWP_WRITEOK))
161 continue;
162 if (is_resume_device(swap_info + i)) {
163 spin_unlock(&swap_lock);
164 root_swap = i;
165 return 0;
166 }
167 }
168 spin_unlock(&swap_lock);
169 return -ENODEV;
170}
171
172/**
173 * write_page - Write one page to a fresh swap location.
174 * @addr: Address we're writing.
175 * @loc: Place to store the entry we used.
176 *
177 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
178 * errors. That is an artifact left over from swsusp. It did not
179 * check the return of rw_swap_page_sync() at all, since most pages
180 * written back to swap would return -EIO.
181 * This is a partial improvement, since we will at least return other
182 * errors, though we need to eventually fix the damn code.
183 */
184static int write_page(unsigned long addr, swp_entry_t *loc)
185{
186 swp_entry_t entry;
187 int error = -ENOSPC;
188
189 entry = get_swap_page_of_type(root_swap);
190 if (swp_offset(entry)) {
191 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
192 if (!error || error == -EIO)
193 *loc = entry;
194 }
195 return error;
196}
197
198/** 75/**
199 * Swap map-handling functions 76 * The following functions are used for tracing the allocated
200 * 77 * swap pages, so that they can be freed in case of an error.
201 * The swap map is a data structure used for keeping track of each page
202 * written to the swap. It consists of many swap_map_page structures
203 * that contain each an array of MAP_PAGE_SIZE swap entries.
204 * These structures are linked together with the help of either the
205 * .next (in memory) or the .next_swap (in swap) member.
206 * 78 *
207 * The swap map is created during suspend. At that time we need to keep 79 * The functions operate on a linked bitmap structure defined
208 * it in memory, because we have to free all of the allocated swap 80 * in power.h
209 * entries if an error occurs. The memory needed is preallocated
210 * so that we know in advance if there's enough of it.
211 *
212 * The first swap_map_page structure is filled with the swap entries that
213 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
214 * so on. After the all of the data pages have been written, the order
215 * of the swap_map_page structures in the map is reversed so that they
216 * can be read from swap in the original order. This causes the data
217 * pages to be loaded in exactly the same order in which they have been
218 * saved.
219 *
220 * During resume we only need to use one swap_map_page structure
221 * at a time, which means that we only need to use two memory pages for
222 * reading the image - one for reading the swap_map_page structures
223 * and the second for reading the data pages from swap.
224 */ 81 */
225 82
226#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ 83void free_bitmap(struct bitmap_page *bitmap)
227 / sizeof(swp_entry_t))
228
229struct swap_map_page {
230 swp_entry_t entries[MAP_PAGE_SIZE];
231 swp_entry_t next_swap;
232 struct swap_map_page *next;
233};
234
235static inline void free_swap_map(struct swap_map_page *swap_map)
236{ 84{
237 struct swap_map_page *swp; 85 struct bitmap_page *bp;
238 86
239 while (swap_map) { 87 while (bitmap) {
240 swp = swap_map->next; 88 bp = bitmap->next;
241 free_page((unsigned long)swap_map); 89 free_page((unsigned long)bitmap);
242 swap_map = swp; 90 bitmap = bp;
243 } 91 }
244} 92}
245 93
246static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) 94struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
247{ 95{
248 struct swap_map_page *swap_map, *swp; 96 struct bitmap_page *bitmap, *bp;
249 unsigned n = 0; 97 unsigned int n;
250 98
251 if (!nr_pages) 99 if (!nr_bits)
252 return NULL; 100 return NULL;
253 101
254 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); 102 bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
255 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 103 bp = bitmap;
256 swp = swap_map; 104 for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
257 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { 105 bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
258 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 106 bp = bp->next;
259 swp = swp->next; 107 if (!bp) {
260 if (!swp) { 108 free_bitmap(bitmap);
261 free_swap_map(swap_map);
262 return NULL; 109 return NULL;
263 } 110 }
264 } 111 }
265 return swap_map; 112 return bitmap;
266} 113}
267 114
268/** 115static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
269 * reverse_swap_map - reverse the order of pages in the swap map
270 * @swap_map
271 */
272
273static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
274{
275 struct swap_map_page *prev, *next;
276
277 prev = NULL;
278 while (swap_map) {
279 next = swap_map->next;
280 swap_map->next = prev;
281 prev = swap_map;
282 swap_map = next;
283 }
284 return prev;
285}
286
287/**
288 * free_swap_map_entries - free the swap entries allocated to store
289 * the swap map @swap_map (this is only called in case of an error)
290 */
291static inline void free_swap_map_entries(struct swap_map_page *swap_map)
292{
293 while (swap_map) {
294 if (swap_map->next_swap.val)
295 swap_free(swap_map->next_swap);
296 swap_map = swap_map->next;
297 }
298}
299
300/**
301 * save_swap_map - save the swap map used for tracing the data pages
302 * stored in the swap
303 */
304
305static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
306{
307 swp_entry_t entry = (swp_entry_t){0};
308 int error;
309
310 while (swap_map) {
311 swap_map->next_swap = entry;
312 if ((error = write_page((unsigned long)swap_map, &entry)))
313 return error;
314 swap_map = swap_map->next;
315 }
316 *start = entry;
317 return 0;
318}
319
320/**
321 * free_image_entries - free the swap entries allocated to store
322 * the image data pages (this is only called in case of an error)
323 */
324
325static inline void free_image_entries(struct swap_map_page *swp)
326{ 116{
327 unsigned k; 117 unsigned int n;
328 118
329 while (swp) { 119 n = BITMAP_PAGE_BITS;
330 for (k = 0; k < MAP_PAGE_SIZE; k++) 120 while (bitmap && n <= bit) {
331 if (swp->entries[k].val) 121 n += BITMAP_PAGE_BITS;
332 swap_free(swp->entries[k]); 122 bitmap = bitmap->next;
333 swp = swp->next;
334 } 123 }
335} 124 if (!bitmap)
336 125 return -EINVAL;
337/** 126 n -= BITMAP_PAGE_BITS;
338 * The swap_map_handle structure is used for handling the swap map in 127 bit -= n;
339 * a file-alike way 128 n = 0;
340 */ 129 while (bit >= BITS_PER_CHUNK) {
341 130 bit -= BITS_PER_CHUNK;
342struct swap_map_handle { 131 n++;
343 struct swap_map_page *cur;
344 unsigned int k;
345};
346
347static inline void init_swap_map_handle(struct swap_map_handle *handle,
348 struct swap_map_page *map)
349{
350 handle->cur = map;
351 handle->k = 0;
352}
353
354static inline int swap_map_write_page(struct swap_map_handle *handle,
355 unsigned long addr)
356{
357 int error;
358
359 error = write_page(addr, handle->cur->entries + handle->k);
360 if (error)
361 return error;
362 if (++handle->k >= MAP_PAGE_SIZE) {
363 handle->cur = handle->cur->next;
364 handle->k = 0;
365 } 132 }
133 bitmap->chunks[n] |= (1UL << bit);
366 return 0; 134 return 0;
367} 135}
368 136
369/** 137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
370 * save_image_data - save the data pages pointed to by the PBEs
371 * from the list @pblist using the swap map handle @handle
372 * (assume there are @nr_pages data pages to save)
373 */
374
375static int save_image_data(struct pbe *pblist,
376 struct swap_map_handle *handle,
377 unsigned int nr_pages)
378{
379 unsigned int m;
380 struct pbe *p;
381 int error = 0;
382
383 printk("Saving image data pages (%u pages) ... ", nr_pages);
384 m = nr_pages / 100;
385 if (!m)
386 m = 1;
387 nr_pages = 0;
388 for_each_pbe (p, pblist) {
389 error = swap_map_write_page(handle, p->address);
390 if (error)
391 break;
392 if (!(nr_pages % m))
393 printk("\b\b\b\b%3d%%", nr_pages / m);
394 nr_pages++;
395 }
396 if (!error)
397 printk("\b\b\b\bdone\n");
398 return error;
399}
400
401static void dump_info(void)
402{
403 pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
404 pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
405 pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
406 pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
407 pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
408 pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
409 pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
410 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
411 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
412 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
413 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
414}
415
416static void init_header(unsigned int nr_pages)
417{
418 memset(&swsusp_info, 0, sizeof(swsusp_info));
419 swsusp_info.version_code = LINUX_VERSION_CODE;
420 swsusp_info.num_physpages = num_physpages;
421 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
422
423 swsusp_info.cpus = num_online_cpus();
424 swsusp_info.image_pages = nr_pages;
425 swsusp_info.pages = nr_pages +
426 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
427}
428
429/**
430 * pack_orig_addresses - the .orig_address fields of the PBEs from the
431 * list starting at @pbe are stored in the array @buf[] (1 page)
432 */
433
434static inline struct pbe *pack_orig_addresses(unsigned long *buf,
435 struct pbe *pbe)
436{
437 int j;
438
439 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
440 buf[j] = pbe->orig_address;
441 pbe = pbe->next;
442 }
443 if (!pbe)
444 for (; j < PAGE_SIZE / sizeof(long); j++)
445 buf[j] = 0;
446 return pbe;
447}
448
449/**
450 * save_image_metadata - save the .orig_address fields of the PBEs
451 * from the list @pblist using the swap map handle @handle
452 */
453
454static int save_image_metadata(struct pbe *pblist,
455 struct swap_map_handle *handle)
456{ 138{
457 unsigned long *buf; 139 unsigned long offset;
458 unsigned int n = 0;
459 struct pbe *p;
460 int error = 0;
461 140
462 printk("Saving image metadata ... "); 141 offset = swp_offset(get_swap_page_of_type(swap));
463 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); 142 if (offset) {
464 if (!buf) 143 if (bitmap_set(bitmap, offset)) {
465 return -ENOMEM; 144 swap_free(swp_entry(swap, offset));
466 p = pblist; 145 offset = 0;
467 while (p) { 146 }
468 p = pack_orig_addresses(buf, p);
469 error = swap_map_write_page(handle, (unsigned long)buf);
470 if (error)
471 break;
472 n++;
473 } 147 }
474 free_page((unsigned long)buf); 148 return offset;
475 if (!error)
476 printk("done (%u pages saved)\n", n);
477 return error;
478} 149}
479 150
480/** 151void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
481 * enough_swap - Make sure we have enough swap to save the image.
482 *
483 * Returns TRUE or FALSE after checking the total amount of swap
484 * space avaiable from the resume partition.
485 */
486
487static int enough_swap(unsigned int nr_pages)
488{ 152{
489 unsigned int free_swap = swap_info[root_swap].pages - 153 unsigned int bit, n;
490 swap_info[root_swap].inuse_pages; 154 unsigned long test;
491
492 pr_debug("swsusp: free swap pages: %u\n", free_swap);
493 return free_swap > (nr_pages + PAGES_FOR_IO +
494 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
495}
496 155
497/** 156 bit = 0;
498 * swsusp_write - Write entire image and metadata. 157 while (bitmap) {
499 * 158 for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
500 * It is important _NOT_ to umount filesystems at this point. We want 159 for (test = 1UL; test; test <<= 1) {
501 * them synced (in case something goes wrong) but we DO not want to mark 160 if (bitmap->chunks[n] & test)
502 * filesystem clean: it is not. (And it does not matter, if we resume 161 swap_free(swp_entry(swap, bit));
503 * correctly, we'll mark system clean, anyway.) 162 bit++;
504 */ 163 }
505 164 bitmap = bitmap->next;
506int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
507{
508 struct swap_map_page *swap_map;
509 struct swap_map_handle handle;
510 swp_entry_t start;
511 int error;
512
513 if ((error = swsusp_swap_check())) {
514 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
515 return error;
516 }
517 if (!enough_swap(nr_pages)) {
518 printk(KERN_ERR "swsusp: Not enough free swap\n");
519 return -ENOSPC;
520 } 165 }
521
522 init_header(nr_pages);
523 swap_map = alloc_swap_map(swsusp_info.pages);
524 if (!swap_map)
525 return -ENOMEM;
526 init_swap_map_handle(&handle, swap_map);
527
528 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
529 if (!error)
530 error = save_image_metadata(pblist, &handle);
531 if (!error)
532 error = save_image_data(pblist, &handle, nr_pages);
533 if (error)
534 goto Free_image_entries;
535
536 swap_map = reverse_swap_map(swap_map);
537 error = save_swap_map(swap_map, &start);
538 if (error)
539 goto Free_map_entries;
540
541 dump_info();
542 printk( "S" );
543 error = mark_swapfiles(start);
544 printk( "|\n" );
545 if (error)
546 goto Free_map_entries;
547
548Free_swap_map:
549 free_swap_map(swap_map);
550 return error;
551
552Free_map_entries:
553 free_swap_map_entries(swap_map);
554Free_image_entries:
555 free_image_entries(swap_map);
556 goto Free_swap_map;
557} 166}
558 167
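
The calling pattern for this bitmap API, as used by swap.c and user.c elsewhere in this patch, is: size the bitmap from count_swap_pages(), take offsets with alloc_swap_page(), and on failure give everything back with free_all_swap_pages() followed by free_bitmap(). A minimal sketch under those assumptions:

	/* Sketch only: reserve n swap pages of type swap, tracked in *bitmap_p. */
	static int reserve_image_swap(int swap, unsigned int n,
	                              struct bitmap_page **bitmap_p)
	{
		struct bitmap_page *bitmap;
		unsigned int i;

		bitmap = alloc_bitmap(count_swap_pages(swap, 0));
		if (!bitmap)
			return -ENOMEM;
		for (i = 0; i < n; i++) {
			if (!alloc_swap_page(swap, bitmap)) {
				/* out of swap: free every page tracked so far */
				free_all_swap_pages(swap, bitmap);
				free_bitmap(bitmap);
				return -ENOSPC;
			}
		}
		*bitmap_p = bitmap;	/* caller keeps it for later cleanup */
		return 0;
	}
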
559/** 168/**
@@ -662,379 +271,3 @@ int swsusp_resume(void)
662 local_irq_enable(); 271 local_irq_enable();
663 return error; 272 return error;
664} 273}
665
666/**
667 * mark_unsafe_pages - mark the pages that cannot be used for storing
668 * the image during resume, because they conflict with the pages that
669 * had been used before suspend
670 */
671
672static void mark_unsafe_pages(struct pbe *pblist)
673{
674 struct zone *zone;
675 unsigned long zone_pfn;
676 struct pbe *p;
677
678 if (!pblist) /* a sanity check */
679 return;
680
681 /* Clear page flags */
682 for_each_zone (zone) {
683 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
684 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
685 ClearPageNosaveFree(pfn_to_page(zone_pfn +
686 zone->zone_start_pfn));
687 }
688
689 /* Mark orig addresses */
690 for_each_pbe (p, pblist)
691 SetPageNosaveFree(virt_to_page(p->orig_address));
692
693}
694
695static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
696{
697 /* We assume both lists contain the same number of elements */
698 while (src) {
699 dst->orig_address = src->orig_address;
700 dst = dst->next;
701 src = src->next;
702 }
703}
704
705/*
706 * Using bio to read from swap.
707 * This code requires a bit more work than just using buffer heads
708 * but, it is the recommended way for 2.5/2.6.
709 * The following are to signal the beginning and end of I/O. Bios
710 * finish asynchronously, while we want them to happen synchronously.
711 * A simple atomic_t, and a wait loop take care of this problem.
712 */
713
714static atomic_t io_done = ATOMIC_INIT(0);
715
716static int end_io(struct bio *bio, unsigned int num, int err)
717{
718 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
719 panic("I/O error reading memory image");
720 atomic_set(&io_done, 0);
721 return 0;
722}
723
724static struct block_device *resume_bdev;
725
726/**
727 * submit - submit BIO request.
728 * @rw: READ or WRITE.
729 * @off physical offset of page.
730 * @page: page we're reading or writing.
731 *
732 * Straight from the textbook - allocate and initialize the bio.
733 * If we're writing, make sure the page is marked as dirty.
734 * Then submit it and wait.
735 */
736
737static int submit(int rw, pgoff_t page_off, void *page)
738{
739 int error = 0;
740 struct bio *bio;
741
742 bio = bio_alloc(GFP_ATOMIC, 1);
743 if (!bio)
744 return -ENOMEM;
745 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
746 bio->bi_bdev = resume_bdev;
747 bio->bi_end_io = end_io;
748
749 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
750 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
751 error = -EFAULT;
752 goto Done;
753 }
754
755
756 atomic_set(&io_done, 1);
757 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
758 while (atomic_read(&io_done))
759 yield();
760 if (rw == READ)
761 bio_set_pages_dirty(bio);
762 Done:
763 bio_put(bio);
764 return error;
765}
766
767static int bio_read_page(pgoff_t page_off, void *page)
768{
769 return submit(READ, page_off, page);
770}
771
772static int bio_write_page(pgoff_t page_off, void *page)
773{
774 return submit(WRITE, page_off, page);
775}
776
777/**
778 * The following functions allow us to read data using a swap map
779 * in a file-alike way
780 */
781
782static inline void release_swap_map_reader(struct swap_map_handle *handle)
783{
784 if (handle->cur)
785 free_page((unsigned long)handle->cur);
786 handle->cur = NULL;
787}
788
789static inline int get_swap_map_reader(struct swap_map_handle *handle,
790 swp_entry_t start)
791{
792 int error;
793
794 if (!swp_offset(start))
795 return -EINVAL;
796 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
797 if (!handle->cur)
798 return -ENOMEM;
799 error = bio_read_page(swp_offset(start), handle->cur);
800 if (error) {
801 release_swap_map_reader(handle);
802 return error;
803 }
804 handle->k = 0;
805 return 0;
806}
807
808static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
809{
810 unsigned long offset;
811 int error;
812
813 if (!handle->cur)
814 return -EINVAL;
815 offset = swp_offset(handle->cur->entries[handle->k]);
816 if (!offset)
817 return -EINVAL;
818 error = bio_read_page(offset, buf);
819 if (error)
820 return error;
821 if (++handle->k >= MAP_PAGE_SIZE) {
822 handle->k = 0;
823 offset = swp_offset(handle->cur->next_swap);
824 if (!offset)
825 release_swap_map_reader(handle);
826 else
827 error = bio_read_page(offset, handle->cur);
828 }
829 return error;
830}
831
832static int check_header(void)
833{
834 char *reason = NULL;
835
836 dump_info();
837 if (swsusp_info.version_code != LINUX_VERSION_CODE)
838 reason = "kernel version";
839 if (swsusp_info.num_physpages != num_physpages)
840 reason = "memory size";
841 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
842 reason = "system type";
843 if (strcmp(swsusp_info.uts.release,system_utsname.release))
844 reason = "kernel release";
845 if (strcmp(swsusp_info.uts.version,system_utsname.version))
846 reason = "version";
847 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
848 reason = "machine";
849 if (reason) {
850 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
851 return -EPERM;
852 }
853 return 0;
854}
855
856/**
857 * load_image_data - load the image data using the swap map handle
858 * @handle and store them using the page backup list @pblist
859 * (assume there are @nr_pages pages to load)
860 */
861
862static int load_image_data(struct pbe *pblist,
863 struct swap_map_handle *handle,
864 unsigned int nr_pages)
865{
866 int error;
867 unsigned int m;
868 struct pbe *p;
869
870 if (!pblist)
871 return -EINVAL;
872 printk("Loading image data pages (%u pages) ... ", nr_pages);
873 m = nr_pages / 100;
874 if (!m)
875 m = 1;
876 nr_pages = 0;
877 p = pblist;
878 while (p) {
879 error = swap_map_read_page(handle, (void *)p->address);
880 if (error)
881 break;
882 p = p->next;
883 if (!(nr_pages % m))
884 printk("\b\b\b\b%3d%%", nr_pages / m);
885 nr_pages++;
886 }
887 if (!error)
888 printk("\b\b\b\bdone\n");
889 return error;
890}
891
892/**
893 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
894 * the PBEs in the list starting at @pbe
895 */
896
897static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
898 struct pbe *pbe)
899{
900 int j;
901
902 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
903 pbe->orig_address = buf[j];
904 pbe = pbe->next;
905 }
906 return pbe;
907}
908
909/**
910 * load_image_metadata - load the image metadata using the swap map
911 * handle @handle and put them into the PBEs in the list @pblist
912 */
913
914static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
915{
916 struct pbe *p;
917 unsigned long *buf;
918 unsigned int n = 0;
919 int error = 0;
920
921 printk("Loading image metadata ... ");
922 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
923 if (!buf)
924 return -ENOMEM;
925 p = pblist;
926 while (p) {
927 error = swap_map_read_page(handle, buf);
928 if (error)
929 break;
930 p = unpack_orig_addresses(buf, p);
931 n++;
932 }
933 free_page((unsigned long)buf);
934 if (!error)
935 printk("done (%u pages loaded)\n", n);
936 return error;
937}
938
939int swsusp_read(struct pbe **pblist_ptr)
940{
941 int error;
942 struct pbe *p, *pblist;
943 struct swap_map_handle handle;
944 unsigned int nr_pages;
945
946 if (IS_ERR(resume_bdev)) {
947 pr_debug("swsusp: block device not initialised\n");
948 return PTR_ERR(resume_bdev);
949 }
950
951 error = get_swap_map_reader(&handle, swsusp_header.image);
952 if (!error)
953 error = swap_map_read_page(&handle, &swsusp_info);
954 if (!error)
955 error = check_header();
956 if (error)
957 return error;
958 nr_pages = swsusp_info.image_pages;
959 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
960 if (!p)
961 return -ENOMEM;
962 error = load_image_metadata(p, &handle);
963 if (!error) {
964 mark_unsafe_pages(p);
965 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
966 if (pblist)
967 copy_page_backup_list(pblist, p);
968 free_pagedir(p);
969 if (!pblist)
970 error = -ENOMEM;
971
972 /* Allocate memory for the image and read the data from swap */
973 if (!error)
974 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
975 if (!error) {
976 release_eaten_pages();
977 error = load_image_data(pblist, &handle, nr_pages);
978 }
979 if (!error)
980 *pblist_ptr = pblist;
981 }
982 release_swap_map_reader(&handle);
983
984 blkdev_put(resume_bdev);
985
986 if (!error)
987 pr_debug("swsusp: Reading resume file was successful\n");
988 else
989 pr_debug("swsusp: Error %d resuming\n", error);
990 return error;
991}
992
993/**
994 * swsusp_check - Check for swsusp signature in the resume device
995 */
996
997int swsusp_check(void)
998{
999 int error;
1000
1001 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1002 if (!IS_ERR(resume_bdev)) {
1003 set_blocksize(resume_bdev, PAGE_SIZE);
1004 memset(&swsusp_header, 0, sizeof(swsusp_header));
1005 if ((error = bio_read_page(0, &swsusp_header)))
1006 return error;
1007 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1008 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1009 /* Reset swap signature now */
1010 error = bio_write_page(0, &swsusp_header);
1011 } else {
1012 return -EINVAL;
1013 }
1014 if (error)
1015 blkdev_put(resume_bdev);
1016 else
1017 pr_debug("swsusp: Signature found, resuming\n");
1018 } else {
1019 error = PTR_ERR(resume_bdev);
1020 }
1021
1022 if (error)
1023 pr_debug("swsusp: Error %d check for resume file\n", error);
1024
1025 return error;
1026}
1027
1028/**
1029 * swsusp_close - close swap device.
1030 */
1031
1032void swsusp_close(void)
1033{
1034 if (IS_ERR(resume_bdev)) {
1035 pr_debug("swsusp: block device not initialised\n");
1036 return;
1037 }
1038
1039 blkdev_put(resume_bdev);
1040}
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 0000000000..3f1539fbe4
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,333 @@
1/*
2 * linux/kernel/power/user.c
3 *
4 * This file provides the user space interface for software suspend/resume.
5 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */
11
12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/string.h>
15#include <linux/device.h>
16#include <linux/miscdevice.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/swapops.h>
20#include <linux/pm.h>
21#include <linux/fs.h>
22
23#include <asm/uaccess.h>
24
25#include "power.h"
26
27#define SNAPSHOT_MINOR 231
28
29static struct snapshot_data {
30 struct snapshot_handle handle;
31 int swap;
32 struct bitmap_page *bitmap;
33 int mode;
34 char frozen;
35 char ready;
36} snapshot_state;
37
38static atomic_t device_available = ATOMIC_INIT(1);
39
40static int snapshot_open(struct inode *inode, struct file *filp)
41{
42 struct snapshot_data *data;
43
44 if (!atomic_add_unless(&device_available, -1, 0))
45 return -EBUSY;
46
47 if ((filp->f_flags & O_ACCMODE) == O_RDWR)
48 return -ENOSYS;
49
50 nonseekable_open(inode, filp);
51 data = &snapshot_state;
52 filp->private_data = data;
53 memset(&data->handle, 0, sizeof(struct snapshot_handle));
54 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
55 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
56 data->mode = O_RDONLY;
57 } else {
58 data->swap = -1;
59 data->mode = O_WRONLY;
60 }
61 data->bitmap = NULL;
62 data->frozen = 0;
63 data->ready = 0;
64
65 return 0;
66}
67
68static int snapshot_release(struct inode *inode, struct file *filp)
69{
70 struct snapshot_data *data;
71
72 swsusp_free();
73 data = filp->private_data;
74 free_all_swap_pages(data->swap, data->bitmap);
75 free_bitmap(data->bitmap);
76 if (data->frozen) {
77 down(&pm_sem);
78 thaw_processes();
79 enable_nonboot_cpus();
80 up(&pm_sem);
81 }
82 atomic_inc(&device_available);
83 return 0;
84}
85
86static ssize_t snapshot_read(struct file *filp, char __user *buf,
87 size_t count, loff_t *offp)
88{
89 struct snapshot_data *data;
90 ssize_t res;
91
92 data = filp->private_data;
93 res = snapshot_read_next(&data->handle, count);
94 if (res > 0) {
95 if (copy_to_user(buf, data_of(data->handle), res))
96 res = -EFAULT;
97 else
98 *offp = data->handle.offset;
99 }
100 return res;
101}
102
103static ssize_t snapshot_write(struct file *filp, const char __user *buf,
104 size_t count, loff_t *offp)
105{
106 struct snapshot_data *data;
107 ssize_t res;
108
109 data = filp->private_data;
110 res = snapshot_write_next(&data->handle, count);
111 if (res > 0) {
112 if (copy_from_user(data_of(data->handle), buf, res))
113 res = -EFAULT;
114 else
115 *offp = data->handle.offset;
116 }
117 return res;
118}
119
120static int snapshot_ioctl(struct inode *inode, struct file *filp,
121 unsigned int cmd, unsigned long arg)
122{
123 int error = 0;
124 struct snapshot_data *data;
125 loff_t offset, avail;
126
127 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
128 return -ENOTTY;
129 if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
130 return -ENOTTY;
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133
134 data = filp->private_data;
135
136 switch (cmd) {
137
138 case SNAPSHOT_FREEZE:
139 if (data->frozen)
140 break;
141 down(&pm_sem);
142 disable_nonboot_cpus();
143 if (freeze_processes()) {
144 thaw_processes();
145 enable_nonboot_cpus();
146 error = -EBUSY;
147 }
148 up(&pm_sem);
149 if (!error)
150 data->frozen = 1;
151 break;
152
153 case SNAPSHOT_UNFREEZE:
154 if (!data->frozen)
155 break;
156 down(&pm_sem);
157 thaw_processes();
158 enable_nonboot_cpus();
159 up(&pm_sem);
160 data->frozen = 0;
161 break;
162
163 case SNAPSHOT_ATOMIC_SNAPSHOT:
164 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
165 error = -EPERM;
166 break;
167 }
168 down(&pm_sem);
169 /* Free memory before shutting down devices. */
170 error = swsusp_shrink_memory();
171 if (!error) {
172 error = device_suspend(PMSG_FREEZE);
173 if (!error) {
174 in_suspend = 1;
175 error = swsusp_suspend();
176 device_resume();
177 }
178 }
179 up(&pm_sem);
180 if (!error)
181 error = put_user(in_suspend, (unsigned int __user *)arg);
182 if (!error)
183 data->ready = 1;
184 break;
185
186 case SNAPSHOT_ATOMIC_RESTORE:
187 if (data->mode != O_WRONLY || !data->frozen ||
188 !snapshot_image_loaded(&data->handle)) {
189 error = -EPERM;
190 break;
191 }
192 down(&pm_sem);
193 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE);
195 if (!error) {
196 error = swsusp_resume();
197 device_resume();
198 }
199 pm_restore_console();
200 up(&pm_sem);
201 break;
202
203 case SNAPSHOT_FREE:
204 swsusp_free();
205 memset(&data->handle, 0, sizeof(struct snapshot_handle));
206 data->ready = 0;
207 break;
208
209 case SNAPSHOT_SET_IMAGE_SIZE:
210 image_size = arg;
211 break;
212
213 case SNAPSHOT_AVAIL_SWAP:
214 avail = count_swap_pages(data->swap, 1);
215 avail <<= PAGE_SHIFT;
216 error = put_user(avail, (loff_t __user *)arg);
217 break;
218
219 case SNAPSHOT_GET_SWAP_PAGE:
220 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
221 error = -ENODEV;
222 break;
223 }
224 if (!data->bitmap) {
225 data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
226 if (!data->bitmap) {
227 error = -ENOMEM;
228 break;
229 }
230 }
231 offset = alloc_swap_page(data->swap, data->bitmap);
232 if (offset) {
233 offset <<= PAGE_SHIFT;
234 error = put_user(offset, (loff_t __user *)arg);
235 } else {
236 error = -ENOSPC;
237 }
238 break;
239
240 case SNAPSHOT_FREE_SWAP_PAGES:
241 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
242 error = -ENODEV;
243 break;
244 }
245 free_all_swap_pages(data->swap, data->bitmap);
246 free_bitmap(data->bitmap);
247 data->bitmap = NULL;
248 break;
249
250 case SNAPSHOT_SET_SWAP_FILE:
251 if (!data->bitmap) {
252 /*
253 * User space encodes device types as two-byte values,
254 * so we need to recode them
255 */
256 if (old_decode_dev(arg)) {
257 data->swap = swap_type_of(old_decode_dev(arg));
258 if (data->swap < 0)
259 error = -ENODEV;
260 } else {
261 data->swap = -1;
262 error = -EINVAL;
263 }
264 } else {
265 error = -EPERM;
266 }
267 break;
268
269 case SNAPSHOT_S2RAM:
270 if (!data->frozen) {
271 error = -EPERM;
272 break;
273 }
274
275 if (down_trylock(&pm_sem)) {
276 error = -EBUSY;
277 break;
278 }
279
280 if (pm_ops->prepare) {
281 error = pm_ops->prepare(PM_SUSPEND_MEM);
282 if (error)
283 goto OutS3;
284 }
285
286 /* Put devices to sleep */
287 error = device_suspend(PMSG_SUSPEND);
288 if (error) {
289 printk(KERN_ERR "Failed to suspend some devices.\n");
290 } else {
291 /* Enter S3, system is already frozen */
292 suspend_enter(PM_SUSPEND_MEM);
293
294 /* Wake up devices */
295 device_resume();
296 }
297
298 if (pm_ops->finish)
299 pm_ops->finish(PM_SUSPEND_MEM);
300
301OutS3:
302 up(&pm_sem);
303 break;
304
305 default:
306 error = -ENOTTY;
307
308 }
309
310 return error;
311}
312
313static struct file_operations snapshot_fops = {
314 .open = snapshot_open,
315 .release = snapshot_release,
316 .read = snapshot_read,
317 .write = snapshot_write,
318 .llseek = no_llseek,
319 .ioctl = snapshot_ioctl,
320};
321
322static struct miscdevice snapshot_device = {
323 .minor = SNAPSHOT_MINOR,
324 .name = "snapshot",
325 .fops = &snapshot_fops,
326};
327
328static int __init snapshot_device_init(void)
329{
330 return misc_register(&snapshot_device);
331};
332
333device_initcall(snapshot_device_init);
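
Taken together, these ioctls define a small userspace protocol: open /dev/snapshot read-only, SNAPSHOT_FREEZE, SNAPSHOT_ATOMIC_SNAPSHOT, then read() the image page by page (the write-only side works the other way round for resume, ending in SNAPSHOT_ATOMIC_RESTORE). A minimal suspend-side sketch, assuming the SNAPSHOT_* ioctl numbers have been copied from kernel/power/power.h into a local "snapshot_ioctls.h":

	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include "snapshot_ioctls.h"	/* assumed copy of the SNAPSHOT_* definitions */

	#define PAGE_SIZE 4096		/* assumed */

	int main(void)
	{
		char buf[PAGE_SIZE];
		unsigned int in_suspend;
		ssize_t n;
		int fd = open("/dev/snapshot", O_RDONLY);

		if (fd < 0 || ioctl(fd, SNAPSHOT_FREEZE))
			return 1;
		if (ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend))
			goto thaw;
		if (in_suspend)		/* this instance took the image */
			while ((n = read(fd, buf, PAGE_SIZE)) > 0)
				fwrite(buf, 1, n, stdout);	/* stand-in for writing to swap */
		ioctl(fd, SNAPSHOT_FREE);
	thaw:
		ioctl(fd, SNAPSHOT_UNFREEZE);
		close(fd);
		return 0;
	}
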
diff --git a/kernel/printk.c b/kernel/printk.c
index 13ced0f782..c056f33244 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -122,44 +122,6 @@ static char *log_buf = __log_buf;
122static int log_buf_len = __LOG_BUF_LEN; 122static int log_buf_len = __LOG_BUF_LEN;
123static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ 123static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
124 124
125/*
126 * Setup a list of consoles. Called from init/main.c
127 */
128static int __init console_setup(char *str)
129{
130 char name[sizeof(console_cmdline[0].name)];
131 char *s, *options;
132 int idx;
133
134 /*
135 * Decode str into name, index, options.
136 */
137 if (str[0] >= '0' && str[0] <= '9') {
138 strcpy(name, "ttyS");
139 strncpy(name + 4, str, sizeof(name) - 5);
140 } else
141 strncpy(name, str, sizeof(name) - 1);
142 name[sizeof(name) - 1] = 0;
143 if ((options = strchr(str, ',')) != NULL)
144 *(options++) = 0;
145#ifdef __sparc__
146 if (!strcmp(str, "ttya"))
147 strcpy(name, "ttyS0");
148 if (!strcmp(str, "ttyb"))
149 strcpy(name, "ttyS1");
150#endif
151 for (s = name; *s; s++)
152 if ((*s >= '0' && *s <= '9') || *s == ',')
153 break;
154 idx = simple_strtoul(s, NULL, 10);
155 *s = 0;
156
157 add_preferred_console(name, idx, options);
158 return 1;
159}
160
161__setup("console=", console_setup);
162
163static int __init log_buf_len_setup(char *str) 125static int __init log_buf_len_setup(char *str)
164{ 126{
165 unsigned long size = memparse(str, &str); 127 unsigned long size = memparse(str, &str);
@@ -398,8 +360,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
398 unsigned long cur_index, start_print; 360 unsigned long cur_index, start_print;
399 static int msg_level = -1; 361 static int msg_level = -1;
400 362
401 if (((long)(start - end)) > 0) 363 BUG_ON(((long)(start - end)) > 0);
402 BUG();
403 364
404 cur_index = start; 365 cur_index = start;
405 start_print = start; 366 start_print = start;
@@ -659,6 +620,44 @@ static void call_console_drivers(unsigned long start, unsigned long end)
659 620
660#endif 621#endif
661 622
623/*
624 * Set up a list of consoles. Called from init/main.c
625 */
626static int __init console_setup(char *str)
627{
628 char name[sizeof(console_cmdline[0].name)];
629 char *s, *options;
630 int idx;
631
632 /*
633 * Decode str into name, index, options.
634 */
635 if (str[0] >= '0' && str[0] <= '9') {
636 strcpy(name, "ttyS");
637 strncpy(name + 4, str, sizeof(name) - 5);
638 } else {
639 strncpy(name, str, sizeof(name) - 1);
640 }
641 name[sizeof(name) - 1] = 0;
642 if ((options = strchr(str, ',')) != NULL)
643 *(options++) = 0;
644#ifdef __sparc__
645 if (!strcmp(str, "ttya"))
646 strcpy(name, "ttyS0");
647 if (!strcmp(str, "ttyb"))
648 strcpy(name, "ttyS1");
649#endif
650 for (s = name; *s; s++)
651 if ((*s >= '0' && *s <= '9') || *s == ',')
652 break;
653 idx = simple_strtoul(s, NULL, 10);
654 *s = 0;
655
656 add_preferred_console(name, idx, options);
657 return 1;
658}
659__setup("console=", console_setup);
660
662/** 661/**
663 * add_preferred_console - add a device to the list of preferred consoles. 662 * add_preferred_console - add a device to the list of preferred consoles.
664 * @name: device name 663 * @name: device name
@@ -708,8 +707,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
708 */ 707 */
709void acquire_console_sem(void) 708void acquire_console_sem(void)
710{ 709{
711 if (in_interrupt()) 710 BUG_ON(in_interrupt());
712 BUG();
713 down(&console_sem); 711 down(&console_sem);
714 console_locked = 1; 712 console_locked = 1;
715 console_may_schedule = 1; 713 console_may_schedule = 1;
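
For reference, console_setup() above just splits the console= string: "console=ttyS0,115200n8" becomes name "ttyS", index 0 and options "115200n8", and the bare-digit legacy form "console=1,9600" is normalised to "ttyS" index 1 with options "9600" before being handed to add_preferred_console(). Platform code can register the same preference directly; a minimal, hypothetical board-setup sketch:

        #include <linux/init.h>
        #include <linux/console.h>

        /* Equivalent to booting with "console=ttyS1,57600n8". */
        void __init board_setup_console(void)
        {
                add_preferred_console("ttyS", 1, "57600n8");
        }
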
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d7..68afe121e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/mutex.h>
26#include <asm/sections.h> 27#include <asm/sections.h>
27#include <asm/semaphore.h> 28#include <asm/semaphore.h>
28 29
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
46static DEFINE_PER_CPU(int, cpu_profile_flip); 47static DEFINE_PER_CPU(int, cpu_profile_flip);
47static DECLARE_MUTEX(profile_flip_mutex); 48static DEFINE_MUTEX(profile_flip_mutex);
48#endif /* CONFIG_SMP */ 49#endif /* CONFIG_SMP */
49 50
50static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
@@ -86,72 +87,52 @@ void __init profile_init(void)
86 87
87#ifdef CONFIG_PROFILING 88#ifdef CONFIG_PROFILING
88 89
89static DECLARE_RWSEM(profile_rwsem); 90static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
90static DEFINE_RWLOCK(handoff_lock); 91static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
91static struct notifier_block * task_exit_notifier; 92static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
92static struct notifier_block * task_free_notifier;
93static struct notifier_block * munmap_notifier;
94 93
95void profile_task_exit(struct task_struct * task) 94void profile_task_exit(struct task_struct * task)
96{ 95{
97 down_read(&profile_rwsem); 96 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
98 notifier_call_chain(&task_exit_notifier, 0, task);
99 up_read(&profile_rwsem);
100} 97}
101 98
102int profile_handoff_task(struct task_struct * task) 99int profile_handoff_task(struct task_struct * task)
103{ 100{
104 int ret; 101 int ret;
105 read_lock(&handoff_lock); 102 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
106 ret = notifier_call_chain(&task_free_notifier, 0, task);
107 read_unlock(&handoff_lock);
108 return (ret == NOTIFY_OK) ? 1 : 0; 103 return (ret == NOTIFY_OK) ? 1 : 0;
109} 104}
110 105
111void profile_munmap(unsigned long addr) 106void profile_munmap(unsigned long addr)
112{ 107{
113 down_read(&profile_rwsem); 108 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
114 notifier_call_chain(&munmap_notifier, 0, (void *)addr);
115 up_read(&profile_rwsem);
116} 109}
117 110
118int task_handoff_register(struct notifier_block * n) 111int task_handoff_register(struct notifier_block * n)
119{ 112{
120 int err = -EINVAL; 113 return atomic_notifier_chain_register(&task_free_notifier, n);
121
122 write_lock(&handoff_lock);
123 err = notifier_chain_register(&task_free_notifier, n);
124 write_unlock(&handoff_lock);
125 return err;
126} 114}
127 115
128int task_handoff_unregister(struct notifier_block * n) 116int task_handoff_unregister(struct notifier_block * n)
129{ 117{
130 int err = -EINVAL; 118 return atomic_notifier_chain_unregister(&task_free_notifier, n);
131
132 write_lock(&handoff_lock);
133 err = notifier_chain_unregister(&task_free_notifier, n);
134 write_unlock(&handoff_lock);
135 return err;
136} 119}
137 120
138int profile_event_register(enum profile_type type, struct notifier_block * n) 121int profile_event_register(enum profile_type type, struct notifier_block * n)
139{ 122{
140 int err = -EINVAL; 123 int err = -EINVAL;
141 124
142 down_write(&profile_rwsem);
143
144 switch (type) { 125 switch (type) {
145 case PROFILE_TASK_EXIT: 126 case PROFILE_TASK_EXIT:
146 err = notifier_chain_register(&task_exit_notifier, n); 127 err = blocking_notifier_chain_register(
128 &task_exit_notifier, n);
147 break; 129 break;
148 case PROFILE_MUNMAP: 130 case PROFILE_MUNMAP:
149 err = notifier_chain_register(&munmap_notifier, n); 131 err = blocking_notifier_chain_register(
132 &munmap_notifier, n);
150 break; 133 break;
151 } 134 }
152 135
153 up_write(&profile_rwsem);
154
155 return err; 136 return err;
156} 137}
157 138
@@ -160,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
160{ 141{
161 int err = -EINVAL; 142 int err = -EINVAL;
162 143
163 down_write(&profile_rwsem);
164
165 switch (type) { 144 switch (type) {
166 case PROFILE_TASK_EXIT: 145 case PROFILE_TASK_EXIT:
167 err = notifier_chain_unregister(&task_exit_notifier, n); 146 err = blocking_notifier_chain_unregister(
147 &task_exit_notifier, n);
168 break; 148 break;
169 case PROFILE_MUNMAP: 149 case PROFILE_MUNMAP:
170 err = notifier_chain_unregister(&munmap_notifier, n); 150 err = blocking_notifier_chain_unregister(
151 &munmap_notifier, n);
171 break; 152 break;
172 } 153 }
173 154
174 up_write(&profile_rwsem);
175 return err; 155 return err;
176} 156}
177 157
@@ -243,7 +223,7 @@ static void profile_flip_buffers(void)
243{ 223{
244 int i, j, cpu; 224 int i, j, cpu;
245 225
246 down(&profile_flip_mutex); 226 mutex_lock(&profile_flip_mutex);
247 j = per_cpu(cpu_profile_flip, get_cpu()); 227 j = per_cpu(cpu_profile_flip, get_cpu());
248 put_cpu(); 228 put_cpu();
249 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 229 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +239,14 @@ static void profile_flip_buffers(void)
259 hits[i].hits = hits[i].pc = 0; 239 hits[i].hits = hits[i].pc = 0;
260 } 240 }
261 } 241 }
262 up(&profile_flip_mutex); 242 mutex_unlock(&profile_flip_mutex);
263} 243}
264 244
265static void profile_discard_flip_buffers(void) 245static void profile_discard_flip_buffers(void)
266{ 246{
267 int i, cpu; 247 int i, cpu;
268 248
269 down(&profile_flip_mutex); 249 mutex_lock(&profile_flip_mutex);
270 i = per_cpu(cpu_profile_flip, get_cpu()); 250 i = per_cpu(cpu_profile_flip, get_cpu());
271 put_cpu(); 251 put_cpu();
272 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 252 on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +254,7 @@ static void profile_discard_flip_buffers(void)
274 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 254 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
275 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 255 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
276 } 256 }
277 up(&profile_flip_mutex); 257 mutex_unlock(&profile_flip_mutex);
278} 258}
279 259
280void profile_hit(int type, void *__pc) 260void profile_hit(int type, void *__pc)
@@ -319,7 +299,7 @@ out:
319} 299}
320 300
321#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
322static int __devinit profile_cpu_callback(struct notifier_block *info, 302static int profile_cpu_callback(struct notifier_block *info,
323 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
324{ 304{
325 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
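
The conversion above swaps the open-coded notifier chains and their private locks for the self-locking blocking/atomic notifier heads; callers of the profile-event API are unchanged. As a reminder of what such a caller looks like (a minimal sketch with made-up names):

        #include <linux/module.h>
        #include <linux/profile.h>
        #include <linux/notifier.h>

        /* Hypothetical client: log every munmap() address. */
        static int my_munmap_event(struct notifier_block *nb,
                                   unsigned long val, void *data)
        {
                unsigned long addr = (unsigned long)data;

                printk(KERN_DEBUG "munmap at %#lx\n", addr);
                return NOTIFY_OK;
        }

        static struct notifier_block my_munmap_nb = {
                .notifier_call = my_munmap_event,
        };

        static int __init my_client_init(void)
        {
                /* now backed by the BLOCKING_NOTIFIER_HEAD declared above */
                return profile_event_register(PROFILE_MUNMAP, &my_munmap_nb);
        }

        static void __exit my_client_exit(void)
        {
                profile_event_unregister(PROFILE_MUNMAP, &my_munmap_nb);
        }

        module_init(my_client_init);
        module_exit(my_client_exit);
        MODULE_LICENSE("GPL");
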
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5f33cdb6ff..921c22ad16 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -30,14 +30,13 @@
30 */ 30 */
31void __ptrace_link(task_t *child, task_t *new_parent) 31void __ptrace_link(task_t *child, task_t *new_parent)
32{ 32{
33 if (!list_empty(&child->ptrace_list)) 33 BUG_ON(!list_empty(&child->ptrace_list));
34 BUG();
35 if (child->parent == new_parent) 34 if (child->parent == new_parent)
36 return; 35 return;
37 list_add(&child->ptrace_list, &child->parent->ptrace_children); 36 list_add(&child->ptrace_list, &child->parent->ptrace_children);
38 REMOVE_LINKS(child); 37 remove_parent(child);
39 child->parent = new_parent; 38 child->parent = new_parent;
40 SET_LINKS(child); 39 add_parent(child);
41} 40}
42 41
43/* 42/*
@@ -57,10 +56,6 @@ void ptrace_untrace(task_t *child)
57 signal_wake_up(child, 1); 56 signal_wake_up(child, 1);
58 } 57 }
59 } 58 }
60 if (child->signal->flags & SIGNAL_GROUP_EXIT) {
61 sigaddset(&child->pending.signal, SIGKILL);
62 signal_wake_up(child, 1);
63 }
64 spin_unlock(&child->sighand->siglock); 59 spin_unlock(&child->sighand->siglock);
65} 60}
66 61
@@ -72,17 +67,18 @@ void ptrace_untrace(task_t *child)
72 */ 67 */
73void __ptrace_unlink(task_t *child) 68void __ptrace_unlink(task_t *child)
74{ 69{
75 if (!child->ptrace) 70 BUG_ON(!child->ptrace);
76 BUG(); 71
77 child->ptrace = 0; 72 child->ptrace = 0;
78 if (!list_empty(&child->ptrace_list)) { 73 if (!list_empty(&child->ptrace_list)) {
79 list_del_init(&child->ptrace_list); 74 list_del_init(&child->ptrace_list);
80 REMOVE_LINKS(child); 75 remove_parent(child);
81 child->parent = child->real_parent; 76 child->parent = child->real_parent;
82 SET_LINKS(child); 77 add_parent(child);
83 } 78 }
84 79
85 ptrace_untrace(child); 80 if (child->state == TASK_TRACED)
81 ptrace_untrace(child);
86} 82}
87 83
88/* 84/*
@@ -152,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task)
152int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
153{ 149{
154 int retval; 150 int retval;
155 task_lock(task); 151
156 retval = -EPERM; 152 retval = -EPERM;
157 if (task->pid <= 1) 153 if (task->pid <= 1)
158 goto bad; 154 goto out;
159 if (task->tgid == current->tgid) 155 if (task->tgid == current->tgid)
160 goto bad; 156 goto out;
157
158repeat:
159 /*
160 * Nasty, nasty.
161 *
162 * We want to hold both the task-lock and the
163 * tasklist_lock for writing at the same time.
164 * But that's against the rules (tasklist_lock
165 * is taken for reading by interrupts on other
166 * cpu's that may have task_lock).
167 */
168 task_lock(task);
169 local_irq_disable();
170 if (!write_trylock(&tasklist_lock)) {
171 local_irq_enable();
172 task_unlock(task);
173 do {
174 cpu_relax();
175 } while (!write_can_lock(&tasklist_lock));
176 goto repeat;
177 }
178
161 /* the same process cannot be attached many times */ 179 /* the same process cannot be attached many times */
162 if (task->ptrace & PT_PTRACED) 180 if (task->ptrace & PT_PTRACED)
163 goto bad; 181 goto bad;
@@ -170,36 +188,39 @@ int ptrace_attach(struct task_struct *task)
170 ? PT_ATTACHED : 0); 188 ? PT_ATTACHED : 0);
171 if (capable(CAP_SYS_PTRACE)) 189 if (capable(CAP_SYS_PTRACE))
172 task->ptrace |= PT_PTRACE_CAP; 190 task->ptrace |= PT_PTRACE_CAP;
173 task_unlock(task);
174 191
175 write_lock_irq(&tasklist_lock);
176 __ptrace_link(task, current); 192 __ptrace_link(task, current);
177 write_unlock_irq(&tasklist_lock);
178 193
179 force_sig_specific(SIGSTOP, task); 194 force_sig_specific(SIGSTOP, task);
180 return 0;
181 195
182bad: 196bad:
197 write_unlock_irq(&tasklist_lock);
183 task_unlock(task); 198 task_unlock(task);
199out:
184 return retval; 200 return retval;
185} 201}
186 202
203void __ptrace_detach(struct task_struct *child, unsigned int data)
204{
205 child->exit_code = data;
206 /* .. re-parent .. */
207 __ptrace_unlink(child);
208 /* .. and wake it up. */
209 if (child->exit_state != EXIT_ZOMBIE)
210 wake_up_process(child);
211}
212
187int ptrace_detach(struct task_struct *child, unsigned int data) 213int ptrace_detach(struct task_struct *child, unsigned int data)
188{ 214{
189 if (!valid_signal(data)) 215 if (!valid_signal(data))
190 return -EIO; 216 return -EIO;
191 217
192 /* Architecture-specific hardware disable .. */ 218 /* Architecture-specific hardware disable .. */
193 ptrace_disable(child); 219 ptrace_disable(child);
194 220
195 /* .. re-parent .. */
196 child->exit_code = data;
197
198 write_lock_irq(&tasklist_lock); 221 write_lock_irq(&tasklist_lock);
199 __ptrace_unlink(child); 222 if (child->ptrace)
200 /* .. and wake it up. */ 223 __ptrace_detach(child, data);
201 if (child->exit_state != EXIT_ZOMBIE)
202 wake_up_process(child);
203 write_unlock_irq(&tasklist_lock); 224 write_unlock_irq(&tasklist_lock);
204 225
205 return 0; 226 return 0;
@@ -242,8 +263,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
242 if (write) { 263 if (write) {
243 copy_to_user_page(vma, page, addr, 264 copy_to_user_page(vma, page, addr,
244 maddr + offset, buf, bytes); 265 maddr + offset, buf, bytes);
245 if (!PageCompound(page)) 266 set_page_dirty_lock(page);
246 set_page_dirty_lock(page);
247 } else { 267 } else {
248 copy_from_user_page(vma, page, addr, 268 copy_from_user_page(vma, page, addr,
249 buf, maddr + offset, bytes); 269 buf, maddr + offset, bytes);
@@ -417,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request,
417 */ 437 */
418int ptrace_traceme(void) 438int ptrace_traceme(void)
419{ 439{
420 int ret; 440 int ret = -EPERM;
421 441
422 /* 442 /*
423 * Are we already being traced? 443 * Are we already being traced?
424 */ 444 */
425 if (current->ptrace & PT_PTRACED) 445 task_lock(current);
426 return -EPERM; 446 if (!(current->ptrace & PT_PTRACED)) {
427 ret = security_ptrace(current->parent, current); 447 ret = security_ptrace(current->parent, current);
428 if (ret) 448 /*
429 return -EPERM; 449 * Set the ptrace bit in the process ptrace flags.
430 /* 450 */
431 * Set the ptrace bit in the process ptrace flags. 451 if (!ret)
432 */ 452 current->ptrace |= PT_PTRACED;
433 current->ptrace |= PT_PTRACED; 453 }
434 return 0; 454 task_unlock(current);
455 return ret;
435} 456}
436 457
437/** 458/**
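
The lock-ordering trick in ptrace_attach() above is worth spelling out: it needs task_lock() and the tasklist_lock write lock together, but tasklist_lock is taken for reading from interrupts on other CPUs that may already hold a task lock, so a straightforward nested write_lock_irq() could deadlock. Schematically, the pattern is a trylock with back-off and retry (generic names, not from this patch):

        #include <linux/spinlock.h>
        #include <linux/interrupt.h>

        static DEFINE_SPINLOCK(a);      /* stands in for the per-task alloc_lock */
        static DEFINE_RWLOCK(b);        /* stands in for tasklist_lock */

        static void take_both(void)
        {
        repeat:
                spin_lock(&a);
                local_irq_disable();
                if (!write_trylock(&b)) {
                        /* back off completely, wait until b looks free, retry */
                        local_irq_enable();
                        spin_unlock(&a);
                        while (!write_can_lock(&b))
                                cpu_relax();
                        goto repeat;
                }

                /* ... both locks held, interrupts off ... */

                write_unlock(&b);
                local_irq_enable();
                spin_unlock(&a);
        }
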
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0cf8146bd5..2058f88c7b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,15 +47,16 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/mutex.h>
50 51
51/* Definition for rcupdate control block. */ 52/* Definition for rcupdate control block. */
52struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
53 .cur = -300, 54 .cur = -300,
54 .completed = -300, 55 .completed = -300,
55 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = SPIN_LOCK_UNLOCKED,
56 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
57}; 58};
58struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
59 .cur = -300, 60 .cur = -300,
60 .completed = -300, 61 .completed = -300,
61 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = SPIN_LOCK_UNLOCKED,
@@ -67,7 +68,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
67 68
68/* Fake initialization required by compiler */ 69/* Fake initialization required by compiler */
69static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 70static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
70static int maxbatch = 10000; 71static int blimit = 10;
72static int qhimark = 10000;
73static int qlowmark = 100;
74#ifdef CONFIG_SMP
75static int rsinterval = 1000;
76#endif
77
78static atomic_t rcu_barrier_cpu_count;
79static DEFINE_MUTEX(rcu_barrier_mutex);
80static struct completion rcu_barrier_completion;
81
82#ifdef CONFIG_SMP
83static void force_quiescent_state(struct rcu_data *rdp,
84 struct rcu_ctrlblk *rcp)
85{
86 int cpu;
87 cpumask_t cpumask;
88 set_need_resched();
89 if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
90 rdp->last_rs_qlen = rdp->qlen;
91 /*
92 * Don't send IPI to itself. With irqs disabled,
93 * rdp->cpu is the current cpu.
94 */
95 cpumask = rcp->cpumask;
96 cpu_clear(rdp->cpu, cpumask);
97 for_each_cpu_mask(cpu, cpumask)
98 smp_send_reschedule(cpu);
99 }
100}
101#else
102static inline void force_quiescent_state(struct rcu_data *rdp,
103 struct rcu_ctrlblk *rcp)
104{
105 set_need_resched();
106}
107#endif
71 108
72/** 109/**
73 * call_rcu - Queue an RCU callback for invocation after a grace period. 110 * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -92,17 +129,13 @@ void fastcall call_rcu(struct rcu_head *head,
92 rdp = &__get_cpu_var(rcu_data); 129 rdp = &__get_cpu_var(rcu_data);
93 *rdp->nxttail = head; 130 *rdp->nxttail = head;
94 rdp->nxttail = &head->next; 131 rdp->nxttail = &head->next;
95 132 if (unlikely(++rdp->qlen > qhimark)) {
96 if (unlikely(++rdp->count > 10000)) 133 rdp->blimit = INT_MAX;
97 set_need_resched(); 134 force_quiescent_state(rdp, &rcu_ctrlblk);
98 135 }
99 local_irq_restore(flags); 136 local_irq_restore(flags);
100} 137}
101 138
102static atomic_t rcu_barrier_cpu_count;
103static struct semaphore rcu_barrier_sema;
104static struct completion rcu_barrier_completion;
105
106/** 139/**
107 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. 140 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
108 * @head: structure to be used for queueing the RCU updates. 141 * @head: structure to be used for queueing the RCU updates.
@@ -131,12 +164,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
131 rdp = &__get_cpu_var(rcu_bh_data); 164 rdp = &__get_cpu_var(rcu_bh_data);
132 *rdp->nxttail = head; 165 *rdp->nxttail = head;
133 rdp->nxttail = &head->next; 166 rdp->nxttail = &head->next;
134 rdp->count++; 167
135/* 168 if (unlikely(++rdp->qlen > qhimark)) {
136 * Should we directly call rcu_do_batch() here ? 169 rdp->blimit = INT_MAX;
137 * if (unlikely(rdp->count > 10000)) 170 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
138 * rcu_do_batch(rdp); 171 }
139 */ 172
140 local_irq_restore(flags); 173 local_irq_restore(flags);
141} 174}
142 175
@@ -175,13 +208,13 @@ static void rcu_barrier_func(void *notused)
175void rcu_barrier(void) 208void rcu_barrier(void)
176{ 209{
177 BUG_ON(in_interrupt()); 210 BUG_ON(in_interrupt());
178 /* Take cpucontrol semaphore to protect against CPU hotplug */ 211 /* Take cpucontrol mutex to protect against CPU hotplug */
179 down(&rcu_barrier_sema); 212 mutex_lock(&rcu_barrier_mutex);
180 init_completion(&rcu_barrier_completion); 213 init_completion(&rcu_barrier_completion);
181 atomic_set(&rcu_barrier_cpu_count, 0); 214 atomic_set(&rcu_barrier_cpu_count, 0);
182 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 215 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
183 wait_for_completion(&rcu_barrier_completion); 216 wait_for_completion(&rcu_barrier_completion);
184 up(&rcu_barrier_sema); 217 mutex_unlock(&rcu_barrier_mutex);
185} 218}
186EXPORT_SYMBOL_GPL(rcu_barrier); 219EXPORT_SYMBOL_GPL(rcu_barrier);
187 220
@@ -199,10 +232,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
199 next = rdp->donelist = list->next; 232 next = rdp->donelist = list->next;
200 list->func(list); 233 list->func(list);
201 list = next; 234 list = next;
202 rdp->count--; 235 rdp->qlen--;
203 if (++count >= maxbatch) 236 if (++count >= rdp->blimit)
204 break; 237 break;
205 } 238 }
239 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
240 rdp->blimit = blimit;
206 if (!rdp->donelist) 241 if (!rdp->donelist)
207 rdp->donetail = &rdp->donelist; 242 rdp->donetail = &rdp->donelist;
208 else 243 else
@@ -381,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
381 rdp->curtail = &rdp->curlist; 416 rdp->curtail = &rdp->curlist;
382 } 417 }
383 418
384 local_irq_disable();
385 if (rdp->nxtlist && !rdp->curlist) { 419 if (rdp->nxtlist && !rdp->curlist) {
420 local_irq_disable();
386 rdp->curlist = rdp->nxtlist; 421 rdp->curlist = rdp->nxtlist;
387 rdp->curtail = rdp->nxttail; 422 rdp->curtail = rdp->nxttail;
388 rdp->nxtlist = NULL; 423 rdp->nxtlist = NULL;
@@ -407,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
407 rcu_start_batch(rcp); 442 rcu_start_batch(rcp);
408 spin_unlock(&rcp->lock); 443 spin_unlock(&rcp->lock);
409 } 444 }
410 } else {
411 local_irq_enable();
412 } 445 }
446
413 rcu_check_quiescent_state(rcp, rdp); 447 rcu_check_quiescent_state(rcp, rdp);
414 if (rdp->donelist) 448 if (rdp->donelist)
415 rcu_do_batch(rdp); 449 rcu_do_batch(rdp);
@@ -445,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
445 return 0; 479 return 0;
446} 480}
447 481
482/*
483 * Check to see if there is any immediate RCU-related work to be done
484 * by the current CPU, returning 1 if so. This function is part of the
485 * RCU implementation; it is -not- an exported member of the RCU API.
486 */
448int rcu_pending(int cpu) 487int rcu_pending(int cpu)
449{ 488{
450 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || 489 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
451 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); 490 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
452} 491}
453 492
493/*
494 * Check to see if any future RCU-related work will need to be done
495 * by the current CPU, even if none need be done immediately, returning
496 * 1 if so. This function is part of the RCU implementation; it is -not-
497 * an exported member of the RCU API.
498 */
499int rcu_needs_cpu(int cpu)
500{
501 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
502 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
503
504 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
505}
506
454void rcu_check_callbacks(int cpu, int user) 507void rcu_check_callbacks(int cpu, int user)
455{ 508{
456 if (user || 509 if (user ||
@@ -473,6 +526,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
473 rdp->quiescbatch = rcp->completed; 526 rdp->quiescbatch = rcp->completed;
474 rdp->qs_pending = 0; 527 rdp->qs_pending = 0;
475 rdp->cpu = cpu; 528 rdp->cpu = cpu;
529 rdp->blimit = blimit;
476} 530}
477 531
478static void __devinit rcu_online_cpu(int cpu) 532static void __devinit rcu_online_cpu(int cpu)
@@ -485,7 +539,7 @@ static void __devinit rcu_online_cpu(int cpu)
485 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
486} 540}
487 541
488static int __devinit rcu_cpu_notify(struct notifier_block *self, 542static int rcu_cpu_notify(struct notifier_block *self,
489 unsigned long action, void *hcpu) 543 unsigned long action, void *hcpu)
490{ 544{
491 long cpu = (long)hcpu; 545 long cpu = (long)hcpu;
@@ -502,7 +556,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
502 return NOTIFY_OK; 556 return NOTIFY_OK;
503} 557}
504 558
505static struct notifier_block __devinitdata rcu_nb = { 559static struct notifier_block rcu_nb = {
506 .notifier_call = rcu_cpu_notify, 560 .notifier_call = rcu_cpu_notify,
507}; 561};
508 562
@@ -514,7 +568,6 @@ static struct notifier_block __devinitdata rcu_nb = {
514 */ 568 */
515void __init rcu_init(void) 569void __init rcu_init(void)
516{ 570{
517 sema_init(&rcu_barrier_sema, 1);
518 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 571 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
519 (void *)(long)smp_processor_id()); 572 (void *)(long)smp_processor_id());
520 /* Register notifier for non-boot CPUs */ 573 /* Register notifier for non-boot CPUs */
@@ -567,9 +620,14 @@ void synchronize_kernel(void)
567 synchronize_rcu(); 620 synchronize_rcu();
568} 621}
569 622
570module_param(maxbatch, int, 0); 623module_param(blimit, int, 0);
624module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0);
626#ifdef CONFIG_SMP
627module_param(rsinterval, int, 0);
628#endif
571EXPORT_SYMBOL_GPL(rcu_batches_completed); 629EXPORT_SYMBOL_GPL(rcu_batches_completed);
572EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ 630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */
573EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
574EXPORT_SYMBOL_GPL(synchronize_rcu); 632EXPORT_SYMBOL_GPL(synchronize_rcu);
575EXPORT_SYMBOL(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ 633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
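
The queue-length changes above replace the single maxbatch throttle with blimit/qhimark/qlowmark and the force_quiescent_state() kick, but the call_rcu() contract seen by callers is untouched. For reference, a generic caller (not from this patch) still embeds an rcu_head in its object and frees it from the callback:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct my_node {
                int key;
                struct rcu_head rcu;    /* storage for the deferred free */
        };

        static void my_node_free_rcu(struct rcu_head *head)
        {
                struct my_node *node = container_of(head, struct my_node, rcu);

                kfree(node);
        }

        static void my_node_retire(struct my_node *node)
        {
                /*
                 * Queue the object for freeing after a grace period.  If this
                 * CPU's callback queue grows past qhimark (10000 by default),
                 * the new force_quiescent_state() path nudges the other CPUs.
                 */
                call_rcu(&node->rcu, my_node_free_rcu);
        }
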
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7712912dbc..8154e7589d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56 56
57MODULE_PARM(nreaders, "i"); 57module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
59MODULE_PARM(stat_interval, "i"); 59module_param(stat_interval, int, 0);
60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 60MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
61MODULE_PARM(verbose, "i"); 61module_param(verbose, bool, 0);
62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 62MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
63MODULE_PARM(test_no_idle_hz, "i"); 63module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65MODULE_PARM(shuffle_interval, "i"); 65module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 67#define TORTURE_FLAG "rcutorture: "
68#define PRINTK_STRING(s) \ 68#define PRINTK_STRING(s) \
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 301 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 302 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
303 303
304 for_each_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 305 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; 306 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; 307 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg)
441 return 0; 441 return 0;
442} 442}
443 443
444static inline void
445rcu_torture_print_module_parms(char *tag)
446{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
451 shuffle_interval);
452}
453
444static void 454static void
445rcu_torture_cleanup(void) 455rcu_torture_cleanup(void)
446{ 456{
@@ -483,9 +493,10 @@ rcu_torture_cleanup(void)
483 rcu_barrier(); 493 rcu_barrier();
484 494
485 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
486 printk(KERN_ALERT TORTURE_FLAG 496 if (atomic_read(&n_rcu_torture_error))
487 "--- End of test: %s\n", 497 rcu_torture_print_module_parms("End of test: FAILURE");
488 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); 498 else
499 rcu_torture_print_module_parms("End of test: SUCCESS");
489} 500}
490 501
491static int 502static int
@@ -501,11 +512,7 @@ rcu_torture_init(void)
501 nrealreaders = nreaders; 512 nrealreaders = nreaders;
502 else 513 else
503 nrealreaders = 2 * num_online_cpus(); 514 nrealreaders = 2 * num_online_cpus();
504 printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " 515 rcu_torture_print_module_parms("Start of test");
505 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
506 "shuffle_interval = %d\n",
507 nrealreaders, stat_interval, verbose, test_no_idle_hz,
508 shuffle_interval);
509 fullstop = 0; 516 fullstop = 0;
510 517
511 /* Set up the freelist. */ 518 /* Set up the freelist. */
@@ -528,7 +535,7 @@ rcu_torture_init(void)
528 atomic_set(&n_rcu_torture_error, 0); 535 atomic_set(&n_rcu_torture_error, 0);
529 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 536 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
530 atomic_set(&rcu_torture_wcount[i], 0); 537 atomic_set(&rcu_torture_wcount[i], 0);
531 for_each_cpu(cpu) { 538 for_each_possible_cpu(cpu) {
532 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 539 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
533 per_cpu(rcu_torture_count, cpu)[i] = 0; 540 per_cpu(rcu_torture_count, cpu)[i] = 0;
534 per_cpu(rcu_torture_batch, cpu)[i] = 0; 541 per_cpu(rcu_torture_batch, cpu)[i] = 0;
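
The MODULE_PARM() to module_param() conversion above is mechanical: the old "i" format string becomes an explicit type (int, or bool for flag-style parameters) plus a sysfs permission mask, with 0 meaning the parameter is not exposed under /sys/module/. In general (illustrative names, not from this file):

        #include <linux/module.h>
        #include <linux/moduleparam.h>

        /* Old style (removed):  MODULE_PARM(nloops, "i"); */
        static int nloops = 10;
        module_param(nloops, int, 0);           /* no sysfs file */
        MODULE_PARM_DESC(nloops, "Number of test loops per pass");

        static int debug;
        module_param(debug, bool, 0444);        /* readable at /sys/module/<mod>/parameters/debug */
        MODULE_PARM_DESC(debug, "Enable extra debugging output");
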
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 0000000000..33345e7348
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1012 @@
1/*
2 * Public API and common code for kernel->userspace relay file support.
3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 *
11 * This file is released under the GPL.
12 */
13#include <linux/errno.h>
14#include <linux/stddef.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/relay.h>
19#include <linux/vmalloc.h>
20#include <linux/mm.h>
21
22/*
23 * close() vm_op implementation for relay file mapping.
24 */
25static void relay_file_mmap_close(struct vm_area_struct *vma)
26{
27 struct rchan_buf *buf = vma->vm_private_data;
28 buf->chan->cb->buf_unmapped(buf, vma->vm_file);
29}
30
31/*
32 * nopage() vm_op implementation for relay file mapping.
33 */
34static struct page *relay_buf_nopage(struct vm_area_struct *vma,
35 unsigned long address,
36 int *type)
37{
38 struct page *page;
39 struct rchan_buf *buf = vma->vm_private_data;
40 unsigned long offset = address - vma->vm_start;
41
42 if (address > vma->vm_end)
43 return NOPAGE_SIGBUS; /* Disallow mremap */
44 if (!buf)
45 return NOPAGE_OOM;
46
47 page = vmalloc_to_page(buf->start + offset);
48 if (!page)
49 return NOPAGE_OOM;
50 get_page(page);
51
52 if (type)
53 *type = VM_FAULT_MINOR;
54
55 return page;
56}
57
58/*
59 * vm_ops for relay file mappings.
60 */
61static struct vm_operations_struct relay_file_mmap_ops = {
62 .nopage = relay_buf_nopage,
63 .close = relay_file_mmap_close,
64};
65
66/**
67 * relay_mmap_buf: - mmap channel buffer to process address space
68 * @buf: relay channel buffer
69 * @vma: vm_area_struct describing memory to be mapped
70 *
71 * Returns 0 if ok, negative on error
72 *
73 * Caller should already have grabbed mmap_sem.
74 */
75int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
76{
77 unsigned long length = vma->vm_end - vma->vm_start;
78 struct file *filp = vma->vm_file;
79
80 if (!buf)
81 return -EBADF;
82
83 if (length != (unsigned long)buf->chan->alloc_size)
84 return -EINVAL;
85
86 vma->vm_ops = &relay_file_mmap_ops;
87 vma->vm_private_data = buf;
88 buf->chan->cb->buf_mapped(buf, filp);
89
90 return 0;
91}
92
93/**
94 * relay_alloc_buf - allocate a channel buffer
95 * @buf: the buffer struct
96 * @size: total size of the buffer
97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The
99 * passed in size will get page aligned, if it isn't already.
100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
102{
103 void *mem;
104 unsigned int i, j, n_pages;
105
106 *size = PAGE_ALIGN(*size);
107 n_pages = *size >> PAGE_SHIFT;
108
109 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
110 if (!buf->page_array)
111 return NULL;
112
113 for (i = 0; i < n_pages; i++) {
114 buf->page_array[i] = alloc_page(GFP_KERNEL);
115 if (unlikely(!buf->page_array[i]))
116 goto depopulate;
117 }
118 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
119 if (!mem)
120 goto depopulate;
121
122 memset(mem, 0, *size);
123 buf->page_count = n_pages;
124 return mem;
125
126depopulate:
127 for (j = 0; j < i; j++)
128 __free_page(buf->page_array[j]);
129 kfree(buf->page_array);
130 return NULL;
131}
132
133/**
134 * relay_create_buf - allocate and initialize a channel buffer
135 * @chan: the channel for which to allocate a buffer (the allocation
136 * size and sub-buffer count are taken from the channel)
137 *
138 * Returns channel buffer if successful, NULL otherwise
139 */
140struct rchan_buf *relay_create_buf(struct rchan *chan)
141{
142 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
143 if (!buf)
144 return NULL;
145
146 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
147 if (!buf->padding)
148 goto free_buf;
149
150 buf->start = relay_alloc_buf(buf, &chan->alloc_size);
151 if (!buf->start)
152 goto free_buf;
153
154 buf->chan = chan;
155 kref_get(&buf->chan->kref);
156 return buf;
157
158free_buf:
159 kfree(buf->padding);
160 kfree(buf);
161 return NULL;
162}
163
164/**
165 * relay_destroy_channel - free the channel struct
166 *
167 * Should only be called from kref_put().
168 */
169void relay_destroy_channel(struct kref *kref)
170{
171 struct rchan *chan = container_of(kref, struct rchan, kref);
172 kfree(chan);
173}
174
175/**
176 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
177 * @buf: the buffer struct
178 */
179void relay_destroy_buf(struct rchan_buf *buf)
180{
181 struct rchan *chan = buf->chan;
182 unsigned int i;
183
184 if (likely(buf->start)) {
185 vunmap(buf->start);
186 for (i = 0; i < buf->page_count; i++)
187 __free_page(buf->page_array[i]);
188 kfree(buf->page_array);
189 }
190 kfree(buf->padding);
191 kfree(buf);
192 kref_put(&chan->kref, relay_destroy_channel);
193}
194
195/**
196 * relay_remove_buf - remove a channel buffer
197 *
198 * Removes the file from the filesystem, which also frees the
199 * rchan_buf_struct and the channel buffer. Should only be called from
200 * kref_put().
201 */
202void relay_remove_buf(struct kref *kref)
203{
204 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
205 buf->chan->cb->remove_buf_file(buf->dentry);
206 relay_destroy_buf(buf);
207}
208
209/**
210 * relay_buf_empty - boolean, is the channel buffer empty?
211 * @buf: channel buffer
212 *
213 * Returns 1 if the buffer is empty, 0 otherwise.
214 */
215int relay_buf_empty(struct rchan_buf *buf)
216{
217 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
218}
219EXPORT_SYMBOL_GPL(relay_buf_empty);
220
221/**
222 * relay_buf_full - boolean, is the channel buffer full?
223 * @buf: channel buffer
224 *
225 * Returns 1 if the buffer is full, 0 otherwise.
226 */
227int relay_buf_full(struct rchan_buf *buf)
228{
229 size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
230 return (ready >= buf->chan->n_subbufs) ? 1 : 0;
231}
232EXPORT_SYMBOL_GPL(relay_buf_full);
233
234/*
235 * High-level relay kernel API and associated functions.
236 */
237
238/*
239 * rchan_callback implementations defining default channel behavior. Used
240 * in place of corresponding NULL values in client callback struct.
241 */
242
243/*
244 * subbuf_start() default callback. Does nothing.
245 */
246static int subbuf_start_default_callback (struct rchan_buf *buf,
247 void *subbuf,
248 void *prev_subbuf,
249 size_t prev_padding)
250{
251 if (relay_buf_full(buf))
252 return 0;
253
254 return 1;
255}
256
257/*
258 * buf_mapped() default callback. Does nothing.
259 */
260static void buf_mapped_default_callback(struct rchan_buf *buf,
261 struct file *filp)
262{
263}
264
265/*
266 * buf_unmapped() default callback. Does nothing.
267 */
268static void buf_unmapped_default_callback(struct rchan_buf *buf,
269 struct file *filp)
270{
271}
272
273/*
274 * create_buf_file_create() default callback. Does nothing.
275 */
276static struct dentry *create_buf_file_default_callback(const char *filename,
277 struct dentry *parent,
278 int mode,
279 struct rchan_buf *buf,
280 int *is_global)
281{
282 return NULL;
283}
284
285/*
286 * remove_buf_file() default callback. Does nothing.
287 */
288static int remove_buf_file_default_callback(struct dentry *dentry)
289{
290 return -EINVAL;
291}
292
293/* relay channel default callbacks */
294static struct rchan_callbacks default_channel_callbacks = {
295 .subbuf_start = subbuf_start_default_callback,
296 .buf_mapped = buf_mapped_default_callback,
297 .buf_unmapped = buf_unmapped_default_callback,
298 .create_buf_file = create_buf_file_default_callback,
299 .remove_buf_file = remove_buf_file_default_callback,
300};
301
302/**
303 * wakeup_readers - wake up readers waiting on a channel
304 * @private: the channel buffer
305 *
306 * This is the work function used to defer reader waking. The
307 * reason waking is deferred is that calling directly from write
308 * causes problems if you're writing from say the scheduler.
309 */
310static void wakeup_readers(void *private)
311{
312 struct rchan_buf *buf = private;
313 wake_up_interruptible(&buf->read_wait);
314}
315
316/**
317 * __relay_reset - reset a channel buffer
318 * @buf: the channel buffer
319 * @init: 1 if this is a first-time initialization
320 *
321 * See relay_reset for description of effect.
322 */
323static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
324{
325 size_t i;
326
327 if (init) {
328 init_waitqueue_head(&buf->read_wait);
329 kref_init(&buf->kref);
330 INIT_WORK(&buf->wake_readers, NULL, NULL);
331 } else {
332 cancel_delayed_work(&buf->wake_readers);
333 flush_scheduled_work();
334 }
335
336 buf->subbufs_produced = 0;
337 buf->subbufs_consumed = 0;
338 buf->bytes_consumed = 0;
339 buf->finalized = 0;
340 buf->data = buf->start;
341 buf->offset = 0;
342
343 for (i = 0; i < buf->chan->n_subbufs; i++)
344 buf->padding[i] = 0;
345
346 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
347}
348
349/**
350 * relay_reset - reset the channel
351 * @chan: the channel
352 *
353 * This has the effect of erasing all data from all channel buffers
354 * and restarting the channel in its initial state. The buffers
355 * are not freed, so any mappings are still in effect.
356 *
357 * NOTE: Care should be taken that the channel isn't actually
358 * being used by anything when this call is made.
359 */
360void relay_reset(struct rchan *chan)
361{
362 unsigned int i;
363 struct rchan_buf *prev = NULL;
364
365 if (!chan)
366 return;
367
368 for (i = 0; i < NR_CPUS; i++) {
369 if (!chan->buf[i] || chan->buf[i] == prev)
370 break;
371 __relay_reset(chan->buf[i], 0);
372 prev = chan->buf[i];
373 }
374}
375EXPORT_SYMBOL_GPL(relay_reset);
376
377/**
378 * relay_open_buf - create a new relay channel buffer
379 *
380 * Internal - used by relay_open().
381 */
382static struct rchan_buf *relay_open_buf(struct rchan *chan,
383 const char *filename,
384 struct dentry *parent,
385 int *is_global)
386{
387 struct rchan_buf *buf;
388 struct dentry *dentry;
389
390 if (*is_global)
391 return chan->buf[0];
392
393 buf = relay_create_buf(chan);
394 if (!buf)
395 return NULL;
396
397 /* Create file in fs */
398 dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
399 buf, is_global);
400 if (!dentry) {
401 relay_destroy_buf(buf);
402 return NULL;
403 }
404
405 buf->dentry = dentry;
406 __relay_reset(buf, 1);
407
408 return buf;
409}
410
411/**
412 * relay_close_buf - close a channel buffer
413 * @buf: channel buffer
414 *
415 * Marks the buffer finalized and restores the default callbacks.
416 * The channel buffer and channel buffer data structure are then freed
417 * automatically when the last reference is given up.
418 */
419static inline void relay_close_buf(struct rchan_buf *buf)
420{
421 buf->finalized = 1;
422 cancel_delayed_work(&buf->wake_readers);
423 flush_scheduled_work();
424 kref_put(&buf->kref, relay_remove_buf);
425}
426
427static inline void setup_callbacks(struct rchan *chan,
428 struct rchan_callbacks *cb)
429{
430 if (!cb) {
431 chan->cb = &default_channel_callbacks;
432 return;
433 }
434
435 if (!cb->subbuf_start)
436 cb->subbuf_start = subbuf_start_default_callback;
437 if (!cb->buf_mapped)
438 cb->buf_mapped = buf_mapped_default_callback;
439 if (!cb->buf_unmapped)
440 cb->buf_unmapped = buf_unmapped_default_callback;
441 if (!cb->create_buf_file)
442 cb->create_buf_file = create_buf_file_default_callback;
443 if (!cb->remove_buf_file)
444 cb->remove_buf_file = remove_buf_file_default_callback;
445 chan->cb = cb;
446}
447
448/**
449 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory
452 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions
455 *
456 * Returns channel pointer if successful, NULL otherwise.
457 *
458 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files
460 * will be named base_filename0...base_filenameN-1. File
461 * permissions will be S_IRUSR.
462 */
463struct rchan *relay_open(const char *base_filename,
464 struct dentry *parent,
465 size_t subbuf_size,
466 size_t n_subbufs,
467 struct rchan_callbacks *cb)
468{
469 unsigned int i;
470 struct rchan *chan;
471 char *tmpname;
472 int is_global = 0;
473
474 if (!base_filename)
475 return NULL;
476
477 if (!(subbuf_size && n_subbufs))
478 return NULL;
479
480 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
481 if (!chan)
482 return NULL;
483
484 chan->version = RELAYFS_CHANNEL_VERSION;
485 chan->n_subbufs = n_subbufs;
486 chan->subbuf_size = subbuf_size;
487 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
488 setup_callbacks(chan, cb);
489 kref_init(&chan->kref);
490
491 tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
492 if (!tmpname)
493 goto free_chan;
494
495 for_each_online_cpu(i) {
496 sprintf(tmpname, "%s%d", base_filename, i);
497 chan->buf[i] = relay_open_buf(chan, tmpname, parent,
498 &is_global);
499 if (!chan->buf[i])
500 goto free_bufs;
501
502 chan->buf[i]->cpu = i;
503 }
504
505 kfree(tmpname);
506 return chan;
507
508free_bufs:
509 for (i = 0; i < NR_CPUS; i++) {
510 if (!chan->buf[i])
511 break;
512 relay_close_buf(chan->buf[i]);
513 if (is_global)
514 break;
515 }
516 kfree(tmpname);
517
518free_chan:
519 kref_put(&chan->kref, relay_destroy_channel);
520 return NULL;
521}
522EXPORT_SYMBOL_GPL(relay_open);
523
524/**
525 * relay_switch_subbuf - switch to a new sub-buffer
526 * @buf: channel buffer
527 * @length: size of current event
528 *
529 * Returns either the length passed in or 0 if full.
530 *
531 * Performs sub-buffer-switch tasks such as invoking callbacks,
532 * updating padding counts, waking up readers, etc.
533 */
534size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
535{
536 void *old, *new;
537 size_t old_subbuf, new_subbuf;
538
539 if (unlikely(length > buf->chan->subbuf_size))
540 goto toobig;
541
542 if (buf->offset != buf->chan->subbuf_size + 1) {
543 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
544 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
545 buf->padding[old_subbuf] = buf->prev_padding;
546 buf->subbufs_produced++;
547 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
548 buf->padding[old_subbuf];
549 smp_mb();
550 if (waitqueue_active(&buf->read_wait)) {
551 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
552 schedule_delayed_work(&buf->wake_readers, 1);
553 }
554 }
555
556 old = buf->data;
557 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
558 new = buf->start + new_subbuf * buf->chan->subbuf_size;
559 buf->offset = 0;
560 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
561 buf->offset = buf->chan->subbuf_size + 1;
562 return 0;
563 }
564 buf->data = new;
565 buf->padding[new_subbuf] = 0;
566
567 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
568 goto toobig;
569
570 return length;
571
572toobig:
573 buf->chan->last_toobig = length;
574 return 0;
575}
576EXPORT_SYMBOL_GPL(relay_switch_subbuf);
577
578/**
579 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
580 * @chan: the channel
581 * @cpu: the cpu associated with the channel buffer to update
582 * @subbufs_consumed: number of sub-buffers to add to current buf's count
583 *
584 * Adds to the channel buffer's consumed sub-buffer count.
585 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed.
587 *
588 * NOTE: kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'.
590 */
591void relay_subbufs_consumed(struct rchan *chan,
592 unsigned int cpu,
593 size_t subbufs_consumed)
594{
595 struct rchan_buf *buf;
596
597 if (!chan)
598 return;
599
600 if (cpu >= NR_CPUS || !chan->buf[cpu])
601 return;
602
603 buf = chan->buf[cpu];
604 buf->subbufs_consumed += subbufs_consumed;
605 if (buf->subbufs_consumed > buf->subbufs_produced)
606 buf->subbufs_consumed = buf->subbufs_produced;
607}
608EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
609
610/**
611 * relay_close - close the channel
612 * @chan: the channel
613 *
614 * Closes all channel buffers and frees the channel.
615 */
616void relay_close(struct rchan *chan)
617{
618 unsigned int i;
619 struct rchan_buf *prev = NULL;
620
621 if (!chan)
622 return;
623
624 for (i = 0; i < NR_CPUS; i++) {
625 if (!chan->buf[i] || chan->buf[i] == prev)
626 break;
627 relay_close_buf(chan->buf[i]);
628 prev = chan->buf[i];
629 }
630
631 if (chan->last_toobig)
632 printk(KERN_WARNING "relay: one or more items not logged "
633 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
634 chan->last_toobig, chan->subbuf_size);
635
636 kref_put(&chan->kref, relay_destroy_channel);
637}
638EXPORT_SYMBOL_GPL(relay_close);
639
640/**
641 * relay_flush - flush the channel
642 * @chan: the channel
643 *
644 * Flushes all channel buffers i.e. forces buffer switch.
645 */
646void relay_flush(struct rchan *chan)
647{
648 unsigned int i;
649 struct rchan_buf *prev = NULL;
650
651 if (!chan)
652 return;
653
654 for (i = 0; i < NR_CPUS; i++) {
655 if (!chan->buf[i] || chan->buf[i] == prev)
656 break;
657 relay_switch_subbuf(chan->buf[i], 0);
658 prev = chan->buf[i];
659 }
660}
661EXPORT_SYMBOL_GPL(relay_flush);
662
663/**
664 * relay_file_open - open file op for relay files
665 * @inode: the inode
666 * @filp: the file
667 *
668 * Increments the channel buffer refcount.
669 */
670static int relay_file_open(struct inode *inode, struct file *filp)
671{
672 struct rchan_buf *buf = inode->u.generic_ip;
673 kref_get(&buf->kref);
674 filp->private_data = buf;
675
676 return 0;
677}
678
679/**
680 * relay_file_mmap - mmap file op for relay files
681 * @filp: the file
682 * @vma: the vma describing what to map
683 *
684 * Calls upon relay_mmap_buf to map the file into user space.
685 */
686static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
687{
688 struct rchan_buf *buf = filp->private_data;
689 return relay_mmap_buf(buf, vma);
690}
691
692/**
693 * relay_file_poll - poll file op for relay files
694 * @filp: the file
695 * @wait: poll table
696 *
697 * Poll implementation.
698 */
699static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
700{
701 unsigned int mask = 0;
702 struct rchan_buf *buf = filp->private_data;
703
704 if (buf->finalized)
705 return POLLERR;
706
707 if (filp->f_mode & FMODE_READ) {
708 poll_wait(filp, &buf->read_wait, wait);
709 if (!relay_buf_empty(buf))
710 mask |= POLLIN | POLLRDNORM;
711 }
712
713 return mask;
714}
715
716/**
717 * relay_file_release - release file op for relay files
718 * @inode: the inode
719 * @filp: the file
720 *
721 * Decrements the channel buffer refcount, as the filesystem is
722 * no longer using it.
723 */
724static int relay_file_release(struct inode *inode, struct file *filp)
725{
726 struct rchan_buf *buf = filp->private_data;
727 kref_put(&buf->kref, relay_remove_buf);
728
729 return 0;
730}
731
732/**
733 * relay_file_read_consume - update the consumed count for the buffer
734 */
735static void relay_file_read_consume(struct rchan_buf *buf,
736 size_t read_pos,
737 size_t bytes_consumed)
738{
739 size_t subbuf_size = buf->chan->subbuf_size;
740 size_t n_subbufs = buf->chan->n_subbufs;
741 size_t read_subbuf;
742
743 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
744 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
745 buf->bytes_consumed = 0;
746 }
747
748 buf->bytes_consumed += bytes_consumed;
749 read_subbuf = read_pos / buf->chan->subbuf_size;
750 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
751 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
752 (buf->offset == subbuf_size))
753 return;
754 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
755 buf->bytes_consumed = 0;
756 }
757}
758
759/**
760 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
763{
764 size_t subbuf_size = buf->chan->subbuf_size;
765 size_t n_subbufs = buf->chan->n_subbufs;
766 size_t produced = buf->subbufs_produced;
767 size_t consumed = buf->subbufs_consumed;
768
769 relay_file_read_consume(buf, read_pos, 0);
770
771 if (unlikely(buf->offset > subbuf_size)) {
772 if (produced == consumed)
773 return 0;
774 return 1;
775 }
776
777 if (unlikely(produced - consumed >= n_subbufs)) {
778 consumed = (produced / n_subbufs) * n_subbufs;
779 buf->subbufs_consumed = consumed;
780 }
781
782 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
783 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
784
785 if (consumed > produced)
786 produced += n_subbufs * subbuf_size;
787
788 if (consumed == produced)
789 return 0;
790
791 return 1;
792}
793
794/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
796 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf)
799{
800 size_t padding, avail = 0;
801 size_t read_subbuf, read_offset, write_subbuf, write_offset;
802 size_t subbuf_size = buf->chan->subbuf_size;
803
804 write_subbuf = (buf->data - buf->start) / subbuf_size;
805 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
806 read_subbuf = read_pos / subbuf_size;
807 read_offset = read_pos % subbuf_size;
808 padding = buf->padding[read_subbuf];
809
810 if (read_subbuf == write_subbuf) {
811 if (read_offset + padding < write_offset)
812 avail = write_offset - (read_offset + padding);
813 } else
814 avail = (subbuf_size - padding) - read_offset;
815
816 return avail;
817}
818
819/**
820 * relay_file_read_start_pos - find the first available byte to read
821 *
822 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise
824 * return the original value.
825 */
826static size_t relay_file_read_start_pos(size_t read_pos,
827 struct rchan_buf *buf)
828{
829 size_t read_subbuf, padding, padding_start, padding_end;
830 size_t subbuf_size = buf->chan->subbuf_size;
831 size_t n_subbufs = buf->chan->n_subbufs;
832
833 read_subbuf = read_pos / subbuf_size;
834 padding = buf->padding[read_subbuf];
835 padding_start = (read_subbuf + 1) * subbuf_size - padding;
836 padding_end = (read_subbuf + 1) * subbuf_size;
837 if (read_pos >= padding_start && read_pos < padding_end) {
838 read_subbuf = (read_subbuf + 1) % n_subbufs;
839 read_pos = read_subbuf * subbuf_size;
840 }
841
842 return read_pos;
843}
844
845/**
846 * relay_file_read_end_pos - return the new read position
847 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos,
850 size_t count)
851{
852 size_t read_subbuf, padding, end_pos;
853 size_t subbuf_size = buf->chan->subbuf_size;
854 size_t n_subbufs = buf->chan->n_subbufs;
855
856 read_subbuf = read_pos / subbuf_size;
857 padding = buf->padding[read_subbuf];
858 if (read_pos % subbuf_size + count + padding == subbuf_size)
859 end_pos = (read_subbuf + 1) * subbuf_size;
860 else
861 end_pos = read_pos + count;
862 if (end_pos >= subbuf_size * n_subbufs)
863 end_pos = 0;
864
865 return end_pos;
866}
867
868/**
869 * subbuf_read_actor - read up to one subbuf's worth of data
870 */
871static int subbuf_read_actor(size_t read_start,
872 struct rchan_buf *buf,
873 size_t avail,
874 read_descriptor_t *desc,
875 read_actor_t actor)
876{
877 void *from;
878 int ret = 0;
879
880 from = buf->start + read_start;
881 ret = avail;
882 if (copy_to_user(desc->arg.data, from, avail)) {
883 desc->error = -EFAULT;
884 ret = 0;
885 }
886 desc->arg.data += ret;
887 desc->written += ret;
888 desc->count -= ret;
889
890 return ret;
891}
892
893/**
894 * subbuf_send_actor - send up to one subbuf's worth of data
895 */
896static int subbuf_send_actor(size_t read_start,
897 struct rchan_buf *buf,
898 size_t avail,
899 read_descriptor_t *desc,
900 read_actor_t actor)
901{
902 unsigned long pidx, poff;
903 unsigned int subbuf_pages;
904 int ret = 0;
905
906 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
907 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
908 poff = read_start & ~PAGE_MASK;
909 while (avail) {
910 struct page *p = buf->page_array[pidx];
911 unsigned int len;
912
913 len = PAGE_SIZE - poff;
914 if (len > avail)
915 len = avail;
916
917 len = actor(desc, p, poff, len);
918 if (desc->error)
919 break;
920
921 avail -= len;
922 ret += len;
923 poff = 0;
924 pidx = (pidx + 1) % subbuf_pages;
925 }
926
927 return ret;
928}
929
930typedef int (*subbuf_actor_t) (size_t read_start,
931 struct rchan_buf *buf,
932 size_t avail,
933 read_descriptor_t *desc,
934 read_actor_t actor);
935
936/**
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp,
940 loff_t *ppos,
941 size_t count,
942 subbuf_actor_t subbuf_actor,
943 read_actor_t actor,
944 void *target)
945{
946 struct rchan_buf *buf = filp->private_data;
947 size_t read_start, avail;
948 read_descriptor_t desc;
949 int ret;
950
951 if (!count)
952 return 0;
953
954 desc.written = 0;
955 desc.count = count;
956 desc.arg.data = target;
957 desc.error = 0;
958
959 mutex_lock(&filp->f_dentry->d_inode->i_mutex);
960 do {
961 if (!relay_file_read_avail(buf, *ppos))
962 break;
963
964 read_start = relay_file_read_start_pos(*ppos, buf);
965 avail = relay_file_read_subbuf_avail(read_start, buf);
966 if (!avail)
967 break;
968
969 avail = min(desc.count, avail);
970 ret = subbuf_actor(read_start, buf, avail, &desc, actor);
971 if (desc.error < 0)
972 break;
973
974 if (ret) {
975 relay_file_read_consume(buf, read_start, ret);
976 *ppos = relay_file_read_end_pos(buf, read_start, ret);
977 }
978 } while (desc.count && ret);
979 mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
980
981 return desc.written;
982}
983
984static ssize_t relay_file_read(struct file *filp,
985 char __user *buffer,
986 size_t count,
987 loff_t *ppos)
988{
989 return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
990 NULL, buffer);
991}
992
993static ssize_t relay_file_sendfile(struct file *filp,
994 loff_t *ppos,
995 size_t count,
996 read_actor_t actor,
997 void *target)
998{
999 return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
1000 actor, target);
1001}
1002
1003struct file_operations relay_file_operations = {
1004 .open = relay_file_open,
1005 .poll = relay_file_poll,
1006 .mmap = relay_file_mmap,
1007 .read = relay_file_read,
1008 .llseek = no_llseek,
1009 .release = relay_file_release,
1010 .sendfile = relay_file_sendfile,
1011};
1012EXPORT_SYMBOL_GPL(relay_file_operations);
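
Editor's note: relay_file_operations is the table a relay client wires into the per-CPU buffer files it creates (blktrace is the familiar example), so the read path above is ultimately driven by an ordinary read(2) loop in userspace. A minimal consumer might look like the sketch below; the default path is an assumption for illustration only — the real name and location depend entirely on the client that created the channel. Note that relay reads never block: relay_file_read() returns 0 whenever no complete data is available, so a long-running consumer would poll(2) the descriptor and retry rather than treating 0 as end-of-file.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Drain one per-CPU relay buffer file to stdout.  The default path is a
 * placeholder; real clients choose their own names, usually under debugfs.
 */
int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/sys/kernel/debug/example/cpu0";
	char buf[8192];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* read() returns 0 when no complete data is currently available. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	if (n < 0)
		perror("read");
	close(fd);
	return n < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}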
diff --git a/kernel/sched.c b/kernel/sched.c
index bc38804e40..c13f1bd2df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
49#include <linux/syscalls.h> 49#include <linux/syscalls.h>
50#include <linux/times.h> 50#include <linux/times.h>
51#include <linux/acct.h> 51#include <linux/acct.h>
52#include <linux/kprobes.h>
52#include <asm/tlb.h> 53#include <asm/tlb.h>
53 54
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -144,7 +145,8 @@
144 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
145 146
146#define DELTA(p) \ 147#define DELTA(p) \
147 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
149 INTERACTIVE_DELTA)
148 150
149#define TASK_INTERACTIVE(p) \ 151#define TASK_INTERACTIVE(p) \
150 ((p)->prio <= (p)->static_prio - DELTA(p)) 152 ((p)->prio <= (p)->static_prio - DELTA(p))
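
Editor's note: the reworked DELTA() above is easier to see with numbers. The snippet below plugs in the values these constants have in this kernel (MAX_BONUS = 10 and INTERACTIVE_DELTA = 2 — stated as assumptions here, since they are defined outside the hunk) and prints both forms across the whole nice range: the two agree for nice >= 0, but the old form divided a negative operand for negative nice and truncated toward zero, while the new form shifts nice into a non-negative range first so the scaling rounds consistently, giving slightly different TASK_INTERACTIVE() thresholds for negatively reniced tasks.

#include <stdio.h>

/*
 * Assumed values for constants defined elsewhere in this kernel's sched.c:
 * MAX_BONUS = 10, INTERACTIVE_DELTA = 2.  SCALE() is the macro quoted in
 * the hunk above.
 */
#define MAX_BONUS         10
#define INTERACTIVE_DELTA 2
#define SCALE(v1, v1_max, v2_max)  ((v1) * (v2_max) / (v1_max))

static int delta_old(int nice)
{
	return SCALE(nice, 40, MAX_BONUS) + INTERACTIVE_DELTA;
}

static int delta_new(int nice)
{
	return SCALE(nice + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 +
		INTERACTIVE_DELTA;
}

int main(void)
{
	int nice;

	/* The forms agree for nice >= 0 and differ for some negative nice
	 * values, where the old one truncated toward zero instead of
	 * rounding down. */
	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d  old %3d  new %3d%s\n", nice,
		       delta_old(nice), delta_new(nice),
		       delta_old(nice) != delta_new(nice) ? "  <- differs" : "");
	return 0;
}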
@@ -178,13 +180,6 @@ static unsigned int task_timeslice(task_t *p)
178#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
179 < (long long) (sd)->cache_hot_time) 181 < (long long) (sd)->cache_hot_time)
180 182
181void __put_task_struct_cb(struct rcu_head *rhp)
182{
183 __put_task_struct(container_of(rhp, struct task_struct, rcu));
184}
185
186EXPORT_SYMBOL_GPL(__put_task_struct_cb);
187
188/* 183/*
189 * These are the runqueue data structures: 184 * These are the runqueue data structures:
190 */ 185 */
@@ -215,7 +210,6 @@ struct runqueue {
215 */ 210 */
216 unsigned long nr_running; 211 unsigned long nr_running;
217#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
218 unsigned long prio_bias;
219 unsigned long cpu_load[3]; 213 unsigned long cpu_load[3];
220#endif 214#endif
221 unsigned long long nr_switches; 215 unsigned long long nr_switches;
@@ -245,6 +239,7 @@ struct runqueue {
245 239
246 task_t *migration_thread; 240 task_t *migration_thread;
247 struct list_head migration_queue; 241 struct list_head migration_queue;
242 int cpu;
248#endif 243#endif
249 244
250#ifdef CONFIG_SCHEDSTATS 245#ifdef CONFIG_SCHEDSTATS
@@ -669,68 +664,17 @@ static int effective_prio(task_t *p)
669 return prio; 664 return prio;
670} 665}
671 666
672#ifdef CONFIG_SMP
673static inline void inc_prio_bias(runqueue_t *rq, int prio)
674{
675 rq->prio_bias += MAX_PRIO - prio;
676}
677
678static inline void dec_prio_bias(runqueue_t *rq, int prio)
679{
680 rq->prio_bias -= MAX_PRIO - prio;
681}
682
683static inline void inc_nr_running(task_t *p, runqueue_t *rq)
684{
685 rq->nr_running++;
686 if (rt_task(p)) {
687 if (p != rq->migration_thread)
688 /*
689 * The migration thread does the actual balancing. Do
690 * not bias by its priority as the ultra high priority
691 * will skew balancing adversely.
692 */
693 inc_prio_bias(rq, p->prio);
694 } else
695 inc_prio_bias(rq, p->static_prio);
696}
697
698static inline void dec_nr_running(task_t *p, runqueue_t *rq)
699{
700 rq->nr_running--;
701 if (rt_task(p)) {
702 if (p != rq->migration_thread)
703 dec_prio_bias(rq, p->prio);
704 } else
705 dec_prio_bias(rq, p->static_prio);
706}
707#else
708static inline void inc_prio_bias(runqueue_t *rq, int prio)
709{
710}
711
712static inline void dec_prio_bias(runqueue_t *rq, int prio)
713{
714}
715
716static inline void inc_nr_running(task_t *p, runqueue_t *rq)
717{
718 rq->nr_running++;
719}
720
721static inline void dec_nr_running(task_t *p, runqueue_t *rq)
722{
723 rq->nr_running--;
724}
725#endif
726
727/* 667/*
728 * __activate_task - move a task to the runqueue. 668 * __activate_task - move a task to the runqueue.
729 */ 669 */
730static inline void __activate_task(task_t *p, runqueue_t *rq) 670static void __activate_task(task_t *p, runqueue_t *rq)
731{ 671{
732 enqueue_task(p, rq->active); 672 prio_array_t *target = rq->active;
733 inc_nr_running(p, rq); 673
674 if (batch_task(p))
675 target = rq->expired;
676 enqueue_task(p, target);
677 rq->nr_running++;
734} 678}
735 679
736/* 680/*
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
739static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 683static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
740{ 684{
741 enqueue_task_head(p, rq->active); 685 enqueue_task_head(p, rq->active);
742 inc_nr_running(p, rq); 686 rq->nr_running++;
743} 687}
744 688
745static int recalc_task_prio(task_t *p, unsigned long long now) 689static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -748,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
748 unsigned long long __sleep_time = now - p->timestamp; 692 unsigned long long __sleep_time = now - p->timestamp;
749 unsigned long sleep_time; 693 unsigned long sleep_time;
750 694
751 if (unlikely(p->policy == SCHED_BATCH)) 695 if (batch_task(p))
752 sleep_time = 0; 696 sleep_time = 0;
753 else { 697 else {
754 if (__sleep_time > NS_MAX_SLEEP_AVG) 698 if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -760,27 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
760 if (likely(sleep_time > 0)) { 704 if (likely(sleep_time > 0)) {
761 /* 705 /*
762 * User tasks that sleep a long time are categorised as 706 * User tasks that sleep a long time are categorised as
763 * idle and will get just interactive status to stay active & 707 * idle. They will only have their sleep_avg increased to a
764 * prevent them suddenly becoming cpu hogs and starving 708 * level that makes them just interactive priority to stay
765 * other processes. 709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
766 */ 711 */
767 if (p->mm && p->activated != -1 && 712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
768 sleep_time > INTERACTIVE_SLEEP(p)) { 713 unsigned long ceiling;
769 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
770 DEF_TIMESLICE);
771 } else {
772 /*
773 * The lower the sleep avg a task has the more
774 * rapidly it will rise with sleep time.
775 */
776 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
777 714
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
716 DEF_TIMESLICE);
717 if (p->sleep_avg < ceiling)
718 p->sleep_avg = ceiling;
719 } else {
778 /* 720 /*
779 * Tasks waking from uninterruptible sleep are 721 * Tasks waking from uninterruptible sleep are
780 * limited in their sleep_avg rise as they 722 * limited in their sleep_avg rise as they
781 * are likely to be waiting on I/O 723 * are likely to be waiting on I/O
782 */ 724 */
783 if (p->activated == -1 && p->mm) { 725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
784 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
785 sleep_time = 0; 727 sleep_time = 0;
786 else if (p->sleep_avg + sleep_time >= 728 else if (p->sleep_avg + sleep_time >=
@@ -835,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
835 * This checks to make sure it's not an uninterruptible task 777 * This checks to make sure it's not an uninterruptible task
836 * that is now waking up. 778 * that is now waking up.
837 */ 779 */
838 if (!p->activated) { 780 if (p->sleep_type == SLEEP_NORMAL) {
839 /* 781 /*
840 * Tasks which were woken up by interrupts (ie. hw events) 782 * Tasks which were woken up by interrupts (ie. hw events)
841 * are most likely of interactive nature. So we give them 783 * are most likely of interactive nature. So we give them
@@ -844,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
844 * on a CPU, first time around: 786 * on a CPU, first time around:
845 */ 787 */
846 if (in_interrupt()) 788 if (in_interrupt())
847 p->activated = 2; 789 p->sleep_type = SLEEP_INTERRUPTED;
848 else { 790 else {
849 /* 791 /*
850 * Normal first-time wakeups get a credit too for 792 * Normal first-time wakeups get a credit too for
851 * on-runqueue time, but it will be weighted down: 793 * on-runqueue time, but it will be weighted down:
852 */ 794 */
853 p->activated = 1; 795 p->sleep_type = SLEEP_INTERACTIVE;
854 } 796 }
855 } 797 }
856 p->timestamp = now; 798 p->timestamp = now;
@@ -863,7 +805,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
863 */ 805 */
864static void deactivate_task(struct task_struct *p, runqueue_t *rq) 806static void deactivate_task(struct task_struct *p, runqueue_t *rq)
865{ 807{
866 dec_nr_running(p, rq); 808 rq->nr_running--;
867 dequeue_task(p, p->array); 809 dequeue_task(p, p->array);
868 p->array = NULL; 810 p->array = NULL;
869} 811}
@@ -1007,61 +949,27 @@ void kick_process(task_t *p)
1007 * We want to under-estimate the load of migration sources, to 949 * We want to under-estimate the load of migration sources, to
1008 * balance conservatively. 950 * balance conservatively.
1009 */ 951 */
1010static unsigned long __source_load(int cpu, int type, enum idle_type idle) 952static inline unsigned long source_load(int cpu, int type)
1011{ 953{
1012 runqueue_t *rq = cpu_rq(cpu); 954 runqueue_t *rq = cpu_rq(cpu);
1013 unsigned long running = rq->nr_running; 955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1014 unsigned long source_load, cpu_load = rq->cpu_load[type-1],
1015 load_now = running * SCHED_LOAD_SCALE;
1016
1017 if (type == 0) 956 if (type == 0)
1018 source_load = load_now; 957 return load_now;
1019 else
1020 source_load = min(cpu_load, load_now);
1021 958
1022 if (running > 1 || (idle == NOT_IDLE && running)) 959 return min(rq->cpu_load[type-1], load_now);
1023 /*
1024 * If we are busy rebalancing the load is biased by
1025 * priority to create 'nice' support across cpus. When
1026 * idle rebalancing we should only bias the source_load if
1027 * there is more than one task running on that queue to
1028 * prevent idle rebalance from trying to pull tasks from a
1029 * queue with only one running task.
1030 */
1031 source_load = source_load * rq->prio_bias / running;
1032
1033 return source_load;
1034}
1035
1036static inline unsigned long source_load(int cpu, int type)
1037{
1038 return __source_load(cpu, type, NOT_IDLE);
1039} 960}
1040 961
1041/* 962/*
1042 * Return a high guess at the load of a migration-target cpu 963 * Return a high guess at the load of a migration-target cpu
1043 */ 964 */
1044static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) 965static inline unsigned long target_load(int cpu, int type)
1045{ 966{
1046 runqueue_t *rq = cpu_rq(cpu); 967 runqueue_t *rq = cpu_rq(cpu);
1047 unsigned long running = rq->nr_running; 968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1048 unsigned long target_load, cpu_load = rq->cpu_load[type-1],
1049 load_now = running * SCHED_LOAD_SCALE;
1050
1051 if (type == 0) 969 if (type == 0)
1052 target_load = load_now; 970 return load_now;
1053 else
1054 target_load = max(cpu_load, load_now);
1055 971
1056 if (running > 1 || (idle == NOT_IDLE && running)) 972 return max(rq->cpu_load[type-1], load_now);
1057 target_load = target_load * rq->prio_bias / running;
1058
1059 return target_load;
1060}
1061
1062static inline unsigned long target_load(int cpu, int type)
1063{
1064 return __target_load(cpu, type, NOT_IDLE);
1065} 973}
1066 974
1067/* 975/*
@@ -1294,9 +1202,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1294 } 1202 }
1295 } 1203 }
1296 1204
1297 if (p->last_waker_cpu != this_cpu)
1298 goto out_set_cpu;
1299
1300 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1205 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1301 goto out_set_cpu; 1206 goto out_set_cpu;
1302 1207
@@ -1367,8 +1272,6 @@ out_set_cpu:
1367 cpu = task_cpu(p); 1272 cpu = task_cpu(p);
1368 } 1273 }
1369 1274
1370 p->last_waker_cpu = this_cpu;
1371
1372out_activate: 1275out_activate:
1373#endif /* CONFIG_SMP */ 1276#endif /* CONFIG_SMP */
1374 if (old_state == TASK_UNINTERRUPTIBLE) { 1277 if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1377,19 +1280,19 @@ out_activate:
1377 * Tasks on involuntary sleep don't earn 1280 * Tasks on involuntary sleep don't earn
1378 * sleep_avg beyond just interactive state. 1281 * sleep_avg beyond just interactive state.
1379 */ 1282 */
1380 p->activated = -1; 1283 p->sleep_type = SLEEP_NONINTERACTIVE;
1381 } 1284 } else
1382 1285
1383 /* 1286 /*
1384 * Tasks that have marked their sleep as noninteractive get 1287 * Tasks that have marked their sleep as noninteractive get
1385 * woken up without updating their sleep average. (i.e. their 1288 * woken up with their sleep average not weighted in an
1386 * sleep is handled in a priority-neutral manner, no priority 1289 * interactive way.
1387 * boost and no penalty.)
1388 */ 1290 */
1389 if (old_state & TASK_NONINTERACTIVE) 1291 if (old_state & TASK_NONINTERACTIVE)
1390 __activate_task(p, rq); 1292 p->sleep_type = SLEEP_NONINTERACTIVE;
1391 else 1293
1392 activate_task(p, rq, cpu == this_cpu); 1294
1295 activate_task(p, rq, cpu == this_cpu);
1393 /* 1296 /*
1394 * Sync wakeups (i.e. those types of wakeups where the waker 1297 * Sync wakeups (i.e. those types of wakeups where the waker
1395 * has indicated that it will leave the CPU in short order) 1298 * has indicated that it will leave the CPU in short order)
@@ -1450,12 +1353,9 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1450#ifdef CONFIG_SCHEDSTATS 1353#ifdef CONFIG_SCHEDSTATS
1451 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1354 memset(&p->sched_info, 0, sizeof(p->sched_info));
1452#endif 1355#endif
1453#if defined(CONFIG_SMP) 1356#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1454 p->last_waker_cpu = cpu;
1455#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
1456 p->oncpu = 0; 1357 p->oncpu = 0;
1457#endif 1358#endif
1458#endif
1459#ifdef CONFIG_PREEMPT 1359#ifdef CONFIG_PREEMPT
1460 /* Want to start with kernel preemption disabled. */ 1360 /* Want to start with kernel preemption disabled. */
1461 task_thread_info(p)->preempt_count = 1; 1361 task_thread_info(p)->preempt_count = 1;
@@ -1530,7 +1430,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1530 list_add_tail(&p->run_list, &current->run_list); 1430 list_add_tail(&p->run_list, &current->run_list);
1531 p->array = current->array; 1431 p->array = current->array;
1532 p->array->nr_active++; 1432 p->array->nr_active++;
1533 inc_nr_running(p, rq); 1433 rq->nr_running++;
1534 } 1434 }
1535 set_need_resched(); 1435 set_need_resched();
1536 } else 1436 } else
@@ -1656,8 +1556,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1656 finish_lock_switch(rq, prev); 1556 finish_lock_switch(rq, prev);
1657 if (mm) 1557 if (mm)
1658 mmdrop(mm); 1558 mmdrop(mm);
1659 if (unlikely(prev_task_flags & PF_DEAD)) 1559 if (unlikely(prev_task_flags & PF_DEAD)) {
1560 /*
1561 * Remove function-return probe instances associated with this
1562 * task and put them back on the free list.
1563 */
1564 kprobe_flush_task(prev);
1660 put_task_struct(prev); 1565 put_task_struct(prev);
1566 }
1661} 1567}
1662 1568
1663/** 1569/**
@@ -1727,7 +1633,7 @@ unsigned long nr_uninterruptible(void)
1727{ 1633{
1728 unsigned long i, sum = 0; 1634 unsigned long i, sum = 0;
1729 1635
1730 for_each_cpu(i) 1636 for_each_possible_cpu(i)
1731 sum += cpu_rq(i)->nr_uninterruptible; 1637 sum += cpu_rq(i)->nr_uninterruptible;
1732 1638
1733 /* 1639 /*
@@ -1744,7 +1650,7 @@ unsigned long long nr_context_switches(void)
1744{ 1650{
1745 unsigned long long i, sum = 0; 1651 unsigned long long i, sum = 0;
1746 1652
1747 for_each_cpu(i) 1653 for_each_possible_cpu(i)
1748 sum += cpu_rq(i)->nr_switches; 1654 sum += cpu_rq(i)->nr_switches;
1749 1655
1750 return sum; 1656 return sum;
@@ -1754,17 +1660,35 @@ unsigned long nr_iowait(void)
1754{ 1660{
1755 unsigned long i, sum = 0; 1661 unsigned long i, sum = 0;
1756 1662
1757 for_each_cpu(i) 1663 for_each_possible_cpu(i)
1758 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1664 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1759 1665
1760 return sum; 1666 return sum;
1761} 1667}
1762 1668
1669unsigned long nr_active(void)
1670{
1671 unsigned long i, running = 0, uninterruptible = 0;
1672
1673 for_each_online_cpu(i) {
1674 running += cpu_rq(i)->nr_running;
1675 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1676 }
1677
1678 if (unlikely((long)uninterruptible < 0))
1679 uninterruptible = 0;
1680
1681 return running + uninterruptible;
1682}
1683
1763#ifdef CONFIG_SMP 1684#ifdef CONFIG_SMP
1764 1685
1765/* 1686/*
1766 * double_rq_lock - safely lock two runqueues 1687 * double_rq_lock - safely lock two runqueues
1767 * 1688 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1768 * Note this does not disable interrupts like task_rq_lock, 1692 * Note this does not disable interrupts like task_rq_lock,
1769 * you need to do so manually before calling. 1693 * you need to do so manually before calling.
1770 */ 1694 */
@@ -1776,7 +1700,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1776 spin_lock(&rq1->lock); 1700 spin_lock(&rq1->lock);
1777 __acquire(rq2->lock); /* Fake it out ;) */ 1701 __acquire(rq2->lock); /* Fake it out ;) */
1778 } else { 1702 } else {
1779 if (rq1 < rq2) { 1703 if (rq1->cpu < rq2->cpu) {
1780 spin_lock(&rq1->lock); 1704 spin_lock(&rq1->lock);
1781 spin_lock(&rq2->lock); 1705 spin_lock(&rq2->lock);
1782 } else { 1706 } else {
@@ -1812,7 +1736,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1812 __acquires(this_rq->lock) 1736 __acquires(this_rq->lock)
1813{ 1737{
1814 if (unlikely(!spin_trylock(&busiest->lock))) { 1738 if (unlikely(!spin_trylock(&busiest->lock))) {
1815 if (busiest < this_rq) { 1739 if (busiest->cpu < this_rq->cpu) {
1816 spin_unlock(&this_rq->lock); 1740 spin_unlock(&this_rq->lock);
1817 spin_lock(&busiest->lock); 1741 spin_lock(&busiest->lock);
1818 spin_lock(&this_rq->lock); 1742 spin_lock(&this_rq->lock);
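
Editor's note: both hunks above replace ordering-by-pointer with ordering-by-rq->cpu, but the deadlock-avoidance rule itself is unchanged: when two runqueue locks may be taken by different CPUs in either order, always acquire the one with the lower index first so no AB-BA cycle can form, and use an index that other paths (the dependent_sleeper code named in the new comment) can apply too. A self-contained pthread sketch of that rule — nothing here is kernel API, the names are invented:

#include <pthread.h>
#include <stdio.h>

/* Toy stand-in for a runqueue: a lock plus a stable ordering index. */
struct toy_rq {
	pthread_mutex_t lock;
	int cpu;
};

/*
 * Lock two "runqueues" in a global order (lowest cpu index first), the same
 * discipline double_rq_lock() enforces above.  Threads that all follow this
 * rule can never deadlock against each other on these two locks.
 */
static void toy_double_lock(struct toy_rq *a, struct toy_rq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	if (a->cpu < b->cpu) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void toy_double_unlock(struct toy_rq *a, struct toy_rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct toy_rq rq0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_rq rq1 = { PTHREAD_MUTEX_INITIALIZER, 1 };

	/* Either argument order acquires rq0.lock before rq1.lock. */
	toy_double_lock(&rq1, &rq0);
	printf("locked rq%d and rq%d in cpu order\n", rq0.cpu, rq1.cpu);
	toy_double_unlock(&rq1, &rq0);
	return 0;
}

The kernel versions additionally expect the caller to have disabled interrupts, and double_lock_balance() uses spin_trylock() so this_rq->lock only has to be released when busiest is actually contended; the toy above illustrates just the ordering rule.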
@@ -1875,9 +1799,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1875 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1876{ 1800{
1877 dequeue_task(p, src_array); 1801 dequeue_task(p, src_array);
1878 dec_nr_running(p, src_rq); 1802 src_rq->nr_running--;
1879 set_task_cpu(p, this_cpu); 1803 set_task_cpu(p, this_cpu);
1880 inc_nr_running(p, this_rq); 1804 this_rq->nr_running++;
1881 enqueue_task(p, this_array); 1805 enqueue_task(p, this_array);
1882 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1883 + this_rq->timestamp_last_tick; 1807 + this_rq->timestamp_last_tick;
@@ -2056,9 +1980,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2056 1980
2057 /* Bias balancing toward cpus of our domain */ 1981 /* Bias balancing toward cpus of our domain */
2058 if (local_group) 1982 if (local_group)
2059 load = __target_load(i, load_idx, idle); 1983 load = target_load(i, load_idx);
2060 else 1984 else
2061 load = __source_load(i, load_idx, idle); 1985 load = source_load(i, load_idx);
2062 1986
2063 avg_load += load; 1987 avg_load += load;
2064 } 1988 }
@@ -2171,7 +2095,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2171 int i; 2095 int i;
2172 2096
2173 for_each_cpu_mask(i, group->cpumask) { 2097 for_each_cpu_mask(i, group->cpumask) {
2174 load = __source_load(i, 0, idle); 2098 load = source_load(i, 0);
2175 2099
2176 if (load > max_load) { 2100 if (load > max_load) {
2177 max_load = load; 2101 max_load = load;
@@ -2959,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
2959 2883
2960#endif 2884#endif
2961 2885
2886static inline int interactive_sleep(enum sleep_type sleep_type)
2887{
2888 return (sleep_type == SLEEP_INTERACTIVE ||
2889 sleep_type == SLEEP_INTERRUPTED);
2890}
2891
2962/* 2892/*
2963 * schedule() is the main scheduler function. 2893 * schedule() is the main scheduler function.
2964 */ 2894 */
@@ -2978,13 +2908,11 @@ asmlinkage void __sched schedule(void)
2978 * schedule() atomically, we ignore that path for now. 2908 * schedule() atomically, we ignore that path for now.
2979 * Otherwise, whine if we are scheduling when we should not be. 2909 * Otherwise, whine if we are scheduling when we should not be.
2980 */ 2910 */
2981 if (likely(!current->exit_state)) { 2911 if (unlikely(in_atomic() && !current->exit_state)) {
2982 if (unlikely(in_atomic())) { 2912 printk(KERN_ERR "BUG: scheduling while atomic: "
2983 printk(KERN_ERR "scheduling while atomic: " 2913 "%s/0x%08x/%d\n",
2984 "%s/0x%08x/%d\n", 2914 current->comm, preempt_count(), current->pid);
2985 current->comm, preempt_count(), current->pid); 2915 dump_stack();
2986 dump_stack();
2987 }
2988 } 2916 }
2989 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2917 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2990 2918
@@ -3084,12 +3012,12 @@ go_idle:
3084 queue = array->queue + idx; 3012 queue = array->queue + idx;
3085 next = list_entry(queue->next, task_t, run_list); 3013 next = list_entry(queue->next, task_t, run_list);
3086 3014
3087 if (!rt_task(next) && next->activated > 0) { 3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3088 unsigned long long delta = now - next->timestamp; 3016 unsigned long long delta = now - next->timestamp;
3089 if (unlikely((long long)(now - next->timestamp) < 0)) 3017 if (unlikely((long long)(now - next->timestamp) < 0))
3090 delta = 0; 3018 delta = 0;
3091 3019
3092 if (next->activated == 1) 3020 if (next->sleep_type == SLEEP_INTERACTIVE)
3093 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3021 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3094 3022
3095 array = next->array; 3023 array = next->array;
@@ -3099,10 +3027,9 @@ go_idle:
3099 dequeue_task(next, array); 3027 dequeue_task(next, array);
3100 next->prio = new_prio; 3028 next->prio = new_prio;
3101 enqueue_task(next, array); 3029 enqueue_task(next, array);
3102 } else 3030 }
3103 requeue_task(next, array);
3104 } 3031 }
3105 next->activated = 0; 3032 next->sleep_type = SLEEP_NORMAL;
3106switch_tasks: 3033switch_tasks:
3107 if (next == rq->idle) 3034 if (next == rq->idle)
3108 schedstat_inc(rq, sched_goidle); 3035 schedstat_inc(rq, sched_goidle);
@@ -3571,10 +3498,8 @@ void set_user_nice(task_t *p, long nice)
3571 goto out_unlock; 3498 goto out_unlock;
3572 } 3499 }
3573 array = p->array; 3500 array = p->array;
3574 if (array) { 3501 if (array)
3575 dequeue_task(p, array); 3502 dequeue_task(p, array);
3576 dec_prio_bias(rq, p->static_prio);
3577 }
3578 3503
3579 old_prio = p->prio; 3504 old_prio = p->prio;
3580 new_prio = NICE_TO_PRIO(nice); 3505 new_prio = NICE_TO_PRIO(nice);
@@ -3584,7 +3509,6 @@ void set_user_nice(task_t *p, long nice)
3584 3509
3585 if (array) { 3510 if (array) {
3586 enqueue_task(p, array); 3511 enqueue_task(p, array);
3587 inc_prio_bias(rq, p->static_prio);
3588 /* 3512 /*
3589 * If the task increased its priority or is running and 3513 * If the task increased its priority or is running and
3590 * lowered its priority, then reschedule its CPU: 3514 * lowered its priority, then reschedule its CPU:
@@ -4129,6 +4053,8 @@ static inline void __cond_resched(void)
4129 */ 4053 */
4130 if (unlikely(preempt_count())) 4054 if (unlikely(preempt_count()))
4131 return; 4055 return;
4056 if (unlikely(system_state != SYSTEM_RUNNING))
4057 return;
4132 do { 4058 do {
4133 add_preempt_count(PREEMPT_ACTIVE); 4059 add_preempt_count(PREEMPT_ACTIVE);
4134 schedule(); 4060 schedule();
@@ -4434,6 +4360,7 @@ void __devinit init_idle(task_t *idle, int cpu)
4434 runqueue_t *rq = cpu_rq(cpu); 4360 runqueue_t *rq = cpu_rq(cpu);
4435 unsigned long flags; 4361 unsigned long flags;
4436 4362
4363 idle->timestamp = sched_clock();
4437 idle->sleep_avg = 0; 4364 idle->sleep_avg = 0;
4438 idle->array = NULL; 4365 idle->array = NULL;
4439 idle->prio = MAX_PRIO; 4366 idle->prio = MAX_PRIO;
@@ -4861,7 +4788,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4861/* Register at highest priority so that task migration (migrate_all_tasks) 4788/* Register at highest priority so that task migration (migrate_all_tasks)
4862 * happens before everything else. 4789 * happens before everything else.
4863 */ 4790 */
4864static struct notifier_block __devinitdata migration_notifier = { 4791static struct notifier_block migration_notifier = {
4865 .notifier_call = migration_call, 4792 .notifier_call = migration_call,
4866 .priority = 10 4793 .priority = 10
4867}; 4794};
@@ -5159,7 +5086,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5159#define MAX_DOMAIN_DISTANCE 32 5086#define MAX_DOMAIN_DISTANCE 32
5160 5087
5161static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = 5088static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5162 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; 5089 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5090/*
5091 * Architectures may override the migration cost and thus avoid
5092 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5093 * virtualized hardware:
5094 */
5095#ifdef CONFIG_DEFAULT_MIGRATION_COST
5096 CONFIG_DEFAULT_MIGRATION_COST
5097#else
5098 -1LL
5099#endif
5100};
5163 5101
5164/* 5102/*
5165 * Allow override of migration cost - in units of microseconds. 5103 * Allow override of migration cost - in units of microseconds.
@@ -5664,11 +5602,31 @@ static int cpu_to_cpu_group(int cpu)
5664} 5602}
5665#endif 5603#endif
5666 5604
5605#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS];
5608#endif
5609
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5611static int cpu_to_core_group(int cpu)
5612{
5613 return first_cpu(cpu_sibling_map[cpu]);
5614}
5615#elif defined(CONFIG_SCHED_MC)
5616static int cpu_to_core_group(int cpu)
5617{
5618 return cpu;
5619}
5620#endif
5621
5667static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5622static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5668static struct sched_group sched_group_phys[NR_CPUS]; 5623static struct sched_group sched_group_phys[NR_CPUS];
5669static int cpu_to_phys_group(int cpu) 5624static int cpu_to_phys_group(int cpu)
5670{ 5625{
5671#ifdef CONFIG_SCHED_SMT 5626#if defined(CONFIG_SCHED_MC)
5627 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT)
5672 return first_cpu(cpu_sibling_map[cpu]); 5630 return first_cpu(cpu_sibling_map[cpu]);
5673#else 5631#else
5674 return cpu; 5632 return cpu;
@@ -5691,6 +5649,32 @@ static int cpu_to_allnodes_group(int cpu)
5691{ 5649{
5692 return cpu_to_node(cpu); 5650 return cpu_to_node(cpu);
5693} 5651}
5652static void init_numa_sched_groups_power(struct sched_group *group_head)
5653{
5654 struct sched_group *sg = group_head;
5655 int j;
5656
5657 if (!sg)
5658 return;
5659next_sg:
5660 for_each_cpu_mask(j, sg->cpumask) {
5661 struct sched_domain *sd;
5662
5663 sd = &per_cpu(phys_domains, j);
5664 if (j != first_cpu(sd->groups->cpumask)) {
5665 /*
5666 * Only add "power" once for each
5667 * physical package.
5668 */
5669 continue;
5670 }
5671
5672 sg->cpu_power += sd->groups->cpu_power;
5673 }
5674 sg = sg->next;
5675 if (sg != group_head)
5676 goto next_sg;
5677}
5694#endif 5678#endif
5695 5679
5696/* 5680/*
@@ -5766,6 +5750,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5766 sd->parent = p; 5750 sd->parent = p;
5767 sd->groups = &sched_group_phys[group]; 5751 sd->groups = &sched_group_phys[group];
5768 5752
5753#ifdef CONFIG_SCHED_MC
5754 p = sd;
5755 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i);
5757 *sd = SD_MC_INIT;
5758 sd->span = cpu_coregroup_map(i);
5759 cpus_and(sd->span, sd->span, *cpu_map);
5760 sd->parent = p;
5761 sd->groups = &sched_group_core[group];
5762#endif
5763
5769#ifdef CONFIG_SCHED_SMT 5764#ifdef CONFIG_SCHED_SMT
5770 p = sd; 5765 p = sd;
5771 sd = &per_cpu(cpu_domains, i); 5766 sd = &per_cpu(cpu_domains, i);
@@ -5791,6 +5786,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5791 } 5786 }
5792#endif 5787#endif
5793 5788
5789#ifdef CONFIG_SCHED_MC
5790 /* Set up multi-core groups */
5791 for_each_cpu_mask(i, *cpu_map) {
5792 cpumask_t this_core_map = cpu_coregroup_map(i);
5793 cpus_and(this_core_map, this_core_map, *cpu_map);
5794 if (i != first_cpu(this_core_map))
5795 continue;
5796 init_sched_build_groups(sched_group_core, this_core_map,
5797 &cpu_to_core_group);
5798 }
5799#endif
5800
5801
5794 /* Set up physical groups */ 5802 /* Set up physical groups */
5795 for (i = 0; i < MAX_NUMNODES; i++) { 5803 for (i = 0; i < MAX_NUMNODES; i++) {
5796 cpumask_t nodemask = node_to_cpumask(i); 5804 cpumask_t nodemask = node_to_cpumask(i);
@@ -5887,51 +5895,38 @@ void build_sched_domains(const cpumask_t *cpu_map)
5887 power = SCHED_LOAD_SCALE; 5895 power = SCHED_LOAD_SCALE;
5888 sd->groups->cpu_power = power; 5896 sd->groups->cpu_power = power;
5889#endif 5897#endif
5898#ifdef CONFIG_SCHED_MC
5899 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power;
5890 5903
5891 sd = &per_cpu(phys_domains, i); 5904 sd = &per_cpu(phys_domains, i);
5905
5906 /*
5907 * This has to be < 2 * SCHED_LOAD_SCALE
5908 * Lets keep it SCHED_LOAD_SCALE, so that
5909 * while calculating NUMA group's cpu_power
5910 * we can simply do
5911 * numa_group->cpu_power += phys_group->cpu_power;
5912 *
5913 * See "only add power once for each physical pkg"
5914 * comment below
5915 */
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else
5918 sd = &per_cpu(phys_domains, i);
5892 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5893 (cpus_weight(sd->groups->cpumask)-1) / 10; 5920 (cpus_weight(sd->groups->cpumask)-1) / 10;
5894 sd->groups->cpu_power = power; 5921 sd->groups->cpu_power = power;
5895
5896#ifdef CONFIG_NUMA
5897 sd = &per_cpu(allnodes_domains, i);
5898 if (sd->groups) {
5899 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5900 (cpus_weight(sd->groups->cpumask)-1) / 10;
5901 sd->groups->cpu_power = power;
5902 }
5903#endif 5922#endif
5904 } 5923 }
5905 5924
5906#ifdef CONFIG_NUMA 5925#ifdef CONFIG_NUMA
5907 for (i = 0; i < MAX_NUMNODES; i++) { 5926 for (i = 0; i < MAX_NUMNODES; i++)
5908 struct sched_group *sg = sched_group_nodes[i]; 5927 init_numa_sched_groups_power(sched_group_nodes[i]);
5909 int j;
5910
5911 if (sg == NULL)
5912 continue;
5913next_sg:
5914 for_each_cpu_mask(j, sg->cpumask) {
5915 struct sched_domain *sd;
5916 int power;
5917
5918 sd = &per_cpu(phys_domains, j);
5919 if (j != first_cpu(sd->groups->cpumask)) {
5920 /*
5921 * Only add "power" once for each
5922 * physical package.
5923 */
5924 continue;
5925 }
5926 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5927 (cpus_weight(sd->groups->cpumask)-1) / 10;
5928 5928
5929 sg->cpu_power += power; 5929 init_numa_sched_groups_power(sched_group_allnodes);
5930 }
5931 sg = sg->next;
5932 if (sg != sched_group_nodes[i])
5933 goto next_sg;
5934 }
5935#endif 5930#endif
5936 5931
5937 /* Attach the domains */ 5932 /* Attach the domains */
@@ -5939,6 +5934,8 @@ next_sg:
5939 struct sched_domain *sd; 5934 struct sched_domain *sd;
5940#ifdef CONFIG_SCHED_SMT 5935#ifdef CONFIG_SCHED_SMT
5941 sd = &per_cpu(cpu_domains, i); 5936 sd = &per_cpu(cpu_domains, i);
5937#elif defined(CONFIG_SCHED_MC)
5938 sd = &per_cpu(core_domains, i);
5942#else 5939#else
5943 sd = &per_cpu(phys_domains, i); 5940 sd = &per_cpu(phys_domains, i);
5944#endif 5941#endif
@@ -6111,7 +6108,7 @@ void __init sched_init(void)
6111 runqueue_t *rq; 6108 runqueue_t *rq;
6112 int i, j, k; 6109 int i, j, k;
6113 6110
6114 for_each_cpu(i) { 6111 for_each_possible_cpu(i) {
6115 prio_array_t *array; 6112 prio_array_t *array;
6116 6113
6117 rq = cpu_rq(i); 6114 rq = cpu_rq(i);
@@ -6129,6 +6126,7 @@ void __init sched_init(void)
6129 rq->push_cpu = 0; 6126 rq->push_cpu = 0;
6130 rq->migration_thread = NULL; 6127 rq->migration_thread = NULL;
6131 INIT_LIST_HEAD(&rq->migration_queue); 6128 INIT_LIST_HEAD(&rq->migration_queue);
6129 rq->cpu = i;
6132#endif 6130#endif
6133 atomic_set(&rq->nr_iowait, 0); 6131 atomic_set(&rq->nr_iowait, 0);
6134 6132
@@ -6169,7 +6167,7 @@ void __might_sleep(char *file, int line)
6169 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6167 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6170 return; 6168 return;
6171 prev_jiffy = jiffies; 6169 prev_jiffy = jiffies;
6172 printk(KERN_ERR "Debug: sleeping function called from invalid" 6170 printk(KERN_ERR "BUG: sleeping function called from invalid"
6173 " context at %s:%d\n", file, line); 6171 " context at %s:%d\n", file, line);
6174 printk("in_atomic():%d, irqs_disabled():%d\n", 6172 printk("in_atomic():%d, irqs_disabled():%d\n",
6175 in_atomic(), irqs_disabled()); 6173 in_atomic(), irqs_disabled());
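
Editor's note: one more arithmetic check on the sched.c changes before the signal.c diff. With CONFIG_SCHED_MC, build_sched_domains() above gives the new core-level group a cpu_power of one full SCHED_LOAD_SCALE for the first CPU plus 10% of the scale per additional CPU in the core, and pins the physical level at exactly SCHED_LOAD_SCALE so the NUMA code can simply sum its children; without CONFIG_SCHED_MC the physical group keeps the same +10%-per-extra-CPU formula. The figures are easy to sanity-check in isolation — SCHED_LOAD_SCALE = 128 below is an assumption taken from this kernel's headers, not something visible in the diff.

#include <stdio.h>

/* Assumed from this kernel's sched.h; illustrative only. */
#define SCHED_LOAD_SCALE 128UL

/*
 * cpu_power formula the hunk applies to the multi-core group (and, without
 * CONFIG_SCHED_MC, to the physical group): full scale for the first CPU,
 * plus 10% of the scale for each extra CPU sharing the domain.
 */
static unsigned long group_cpu_power(unsigned int nr_cpus)
{
	return SCHED_LOAD_SCALE + (nr_cpus - 1) * SCHED_LOAD_SCALE / 10;
}

int main(void)
{
	/* 2-thread core: 128 + 12 = 140; 4-core package: 128 + 38 = 166,
	 * both safely below the 2 * SCHED_LOAD_SCALE ceiling the new
	 * comment in the hunk mentions. */
	printf("1 cpu : %lu\n", group_cpu_power(1));
	printf("2 cpus: %lu\n", group_cpu_power(2));
	printf("4 cpus: %lu\n", group_cpu_power(4));
	return 0;
}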
diff --git a/kernel/signal.c b/kernel/signal.c
index b373fc2420..e5f8aea78f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/posix-timers.h>
26#include <linux/signal.h> 25#include <linux/signal.h>
27#include <linux/audit.h> 26#include <linux/audit.h>
28#include <linux/capability.h> 27#include <linux/capability.h>
@@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep;
147#define sig_kernel_stop(sig) \ 146#define sig_kernel_stop(sig) \
148 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) 147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
149 148
149#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
150
150#define sig_user_defined(t, signr) \ 151#define sig_user_defined(t, signr) \
151 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ 152 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
152 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) 153 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q)
292 kmem_cache_free(sigqueue_cachep, q); 293 kmem_cache_free(sigqueue_cachep, q);
293} 294}
294 295
295static void flush_sigqueue(struct sigpending *queue) 296void flush_sigqueue(struct sigpending *queue)
296{ 297{
297 struct sigqueue *q; 298 struct sigqueue *q;
298 299
@@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue)
307/* 308/*
308 * Flush all pending signals for a task. 309 * Flush all pending signals for a task.
309 */ 310 */
310 311void flush_signals(struct task_struct *t)
311void
312flush_signals(struct task_struct *t)
313{ 312{
314 unsigned long flags; 313 unsigned long flags;
315 314
@@ -321,109 +320,6 @@ flush_signals(struct task_struct *t)
321} 320}
322 321
323/* 322/*
324 * This function expects the tasklist_lock write-locked.
325 */
326void __exit_sighand(struct task_struct *tsk)
327{
328 struct sighand_struct * sighand = tsk->sighand;
329
330 /* Ok, we're done with the signal handlers */
331 tsk->sighand = NULL;
332 if (atomic_dec_and_test(&sighand->count))
333 sighand_free(sighand);
334}
335
336void exit_sighand(struct task_struct *tsk)
337{
338 write_lock_irq(&tasklist_lock);
339 rcu_read_lock();
340 if (tsk->sighand != NULL) {
341 struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
342 spin_lock(&sighand->siglock);
343 __exit_sighand(tsk);
344 spin_unlock(&sighand->siglock);
345 }
346 rcu_read_unlock();
347 write_unlock_irq(&tasklist_lock);
348}
349
350/*
351 * This function expects the tasklist_lock write-locked.
352 */
353void __exit_signal(struct task_struct *tsk)
354{
355 struct signal_struct * sig = tsk->signal;
356 struct sighand_struct * sighand;
357
358 if (!sig)
359 BUG();
360 if (!atomic_read(&sig->count))
361 BUG();
362 rcu_read_lock();
363 sighand = rcu_dereference(tsk->sighand);
364 spin_lock(&sighand->siglock);
365 posix_cpu_timers_exit(tsk);
366 if (atomic_dec_and_test(&sig->count)) {
367 posix_cpu_timers_exit_group(tsk);
368 tsk->signal = NULL;
369 __exit_sighand(tsk);
370 spin_unlock(&sighand->siglock);
371 flush_sigqueue(&sig->shared_pending);
372 } else {
373 /*
374 * If there is any task waiting for the group exit
375 * then notify it:
376 */
377 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
378 wake_up_process(sig->group_exit_task);
379 sig->group_exit_task = NULL;
380 }
381 if (tsk == sig->curr_target)
382 sig->curr_target = next_thread(tsk);
383 tsk->signal = NULL;
384 /*
385 * Accumulate here the counters for all threads but the
386 * group leader as they die, so they can be added into
387 * the process-wide totals when those are taken.
388 * The group leader stays around as a zombie as long
389 * as there are other threads. When it gets reaped,
390 * the exit.c code will add its counts into these totals.
391 * We won't ever get here for the group leader, since it
392 * will have been the last reference on the signal_struct.
393 */
394 sig->utime = cputime_add(sig->utime, tsk->utime);
395 sig->stime = cputime_add(sig->stime, tsk->stime);
396 sig->min_flt += tsk->min_flt;
397 sig->maj_flt += tsk->maj_flt;
398 sig->nvcsw += tsk->nvcsw;
399 sig->nivcsw += tsk->nivcsw;
400 sig->sched_time += tsk->sched_time;
401 __exit_sighand(tsk);
402 spin_unlock(&sighand->siglock);
403 sig = NULL; /* Marker for below. */
404 }
405 rcu_read_unlock();
406 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
407 flush_sigqueue(&tsk->pending);
408 if (sig) {
409 /*
410 * We are cleaning up the signal_struct here.
411 */
412 exit_thread_group_keys(sig);
413 kmem_cache_free(signal_cachep, sig);
414 }
415}
416
417void exit_signal(struct task_struct *tsk)
418{
419 atomic_dec(&tsk->signal->live);
420
421 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock);
424}
425
426/*
427 * Flush all handlers for a task. 323 * Flush all handlers for a task.
428 */ 324 */
429 325
@@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
695} 591}
696 592
697/* forward decl */ 593/* forward decl */
698static void do_notify_parent_cldstop(struct task_struct *tsk, 594static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
699 int to_self,
700 int why);
701 595
702/* 596/*
703 * Handle magic process-wide effects of stop/continue signals. 597 * Handle magic process-wide effects of stop/continue signals.
@@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
747 p->signal->group_stop_count = 0; 641 p->signal->group_stop_count = 0;
748 p->signal->flags = SIGNAL_STOP_CONTINUED; 642 p->signal->flags = SIGNAL_STOP_CONTINUED;
749 spin_unlock(&p->sighand->siglock); 643 spin_unlock(&p->sighand->siglock);
750 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); 644 do_notify_parent_cldstop(p, CLD_STOPPED);
751 spin_lock(&p->sighand->siglock); 645 spin_lock(&p->sighand->siglock);
752 } 646 }
753 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 647 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
788 p->signal->flags = SIGNAL_STOP_CONTINUED; 682 p->signal->flags = SIGNAL_STOP_CONTINUED;
789 p->signal->group_exit_code = 0; 683 p->signal->group_exit_code = 0;
790 spin_unlock(&p->sighand->siglock); 684 spin_unlock(&p->sighand->siglock);
791 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); 685 do_notify_parent_cldstop(p, CLD_CONTINUED);
792 spin_lock(&p->sighand->siglock); 686 spin_lock(&p->sighand->siglock);
793 } else { 687 } else {
794 /* 688 /*
@@ -875,8 +769,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
875{ 769{
876 int ret = 0; 770 int ret = 0;
877 771
878 if (!irqs_disabled()) 772 BUG_ON(!irqs_disabled());
879 BUG();
880 assert_spin_locked(&t->sighand->siglock); 773 assert_spin_locked(&t->sighand->siglock);
881 774
882 /* Short-circuit ignored signals. */ 775 /* Short-circuit ignored signals. */
@@ -975,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p)
975 if (t == NULL) 868 if (t == NULL)
976 /* restart balancing at this thread */ 869 /* restart balancing at this thread */
977 t = p->signal->curr_target = p; 870 t = p->signal->curr_target = p;
978 BUG_ON(t->tgid != p->tgid);
979 871
980 while (!wants_signal(sig, t)) { 872 while (!wants_signal(sig, t)) {
981 t = next_thread(t); 873 t = next_thread(t);
@@ -1120,27 +1012,37 @@ void zap_other_threads(struct task_struct *p)
1120/* 1012/*
1121 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1013 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
1122 */ 1014 */
1015struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1016{
1017 struct sighand_struct *sighand;
1018
1019 for (;;) {
1020 sighand = rcu_dereference(tsk->sighand);
1021 if (unlikely(sighand == NULL))
1022 break;
1023
1024 spin_lock_irqsave(&sighand->siglock, *flags);
1025 if (likely(sighand == tsk->sighand))
1026 break;
1027 spin_unlock_irqrestore(&sighand->siglock, *flags);
1028 }
1029
1030 return sighand;
1031}
1032
1123int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1033int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1124{ 1034{
1125 unsigned long flags; 1035 unsigned long flags;
1126 struct sighand_struct *sp;
1127 int ret; 1036 int ret;
1128 1037
1129retry:
1130 ret = check_kill_permission(sig, info, p); 1038 ret = check_kill_permission(sig, info, p);
1131 if (!ret && sig && (sp = rcu_dereference(p->sighand))) { 1039
1132 spin_lock_irqsave(&sp->siglock, flags); 1040 if (!ret && sig) {
1133 if (p->sighand != sp) { 1041 ret = -ESRCH;
1134 spin_unlock_irqrestore(&sp->siglock, flags); 1042 if (lock_task_sighand(p, &flags)) {
1135 goto retry; 1043 ret = __group_send_sig_info(sig, info, p);
1136 } 1044 unlock_task_sighand(p, &flags);
1137 if ((atomic_read(&sp->count) == 0) ||
1138 (atomic_read(&p->usage) == 0)) {
1139 spin_unlock_irqrestore(&sp->siglock, flags);
1140 return -ESRCH;
1141 } 1045 }
1142 ret = __group_send_sig_info(sig, info, p);
1143 spin_unlock_irqrestore(&sp->siglock, flags);
1144 } 1046 }
1145 1047
1146 return ret; 1048 return ret;
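
Editor's note: the lock_task_sighand() helper added above packages a pattern this patch then uses in several places: dereference tsk->sighand, take its lock, and re-check that the task still points at the same sighand_struct, retrying if an exec swapped it underneath us (the caller holds rcu_read_lock() or the tasklist lock so the old structure cannot vanish while it is being examined). The toy below models only that stabilise-by-recheck loop in userspace, with C11 atomics and a pthread mutex standing in for the real primitives; it never frees the objects, so it needs no RCU, and none of its names are kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct toy_sighand {
	pthread_mutex_t lock;
};

struct toy_task {
	_Atomic(struct toy_sighand *) sighand;
};

/*
 * Lock whatever sighand the task points at, then confirm the pointer did
 * not change while the lock was being acquired; otherwise drop it and retry.
 */
static struct toy_sighand *toy_lock_task_sighand(struct toy_task *tsk)
{
	struct toy_sighand *sh;

	for (;;) {
		sh = atomic_load(&tsk->sighand);
		if (!sh)
			return NULL;               /* task already torn down */
		pthread_mutex_lock(&sh->lock);
		if (sh == atomic_load(&tsk->sighand))
			return sh;                 /* still current: stable */
		pthread_mutex_unlock(&sh->lock);   /* raced with a swap: retry */
	}
}

int main(void)
{
	struct toy_sighand a = { PTHREAD_MUTEX_INITIALIZER };
	struct toy_task task = { &a };
	struct toy_sighand *sh = toy_lock_task_sighand(&task);

	printf("locked %s sighand\n", sh == &a ? "the current" : "a stale");
	pthread_mutex_unlock(&sh->lock);
	return 0;
}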
@@ -1189,7 +1091,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1189 struct task_struct *p; 1091 struct task_struct *p;
1190 1092
1191 rcu_read_lock(); 1093 rcu_read_lock();
1192 if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { 1094 if (unlikely(sig_needs_tasklist(sig))) {
1193 read_lock(&tasklist_lock); 1095 read_lock(&tasklist_lock);
1194 acquired_tasklist_lock = 1; 1096 acquired_tasklist_lock = 1;
1195 } 1097 }
@@ -1405,12 +1307,10 @@ void sigqueue_free(struct sigqueue *q)
1405 __sigqueue_free(q); 1307 __sigqueue_free(q);
1406} 1308}
1407 1309
1408int 1310int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1409send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1410{ 1311{
1411 unsigned long flags; 1312 unsigned long flags;
1412 int ret = 0; 1313 int ret = 0;
1413 struct sighand_struct *sh;
1414 1314
1415 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1315 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1416 1316
@@ -1424,48 +1324,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1424 */ 1324 */
1425 rcu_read_lock(); 1325 rcu_read_lock();
1426 1326
1427 if (unlikely(p->flags & PF_EXITING)) { 1327 if (!likely(lock_task_sighand(p, &flags))) {
1428 ret = -1; 1328 ret = -1;
1429 goto out_err; 1329 goto out_err;
1430 } 1330 }
1431 1331
1432retry:
1433 sh = rcu_dereference(p->sighand);
1434
1435 spin_lock_irqsave(&sh->siglock, flags);
1436 if (p->sighand != sh) {
1437 /* We raced with exec() in a multithreaded process... */
1438 spin_unlock_irqrestore(&sh->siglock, flags);
1439 goto retry;
1440 }
1441
1442 /*
1443 * We do the check here again to handle the following scenario:
1444 *
1445 * CPU 0 CPU 1
1446 * send_sigqueue
1447 * check PF_EXITING
1448 * interrupt exit code running
1449 * __exit_signal
1450 * lock sighand->siglock
1451 * unlock sighand->siglock
1452 * lock sh->siglock
1453 * add(tsk->pending) flush_sigqueue(tsk->pending)
1454 *
1455 */
1456
1457 if (unlikely(p->flags & PF_EXITING)) {
1458 ret = -1;
1459 goto out;
1460 }
1461
1462 if (unlikely(!list_empty(&q->list))) { 1332 if (unlikely(!list_empty(&q->list))) {
1463 /* 1333 /*
1464 * If an SI_TIMER entry is already queued, just increment 1334
1465 * the overrun count. 1335 * the overrun count.
1466 */ 1336 */
1467 if (q->info.si_code != SI_TIMER) 1337 BUG_ON(q->info.si_code != SI_TIMER);
1468 BUG();
1469 q->info.si_overrun++; 1338 q->info.si_overrun++;
1470 goto out; 1339 goto out;
1471 } 1340 }
@@ -1481,7 +1350,7 @@ retry:
1481 signal_wake_up(p, sig == SIGKILL); 1350 signal_wake_up(p, sig == SIGKILL);
1482 1351
1483out: 1352out:
1484 spin_unlock_irqrestore(&sh->siglock, flags); 1353 unlock_task_sighand(p, &flags);
1485out_err: 1354out_err:
1486 rcu_read_unlock(); 1355 rcu_read_unlock();
1487 1356
@@ -1513,8 +1382,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1513 * the overrun count. Other uses should not try to 1382 * the overrun count. Other uses should not try to
1514 * send the signal multiple times. 1383 * send the signal multiple times.
1515 */ 1384 */
1516 if (q->info.si_code != SI_TIMER) 1385 BUG_ON(q->info.si_code != SI_TIMER);
1517 BUG();
1518 q->info.si_overrun++; 1386 q->info.si_overrun++;
1519 goto out; 1387 goto out;
1520 } 1388 }
@@ -1613,14 +1481,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1613 spin_unlock_irqrestore(&psig->siglock, flags); 1481 spin_unlock_irqrestore(&psig->siglock, flags);
1614} 1482}
1615 1483
1616static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) 1484static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1617{ 1485{
1618 struct siginfo info; 1486 struct siginfo info;
1619 unsigned long flags; 1487 unsigned long flags;
1620 struct task_struct *parent; 1488 struct task_struct *parent;
1621 struct sighand_struct *sighand; 1489 struct sighand_struct *sighand;
1622 1490
1623 if (to_self) 1491 if (tsk->ptrace & PT_PTRACED)
1624 parent = tsk->parent; 1492 parent = tsk->parent;
1625 else { 1493 else {
1626 tsk = tsk->group_leader; 1494 tsk = tsk->group_leader;
@@ -1689,13 +1557,14 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1689 /* Let the debugger run. */ 1557 /* Let the debugger run. */
1690 set_current_state(TASK_TRACED); 1558 set_current_state(TASK_TRACED);
1691 spin_unlock_irq(&current->sighand->siglock); 1559 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze();
1692 read_lock(&tasklist_lock); 1561 read_lock(&tasklist_lock);
1693 if (likely(current->ptrace & PT_PTRACED) && 1562 if (likely(current->ptrace & PT_PTRACED) &&
1694 likely(current->parent != current->real_parent || 1563 likely(current->parent != current->real_parent ||
1695 !(current->ptrace & PT_ATTACHED)) && 1564 !(current->ptrace & PT_ATTACHED)) &&
1696 (likely(current->parent->signal != current->signal) || 1565 (likely(current->parent->signal != current->signal) ||
1697 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1698 do_notify_parent_cldstop(current, 1, CLD_TRAPPED); 1567 do_notify_parent_cldstop(current, CLD_TRAPPED);
1699 read_unlock(&tasklist_lock); 1568 read_unlock(&tasklist_lock);
1700 schedule(); 1569 schedule();
1701 } else { 1570 } else {
@@ -1744,25 +1613,17 @@ void ptrace_notify(int exit_code)
1744static void 1613static void
1745finish_stop(int stop_count) 1614finish_stop(int stop_count)
1746{ 1615{
1747 int to_self;
1748
1749 /* 1616 /*
1750 * If there are no other threads in the group, or if there is 1617 * If there are no other threads in the group, or if there is
1751 * a group stop in progress and we are the last to stop, 1618 * a group stop in progress and we are the last to stop,
1752 * report to the parent. When ptraced, every thread reports itself. 1619 * report to the parent. When ptraced, every thread reports itself.
1753 */ 1620 */
1754 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) 1621 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
1755 to_self = 1; 1622 read_lock(&tasklist_lock);
1756 else if (stop_count == 0) 1623 do_notify_parent_cldstop(current, CLD_STOPPED);
1757 to_self = 0; 1624 read_unlock(&tasklist_lock);
1758 else 1625 }
1759 goto out;
1760
1761 read_lock(&tasklist_lock);
1762 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1763 read_unlock(&tasklist_lock);
1764 1626
1765out:
1766 schedule(); 1627 schedule();
1767 /* 1628 /*
1768 * Now we don't run again until continued. 1629 * Now we don't run again until continued.
@@ -1776,12 +1637,10 @@ out:
1776 * Returns nonzero if we've actually stopped and released the siglock. 1637 * Returns nonzero if we've actually stopped and released the siglock.
1777 * Returns zero if we didn't stop and still hold the siglock. 1638 * Returns zero if we didn't stop and still hold the siglock.
1778 */ 1639 */
1779static int 1640static int do_signal_stop(int signr)
1780do_signal_stop(int signr)
1781{ 1641{
1782 struct signal_struct *sig = current->signal; 1642 struct signal_struct *sig = current->signal;
1783 struct sighand_struct *sighand = current->sighand; 1643 int stop_count;
1784 int stop_count = -1;
1785 1644
1786 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) 1645 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
1787 return 0; 1646 return 0;
@@ -1791,86 +1650,37 @@ do_signal_stop(int signr)
1791 * There is a group stop in progress. We don't need to 1650 * There is a group stop in progress. We don't need to
1792 * start another one. 1651 * start another one.
1793 */ 1652 */
1794 signr = sig->group_exit_code;
1795 stop_count = --sig->group_stop_count; 1653 stop_count = --sig->group_stop_count;
1796 current->exit_code = signr; 1654 } else {
1797 set_current_state(TASK_STOPPED);
1798 if (stop_count == 0)
1799 sig->flags = SIGNAL_STOP_STOPPED;
1800 spin_unlock_irq(&sighand->siglock);
1801 }
1802 else if (thread_group_empty(current)) {
1803 /*
1804 * Lock must be held through transition to stopped state.
1805 */
1806 current->exit_code = current->signal->group_exit_code = signr;
1807 set_current_state(TASK_STOPPED);
1808 sig->flags = SIGNAL_STOP_STOPPED;
1809 spin_unlock_irq(&sighand->siglock);
1810 }
1811 else {
1812 /* 1655 /*
1813 * There is no group stop already in progress. 1656 * There is no group stop already in progress.
1814 * We must initiate one now, but that requires 1657 * We must initiate one now.
1815 * dropping siglock to get both the tasklist lock
1816 * and siglock again in the proper order. Note that
1817 * this allows an intervening SIGCONT to be posted.
1818 * We need to check for that and bail out if necessary.
1819 */ 1658 */
1820 struct task_struct *t; 1659 struct task_struct *t;
1821 1660
1822 spin_unlock_irq(&sighand->siglock); 1661 sig->group_exit_code = signr;
1823
1824 /* signals can be posted during this window */
1825
1826 read_lock(&tasklist_lock);
1827 spin_lock_irq(&sighand->siglock);
1828 1662
1829 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { 1663 stop_count = 0;
1664 for (t = next_thread(current); t != current; t = next_thread(t))
1830 /* 1665 /*
1831 * Another stop or continue happened while we 1666 * Setting state to TASK_STOPPED for a group
1832 * didn't have the lock. We can just swallow this 1667 * stop is always done with the siglock held,
1833 * signal now. If we raced with a SIGCONT, that 1668 * so this check has no races.
1834 * should have just cleared it now. If we raced
1835 * with another processor delivering a stop signal,
1836 * then the SIGCONT that wakes us up should clear it.
1837 */ 1669 */
1838 read_unlock(&tasklist_lock); 1670 if (!t->exit_state &&
1839 return 0; 1671 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1840 } 1672 stop_count++;
1841 1673 signal_wake_up(t, 0);
1842 if (sig->group_stop_count == 0) { 1674 }
1843 sig->group_exit_code = signr; 1675 sig->group_stop_count = stop_count;
1844 stop_count = 0;
1845 for (t = next_thread(current); t != current;
1846 t = next_thread(t))
1847 /*
1848 * Setting state to TASK_STOPPED for a group
1849 * stop is always done with the siglock held,
1850 * so this check has no races.
1851 */
1852 if (!t->exit_state &&
1853 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1854 stop_count++;
1855 signal_wake_up(t, 0);
1856 }
1857 sig->group_stop_count = stop_count;
1858 }
1859 else {
1860 /* A race with another thread while unlocked. */
1861 signr = sig->group_exit_code;
1862 stop_count = --sig->group_stop_count;
1863 }
1864
1865 current->exit_code = signr;
1866 set_current_state(TASK_STOPPED);
1867 if (stop_count == 0)
1868 sig->flags = SIGNAL_STOP_STOPPED;
1869
1870 spin_unlock_irq(&sighand->siglock);
1871 read_unlock(&tasklist_lock);
1872 } 1676 }
1873 1677
1678 if (stop_count == 0)
1679 sig->flags = SIGNAL_STOP_STOPPED;
1680 current->exit_code = sig->group_exit_code;
1681 __set_current_state(TASK_STOPPED);
1682
1683 spin_unlock_irq(&current->sighand->siglock);
1874 finish_stop(stop_count); 1684 finish_stop(stop_count);
1875 return 1; 1685 return 1;
1876} 1686}
@@ -1922,6 +1732,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1922 sigset_t *mask = &current->blocked; 1732 sigset_t *mask = &current->blocked;
1923 int signr = 0; 1733 int signr = 0;
1924 1734
1735 try_to_freeze();
1736
1925relock: 1737relock:
1926 spin_lock_irq(&current->sighand->siglock); 1738 spin_lock_irq(&current->sighand->siglock);
1927 for (;;) { 1739 for (;;) {
@@ -1942,9 +1754,9 @@ relock:
1942 /* Let the debugger run. */ 1754 /* Let the debugger run. */
1943 ptrace_stop(signr, signr, info); 1755 ptrace_stop(signr, signr, info);
1944 1756
1945 /* We're back. Did the debugger cancel the sig or group_exit? */ 1757 /* We're back. Did the debugger cancel the sig? */
1946 signr = current->exit_code; 1758 signr = current->exit_code;
1947 if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) 1759 if (signr == 0)
1948 continue; 1760 continue;
1949 1761
1950 current->exit_code = 0; 1762 current->exit_code = 0;
@@ -1988,7 +1800,7 @@ relock:
1988 continue; 1800 continue;
1989 1801
1990 /* Init gets no signals it doesn't want. */ 1802 /* Init gets no signals it doesn't want. */
1991 if (current->pid == 1) 1803 if (current == child_reaper)
1992 continue; 1804 continue;
1993 1805
1994 if (sig_kernel_stop(signr)) { 1806 if (sig_kernel_stop(signr)) {
@@ -2099,10 +1911,11 @@ long do_no_restart_syscall(struct restart_block *param)
2099int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 1911int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2100{ 1912{
2101 int error; 1913 int error;
2102 sigset_t old_block;
2103 1914
2104 spin_lock_irq(&current->sighand->siglock); 1915 spin_lock_irq(&current->sighand->siglock);
2105 old_block = current->blocked; 1916 if (oldset)
1917 *oldset = current->blocked;
1918
2106 error = 0; 1919 error = 0;
2107 switch (how) { 1920 switch (how) {
2108 case SIG_BLOCK: 1921 case SIG_BLOCK:
@@ -2119,8 +1932,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2119 } 1932 }
2120 recalc_sigpending(); 1933 recalc_sigpending();
2121 spin_unlock_irq(&current->sighand->siglock); 1934 spin_unlock_irq(&current->sighand->siglock);
2122 if (oldset) 1935
2123 *oldset = old_block;
2124 return error; 1936 return error;
2125} 1937}
2126 1938
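
The sigprocmask() rework above snapshots the caller's previous mask into *oldset under the siglock before applying the change, instead of copying a local old_block out after unlocking. The user-visible contract is unchanged; a minimal block/restore sketch against the standard POSIX interface (nothing below is specific to this patch):

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGINT);

	/* Block SIGINT around a critical region; the kernel hands back
	 * the previous mask through the third argument. */
	if (sigprocmask(SIG_BLOCK, &block, &old) == -1) {
		perror("sigprocmask");
		return 1;
	}

	/* ... critical region: SIGINT stays pending rather than firing ... */

	/* Restore exactly what was in force before. */
	if (sigprocmask(SIG_SETMASK, &old, NULL) == -1) {
		perror("sigprocmask");
		return 1;
	}
	return 0;
}
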
@@ -2307,7 +2119,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2307 2119
2308 timeout = schedule_timeout_interruptible(timeout); 2120 timeout = schedule_timeout_interruptible(timeout);
2309 2121
2310 try_to_freeze();
2311 spin_lock_irq(&current->sighand->siglock); 2122 spin_lock_irq(&current->sighand->siglock);
2312 sig = dequeue_signal(current, &these, &info); 2123 sig = dequeue_signal(current, &these, &info);
2313 current->blocked = current->real_blocked; 2124 current->blocked = current->real_blocked;
@@ -2429,8 +2240,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2429 return kill_proc_info(sig, &info, pid); 2240 return kill_proc_info(sig, &info, pid);
2430} 2241}
2431 2242
2432int 2243int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2433do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2434{ 2244{
2435 struct k_sigaction *k; 2245 struct k_sigaction *k;
2436 sigset_t mask; 2246 sigset_t mask;
@@ -2454,6 +2264,9 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2454 *oact = *k; 2264 *oact = *k;
2455 2265
2456 if (act) { 2266 if (act) {
2267 sigdelsetmask(&act->sa.sa_mask,
2268 sigmask(SIGKILL) | sigmask(SIGSTOP));
2269 *k = *act;
2457 /* 2270 /*
2458 * POSIX 3.3.1.3: 2271 * POSIX 3.3.1.3:
2459 * "Setting a signal action to SIG_IGN for a signal that is 2272 * "Setting a signal action to SIG_IGN for a signal that is
@@ -2466,21 +2279,8 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2466 * be discarded, whether or not it is blocked" 2279 * be discarded, whether or not it is blocked"
2467 */ 2280 */
2468 if (act->sa.sa_handler == SIG_IGN || 2281 if (act->sa.sa_handler == SIG_IGN ||
2469 (act->sa.sa_handler == SIG_DFL && 2282 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2470 sig_kernel_ignore(sig))) {
2471 /*
2472 * This is a fairly rare case, so we only take the
2473 * tasklist_lock once we're sure we'll need it.
2474 * Now we must do this little unlock and relock
2475 * dance to maintain the lock hierarchy.
2476 */
2477 struct task_struct *t = current; 2283 struct task_struct *t = current;
2478 spin_unlock_irq(&t->sighand->siglock);
2479 read_lock(&tasklist_lock);
2480 spin_lock_irq(&t->sighand->siglock);
2481 *k = *act;
2482 sigdelsetmask(&k->sa.sa_mask,
2483 sigmask(SIGKILL) | sigmask(SIGSTOP));
2484 sigemptyset(&mask); 2284 sigemptyset(&mask);
2485 sigaddset(&mask, sig); 2285 sigaddset(&mask, sig);
2486 rm_from_queue_full(&mask, &t->signal->shared_pending); 2286 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2489,14 +2289,7 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
2489 recalc_sigpending_tsk(t); 2289 recalc_sigpending_tsk(t);
2490 t = next_thread(t); 2290 t = next_thread(t);
2491 } while (t != current); 2291 } while (t != current);
2492 spin_unlock_irq(&current->sighand->siglock);
2493 read_unlock(&tasklist_lock);
2494 return 0;
2495 } 2292 }
2496
2497 *k = *act;
2498 sigdelsetmask(&k->sa.sa_mask,
2499 sigmask(SIGKILL) | sigmask(SIGSTOP));
2500 } 2293 }
2501 2294
2502 spin_unlock_irq(&current->sighand->siglock); 2295 spin_unlock_irq(&current->sighand->siglock);
@@ -2702,6 +2495,7 @@ sys_signal(int sig, __sighandler_t handler)
2702 2495
2703 new_sa.sa.sa_handler = handler; 2496 new_sa.sa.sa_handler = handler;
2704 new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; 2497 new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
2498 sigemptyset(&new_sa.sa.sa_mask);
2705 2499
2706 ret = do_sigaction(sig, &new_sa, &old_sa); 2500 ret = do_sigaction(sig, &new_sa, &old_sa);
2707 2501
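
The last hunk adds sigemptyset(&new_sa.sa.sa_mask) because sys_signal() builds its k_sigaction on the stack and previously installed whatever garbage happened to be in sa_mask. The same rule applies to user code that calls sigaction() directly; a minimal sketch (the handler name and flag variable are illustrative):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t last_sig;	/* illustrative flag */

static void handler(int sig)
{
	last_sig = sig;
}

int main(void)
{
	struct sigaction sa;

	sa.sa_handler = handler;
	sa.sa_flags = SA_RESTART;
	sigemptyset(&sa.sa_mask);	/* never install an uninitialized mask */

	if (sigaction(SIGUSR1, &sa, NULL) == -1) {
		perror("sigaction");
		return 1;
	}

	kill(getpid(), SIGUSR1);	/* delivered on return to user space */
	printf("handled signal %d\n", (int)last_sig);
	return 0;
}
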
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdde..336f92d64e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h>
19 20
20#include <asm/irq.h> 21#include <asm/irq.h>
21/* 22/*
@@ -445,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
445} 446}
446#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
447 448
448static int __devinit cpu_callback(struct notifier_block *nfb, 449static int cpu_callback(struct notifier_block *nfb,
449 unsigned long action, 450 unsigned long action,
450 void *hcpu) 451 void *hcpu)
451{ 452{
@@ -483,7 +484,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
483 return NOTIFY_OK; 484 return NOTIFY_OK;
484} 485}
485 486
486static struct notifier_block __devinitdata cpu_nfb = { 487static struct notifier_block cpu_nfb = {
487 .notifier_call = cpu_callback 488 .notifier_call = cpu_callback
488}; 489};
489 490
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
495 register_cpu_notifier(&cpu_nfb); 496 register_cpu_notifier(&cpu_nfb);
496 return 0; 497 return 0;
497} 498}
499
500#ifdef CONFIG_SMP
501/*
502 * Call a function on all processors
503 */
504int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
505{
506 int ret = 0;
507
508 preempt_disable();
509 ret = smp_call_function(func, info, retry, wait);
510 local_irq_disable();
511 func(info);
512 local_irq_enable();
513 preempt_enable();
514 return ret;
515}
516EXPORT_SYMBOL(on_each_cpu);
517#endif
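
The new on_each_cpu() helper runs func on the local CPU with interrupts off and, via smp_call_function(), on every other online CPU, with preemption disabled for the duration. A sketch of a caller, assuming the signature shown in the hunk above; the per-CPU counter, the module wrapper and their names are illustrative, not part of this patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(unsigned long, local_events);
static atomic_t total_events = ATOMIC_INIT(0);

/* Runs once on every CPU; must not sleep (interrupts are disabled locally). */
static void collect_events(void *unused)
{
	atomic_add(__get_cpu_var(local_events), &total_events);
}

static int __init collect_init(void)
{
	/* retry = 0, wait = 1: return only after every CPU has run the hook */
	on_each_cpu(collect_events, NULL, 0, 1);
	printk(KERN_INFO "events so far: %d\n", atomic_read(&total_events));
	return 0;
}
module_init(collect_init);
MODULE_LICENSE("GPL");
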
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d..14c7faf029 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
1/* 1/*
2 * Detect Soft Lockups 2 * Detect Soft Lockups
3 * 3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat 4 * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
5 * 5 *
6 * this code detects soft lockups: incidents in where on a CPU 6 * this code detects soft lockups: incidents in where on a CPU
7 * the kernel does not reschedule for 10 seconds or more. 7 * the kernel does not reschedule for 10 seconds or more.
8 */ 8 */
9
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/cpu.h> 10#include <linux/cpu.h>
12#include <linux/init.h> 11#include <linux/init.h>
@@ -17,13 +16,14 @@
17 16
18static DEFINE_SPINLOCK(print_lock); 17static DEFINE_SPINLOCK(print_lock);
19 18
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0; 19static DEFINE_PER_CPU(unsigned long, touch_timestamp);
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; 20static DEFINE_PER_CPU(unsigned long, print_timestamp);
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 21static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23 22
24static int did_panic = 0; 23static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event, 24
26 void *ptr) 25static int
26softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
27{ 27{
28 did_panic = 1; 28 did_panic = 1;
29 29
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies; 39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
44 * This callback runs from the timer interrupt, and checks 44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not: 45 * whether the watchdog thread has hung or not:
46 */ 46 */
47void softlockup_tick(struct pt_regs *regs) 47void softlockup_tick(void)
48{ 48{
49 int this_cpu = smp_processor_id(); 49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu); 50 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
51 51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp) 52 /* prevent double reports: */
53 if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
54 did_panic ||
55 !per_cpu(watchdog_task, this_cpu))
53 return; 56 return;
54 57
55 /* Do not cause a second panic when there already was one */ 58 /* do not print during early bootup: */
56 if (did_panic) 59 if (unlikely(system_state != SYSTEM_RUNNING)) {
60 touch_softlockup_watchdog();
57 return; 61 return;
62 }
58 63
59 if (time_after(jiffies, timestamp + 10*HZ)) { 64 /* Wake up the high-prio watchdog task every second: */
60 per_cpu(print_timestamp, this_cpu) = timestamp; 65 if (time_after(jiffies, touch_timestamp + HZ))
66 wake_up_process(per_cpu(watchdog_task, this_cpu));
67
68 /* Warn about unreasonable 10+ seconds delays: */
69 if (time_after(jiffies, touch_timestamp + 10*HZ)) {
70 per_cpu(print_timestamp, this_cpu) = touch_timestamp;
61 71
62 spin_lock(&print_lock); 72 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", 73 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu); 74 this_cpu);
65 show_regs(regs); 75 dump_stack();
66 spin_unlock(&print_lock); 76 spin_unlock(&print_lock);
67 } 77 }
68} 78}
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
77 sched_setscheduler(current, SCHED_FIFO, &param); 87 sched_setscheduler(current, SCHED_FIFO, &param);
78 current->flags |= PF_NOFREEZE; 88 current->flags |= PF_NOFREEZE;
79 89
80 set_current_state(TASK_INTERRUPTIBLE);
81
82 /* 90 /*
83 * Run briefly once per second - if this gets delayed for 91 * Run briefly once per second to reset the softlockup timestamp.
84 * more than 10 seconds then the debug-printout triggers 92 * If this gets delayed for more than 10 seconds then the
85 * in softlockup_tick(): 93 * debug-printout triggers in softlockup_tick().
86 */ 94 */
87 while (!kthread_should_stop()) { 95 while (!kthread_should_stop()) {
88 msleep_interruptible(1000); 96 set_current_state(TASK_INTERRUPTIBLE);
89 touch_softlockup_watchdog(); 97 touch_softlockup_watchdog();
98 schedule();
90 } 99 }
91 __set_current_state(TASK_RUNNING);
92 100
93 return 0; 101 return 0;
94} 102}
@@ -96,7 +104,7 @@ static int watchdog(void * __bind_cpu)
96/* 104/*
97 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
98 */ 106 */
99static int __devinit 107static int
100cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
101{ 109{
102 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -110,11 +118,11 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
110 printk("watchdog for %i failed\n", hotcpu); 118 printk("watchdog for %i failed\n", hotcpu);
111 return NOTIFY_BAD; 119 return NOTIFY_BAD;
112 } 120 }
121 per_cpu(touch_timestamp, hotcpu) = jiffies;
113 per_cpu(watchdog_task, hotcpu) = p; 122 per_cpu(watchdog_task, hotcpu) = p;
114 kthread_bind(p, hotcpu); 123 kthread_bind(p, hotcpu);
115 break; 124 break;
116 case CPU_ONLINE: 125 case CPU_ONLINE:
117
118 wake_up_process(per_cpu(watchdog_task, hotcpu)); 126 wake_up_process(per_cpu(watchdog_task, hotcpu));
119 break; 127 break;
120#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
@@ -132,7 +140,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
132 return NOTIFY_OK; 140 return NOTIFY_OK;
133} 141}
134 142
135static struct notifier_block __devinitdata cpu_nfb = { 143static struct notifier_block cpu_nfb = {
136 .notifier_call = cpu_callback 144 .notifier_call = cpu_callback
137}; 145};
138 146
@@ -144,6 +152,5 @@ __init void spawn_softlockup_task(void)
144 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
145 register_cpu_notifier(&cpu_nfb); 153 register_cpu_notifier(&cpu_nfb);
146 154
147 notifier_chain_register(&panic_notifier_list, &panic_block); 155 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
148} 156}
149
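
After this rewrite the per-CPU touch_timestamp only advances when the watchdog thread runs or when code calls touch_softlockup_watchdog() explicitly, so any kernel path that legitimately monopolizes a CPU for seconds has to touch the watchdog itself. A sketch of such a caller; the polling loop and its names are made up, only touch_softlockup_watchdog() comes from this file:

#include <linux/sched.h>	/* declares touch_softlockup_watchdog() in this tree */
#include <linux/delay.h>

/* Hypothetical busy-wait for slow hardware that cannot sleep. */
static void wait_for_device_ready(volatile int *ready)
{
	while (!*ready) {
		mdelay(10);
		/*
		 * We deliberately do not schedule for a long time, so reset
		 * the per-CPU touch_timestamp to avoid a bogus "BUG: soft
		 * lockup detected on CPU#N!" splat after ten seconds.
		 */
		touch_softlockup_watchdog();
	}
}
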
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd592..d1b810782b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
179#define BUILD_LOCK_OPS(op, locktype) \ 179#define BUILD_LOCK_OPS(op, locktype) \
180void __lockfunc _##op##_lock(locktype##_t *lock) \ 180void __lockfunc _##op##_lock(locktype##_t *lock) \
181{ \ 181{ \
182 preempt_disable(); \
183 for (;;) { \ 182 for (;;) { \
183 preempt_disable(); \
184 if (likely(_raw_##op##_trylock(lock))) \ 184 if (likely(_raw_##op##_trylock(lock))) \
185 break; \ 185 break; \
186 preempt_enable(); \ 186 preempt_enable(); \
187 \
187 if (!(lock)->break_lock) \ 188 if (!(lock)->break_lock) \
188 (lock)->break_lock = 1; \ 189 (lock)->break_lock = 1; \
189 while (!op##_can_lock(lock) && (lock)->break_lock) \ 190 while (!op##_can_lock(lock) && (lock)->break_lock) \
190 cpu_relax(); \ 191 cpu_relax(); \
191 preempt_disable(); \
192 } \ 192 } \
193 (lock)->break_lock = 0; \ 193 (lock)->break_lock = 0; \
194} \ 194} \
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \
199{ \ 199{ \
200 unsigned long flags; \ 200 unsigned long flags; \
201 \ 201 \
202 preempt_disable(); \
203 for (;;) { \ 202 for (;;) { \
203 preempt_disable(); \
204 local_irq_save(flags); \ 204 local_irq_save(flags); \
205 if (likely(_raw_##op##_trylock(lock))) \ 205 if (likely(_raw_##op##_trylock(lock))) \
206 break; \ 206 break; \
207 local_irq_restore(flags); \ 207 local_irq_restore(flags); \
208 \
209 preempt_enable(); \ 208 preempt_enable(); \
209 \
210 if (!(lock)->break_lock) \ 210 if (!(lock)->break_lock) \
211 (lock)->break_lock = 1; \ 211 (lock)->break_lock = 1; \
212 while (!op##_can_lock(lock) && (lock)->break_lock) \ 212 while (!op##_can_lock(lock) && (lock)->break_lock) \
213 cpu_relax(); \ 213 cpu_relax(); \
214 preempt_disable(); \
215 } \ 214 } \
216 (lock)->break_lock = 0; \ 215 (lock)->break_lock = 0; \
217 return flags; \ 216 return flags; \
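
Hand-expanding the revised BUILD_LOCK_OPS() for the plain spinlock case makes the change easier to see: preempt_disable() now sits inside the retry loop, so preemption stays enabled across the whole break_lock busy-wait instead of only part of it. The expansion below is done by hand for illustration; it is not a separate function in the tree:

void __lockfunc _spin_lock(spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(_raw_spin_trylock(lock)))
			break;
		preempt_enable();	/* spin preemptibly from here on */

		if (!(lock)->break_lock)
			(lock)->break_lock = 1;
		while (!spin_can_lock(lock) && (lock)->break_lock)
			cpu_relax();
	}
	(lock)->break_lock = 0;
}
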
diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a546..0b6ec0e793 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
95 * and the like. 95 * and the like.
96 */ 96 */
97 97
98static struct notifier_block *reboot_notifier_list; 98static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
99static DEFINE_RWLOCK(notifier_lock); 99
100/*
101 * Notifier chain core routines. The exported routines below
102 * are layered on top of these, with appropriate locking added.
103 */
104
105static int notifier_chain_register(struct notifier_block **nl,
106 struct notifier_block *n)
107{
108 while ((*nl) != NULL) {
109 if (n->priority > (*nl)->priority)
110 break;
111 nl = &((*nl)->next);
112 }
113 n->next = *nl;
114 rcu_assign_pointer(*nl, n);
115 return 0;
116}
117
118static int notifier_chain_unregister(struct notifier_block **nl,
119 struct notifier_block *n)
120{
121 while ((*nl) != NULL) {
122 if ((*nl) == n) {
123 rcu_assign_pointer(*nl, n->next);
124 return 0;
125 }
126 nl = &((*nl)->next);
127 }
128 return -ENOENT;
129}
130
131static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v)
133{
134 int ret = NOTIFY_DONE;
135 struct notifier_block *nb;
136
137 nb = rcu_dereference(*nl);
138 while (nb) {
139 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break;
142 nb = rcu_dereference(nb->next);
143 }
144 return ret;
145}
146
147/*
148 * Atomic notifier chain routines. Registration and unregistration
149 * use a mutex, and call_chain is synchronized by RCU (no locks).
150 */
100 151
101/** 152/**
102 * notifier_chain_register - Add notifier to a notifier chain 153 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
103 * @list: Pointer to root list pointer 154 * @nh: Pointer to head of the atomic notifier chain
104 * @n: New entry in notifier chain 155 * @n: New entry in notifier chain
105 * 156 *
106 * Adds a notifier to a notifier chain. 157 * Adds a notifier to an atomic notifier chain.
107 * 158 *
108 * Currently always returns zero. 159 * Currently always returns zero.
109 */ 160 */
161
162int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
163 struct notifier_block *n)
164{
165 unsigned long flags;
166 int ret;
167
168 spin_lock_irqsave(&nh->lock, flags);
169 ret = notifier_chain_register(&nh->head, n);
170 spin_unlock_irqrestore(&nh->lock, flags);
171 return ret;
172}
173
174EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
175
176/**
177 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
178 * @nh: Pointer to head of the atomic notifier chain
179 * @n: Entry to remove from notifier chain
180 *
181 * Removes a notifier from an atomic notifier chain.
182 *
183 * Returns zero on success or %-ENOENT on failure.
184 */
185int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
186 struct notifier_block *n)
187{
188 unsigned long flags;
189 int ret;
190
191 spin_lock_irqsave(&nh->lock, flags);
192 ret = notifier_chain_unregister(&nh->head, n);
193 spin_unlock_irqrestore(&nh->lock, flags);
194 synchronize_rcu();
195 return ret;
196}
197
198EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
199
200/**
201 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
202 * @nh: Pointer to head of the atomic notifier chain
203 * @val: Value passed unmodified to notifier function
204 * @v: Pointer passed unmodified to notifier function
205 *
206 * Calls each function in a notifier chain in turn. The functions
207 * run in an atomic context, so they must not block.
208 * This routine uses RCU to synchronize with changes to the chain.
209 *
210 * If the return value of the notifier can be and'ed
211 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
212 * will return immediately, with the return value of
213 * the notifier function which halted execution.
214 * Otherwise the return value is the return value
215 * of the last notifier function called.
216 */
110 217
111int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 218int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
219 unsigned long val, void *v)
112{ 220{
113 write_lock(&notifier_lock); 221 int ret;
114 while(*list) 222
115 { 223 rcu_read_lock();
116 if(n->priority > (*list)->priority) 224 ret = notifier_call_chain(&nh->head, val, v);
117 break; 225 rcu_read_unlock();
118 list= &((*list)->next); 226 return ret;
119 }
120 n->next = *list;
121 *list=n;
122 write_unlock(&notifier_lock);
123 return 0;
124} 227}
125 228
126EXPORT_SYMBOL(notifier_chain_register); 229EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
230
231/*
232 * Blocking notifier chain routines. All access to the chain is
233 * synchronized by an rwsem.
234 */
127 235
128/** 236/**
129 * notifier_chain_unregister - Remove notifier from a notifier chain 237 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
130 * @nl: Pointer to root list pointer 238 * @nh: Pointer to head of the blocking notifier chain
131 * @n: New entry in notifier chain 239 * @n: New entry in notifier chain
132 * 240 *
133 * Removes a notifier from a notifier chain. 241 * Adds a notifier to a blocking notifier chain.
242 * Must be called in process context.
134 * 243 *
135 * Returns zero on success, or %-ENOENT on failure. 244 * Currently always returns zero.
136 */ 245 */
137 246
138int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 247int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
248 struct notifier_block *n)
139{ 249{
140 write_lock(&notifier_lock); 250 int ret;
141 while((*nl)!=NULL) 251
142 { 252 /*
143 if((*nl)==n) 253 * This code gets used during boot-up, when task switching is
144 { 254 * not yet working and interrupts must remain disabled. At
145 *nl=n->next; 255 * such times we must not call down_write().
146 write_unlock(&notifier_lock); 256 */
147 return 0; 257 if (unlikely(system_state == SYSTEM_BOOTING))
148 } 258 return notifier_chain_register(&nh->head, n);
149 nl=&((*nl)->next); 259
150 } 260 down_write(&nh->rwsem);
151 write_unlock(&notifier_lock); 261 ret = notifier_chain_register(&nh->head, n);
152 return -ENOENT; 262 up_write(&nh->rwsem);
263 return ret;
264}
265
266EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
267
268/**
269 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
270 * @nh: Pointer to head of the blocking notifier chain
271 * @n: Entry to remove from notifier chain
272 *
273 * Removes a notifier from a blocking notifier chain.
274 * Must be called from process context.
275 *
276 * Returns zero on success or %-ENOENT on failure.
277 */
278int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
279 struct notifier_block *n)
280{
281 int ret;
282
283 /*
284 * This code gets used during boot-up, when task switching is
285 * not yet working and interrupts must remain disabled. At
286 * such times we must not call down_write().
287 */
288 if (unlikely(system_state == SYSTEM_BOOTING))
289 return notifier_chain_unregister(&nh->head, n);
290
291 down_write(&nh->rwsem);
292 ret = notifier_chain_unregister(&nh->head, n);
293 up_write(&nh->rwsem);
294 return ret;
153} 295}
154 296
155EXPORT_SYMBOL(notifier_chain_unregister); 297EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
156 298
157/** 299/**
158 * notifier_call_chain - Call functions in a notifier chain 300 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
159 * @n: Pointer to root pointer of notifier chain 301 * @nh: Pointer to head of the blocking notifier chain
160 * @val: Value passed unmodified to notifier function 302 * @val: Value passed unmodified to notifier function
161 * @v: Pointer passed unmodified to notifier function 303 * @v: Pointer passed unmodified to notifier function
162 * 304 *
163 * Calls each function in a notifier chain in turn. 305 * Calls each function in a notifier chain in turn. The functions
306 * run in a process context, so they are allowed to block.
164 * 307 *
165 * If the return value of the notifier can be and'd 308 * If the return value of the notifier can be and'ed
166 * with %NOTIFY_STOP_MASK, then notifier_call_chain 309 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
167 * will return immediately, with the return value of 310 * will return immediately, with the return value of
168 * the notifier function which halted execution. 311 * the notifier function which halted execution.
169 * Otherwise, the return value is the return value 312 * Otherwise the return value is the return value
170 * of the last notifier function called. 313 * of the last notifier function called.
171 */ 314 */
172 315
173int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 316int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
317 unsigned long val, void *v)
174{ 318{
175 int ret=NOTIFY_DONE; 319 int ret;
176 struct notifier_block *nb = *n;
177 320
178 while(nb) 321 down_read(&nh->rwsem);
179 { 322 ret = notifier_call_chain(&nh->head, val, v);
180 ret=nb->notifier_call(nb,val,v); 323 up_read(&nh->rwsem);
181 if(ret&NOTIFY_STOP_MASK)
182 {
183 return ret;
184 }
185 nb=nb->next;
186 }
187 return ret; 324 return ret;
188} 325}
189 326
190EXPORT_SYMBOL(notifier_call_chain); 327EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
328
329/*
330 * Raw notifier chain routines. There is no protection;
331 * the caller must provide it. Use at your own risk!
332 */
333
334/**
335 * raw_notifier_chain_register - Add notifier to a raw notifier chain
336 * @nh: Pointer to head of the raw notifier chain
337 * @n: New entry in notifier chain
338 *
339 * Adds a notifier to a raw notifier chain.
340 * All locking must be provided by the caller.
341 *
342 * Currently always returns zero.
343 */
344
345int raw_notifier_chain_register(struct raw_notifier_head *nh,
346 struct notifier_block *n)
347{
348 return notifier_chain_register(&nh->head, n);
349}
350
351EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
352
353/**
354 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
355 * @nh: Pointer to head of the raw notifier chain
356 * @n: Entry to remove from notifier chain
357 *
358 * Removes a notifier from a raw notifier chain.
359 * All locking must be provided by the caller.
360 *
361 * Returns zero on success or %-ENOENT on failure.
362 */
363int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
364 struct notifier_block *n)
365{
366 return notifier_chain_unregister(&nh->head, n);
367}
368
369EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
370
371/**
372 * raw_notifier_call_chain - Call functions in a raw notifier chain
373 * @nh: Pointer to head of the raw notifier chain
374 * @val: Value passed unmodified to notifier function
375 * @v: Pointer passed unmodified to notifier function
376 *
377 * Calls each function in a notifier chain in turn. The functions
378 * run in an undefined context.
379 * All locking must be provided by the caller.
380 *
381 * If the return value of the notifier can be and'ed
382 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain
383 * will return immediately, with the return value of
384 * the notifier function which halted execution.
385 * Otherwise the return value is the return value
386 * of the last notifier function called.
387 */
388
389int raw_notifier_call_chain(struct raw_notifier_head *nh,
390 unsigned long val, void *v)
391{
392 return notifier_call_chain(&nh->head, val, v);
393}
394
395EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
191 396
192/** 397/**
193 * register_reboot_notifier - Register function to be called at reboot time 398 * register_reboot_notifier - Register function to be called at reboot time
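
A sketch of a client of the atomic-chain flavour added above. The register/unregister/call functions are the ones in the hunk; the ATOMIC_NOTIFIER_HEAD() initializer comes from <linux/notifier.h> in the same series, and the event code, callback and module wrapper are illustrative:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

#define MYDEV_EVENT_RESET 1	/* hypothetical event code */

static ATOMIC_NOTIFIER_HEAD(mydev_chain);

static int mydev_event(struct notifier_block *nb, unsigned long event, void *data)
{
	if (event == MYDEV_EVENT_RESET)
		printk(KERN_INFO "mydev: reset notification\n");
	return NOTIFY_OK;	/* NOTIFY_STOP would end the walk early */
}

static struct notifier_block mydev_nb = {
	.notifier_call = mydev_event,
};

static int __init mydev_init(void)
{
	atomic_notifier_chain_register(&mydev_chain, &mydev_nb);
	/* May be called from atomic context; callees must not sleep. */
	atomic_notifier_call_chain(&mydev_chain, MYDEV_EVENT_RESET, NULL);
	atomic_notifier_chain_unregister(&mydev_chain, &mydev_nb);
	return 0;
}
module_init(mydev_init);
MODULE_LICENSE("GPL");
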
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
196 * Registers a function with the list of functions 401 * Registers a function with the list of functions
197 * to be called at reboot time. 402 * to be called at reboot time.
198 * 403 *
199 * Currently always returns zero, as notifier_chain_register 404 * Currently always returns zero, as blocking_notifier_chain_register
200 * always returns zero. 405 * always returns zero.
201 */ 406 */
202 407
203int register_reboot_notifier(struct notifier_block * nb) 408int register_reboot_notifier(struct notifier_block * nb)
204{ 409{
205 return notifier_chain_register(&reboot_notifier_list, nb); 410 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
206} 411}
207 412
208EXPORT_SYMBOL(register_reboot_notifier); 413EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,23 +424,11 @@ EXPORT_SYMBOL(register_reboot_notifier);
219 424
220int unregister_reboot_notifier(struct notifier_block * nb) 425int unregister_reboot_notifier(struct notifier_block * nb)
221{ 426{
222 return notifier_chain_unregister(&reboot_notifier_list, nb); 427 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
223} 428}
224 429
225EXPORT_SYMBOL(unregister_reboot_notifier); 430EXPORT_SYMBOL(unregister_reboot_notifier);
226 431
227#ifndef CONFIG_SECURITY
228int capable(int cap)
229{
230 if (cap_raised(current->cap_effective, cap)) {
231 current->flags |= PF_SUPERPRIV;
232 return 1;
233 }
234 return 0;
235}
236EXPORT_SYMBOL(capable);
237#endif
238
239static int set_one_prio(struct task_struct *p, int niceval, int error) 432static int set_one_prio(struct task_struct *p, int niceval, int error)
240{ 433{
241 int no_nice; 434 int no_nice;
@@ -392,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
392 585
393void kernel_restart_prepare(char *cmd) 586void kernel_restart_prepare(char *cmd)
394{ 587{
395 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
396 system_state = SYSTEM_RESTART; 589 system_state = SYSTEM_RESTART;
397 device_shutdown(); 590 device_shutdown();
398} 591}
@@ -442,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
442 635
443void kernel_shutdown_prepare(enum system_states state) 636void kernel_shutdown_prepare(enum system_states state)
444{ 637{
445 notifier_call_chain(&reboot_notifier_list, 638 blocking_notifier_call_chain(&reboot_notifier_list,
446 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 639 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
447 system_state = state; 640 system_state = state;
448 device_shutdown(); 641 device_shutdown();
@@ -1009,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1009 */ 1202 */
1010 if (tbuf) { 1203 if (tbuf) {
1011 struct tms tmp; 1204 struct tms tmp;
1205 struct task_struct *tsk = current;
1206 struct task_struct *t;
1012 cputime_t utime, stime, cutime, cstime; 1207 cputime_t utime, stime, cutime, cstime;
1013 1208
1014#ifdef CONFIG_SMP 1209 spin_lock_irq(&tsk->sighand->siglock);
1015 if (thread_group_empty(current)) { 1210 utime = tsk->signal->utime;
1016 /* 1211 stime = tsk->signal->stime;
1017 * Single thread case without the use of any locks. 1212 t = tsk;
1018 * 1213 do {
1019 * We may race with release_task if two threads are 1214 utime = cputime_add(utime, t->utime);
1020 * executing. However, release task first adds up the 1215 stime = cputime_add(stime, t->stime);
1021 * counters (__exit_signal) before removing the task 1216 t = next_thread(t);
1022 * from the process tasklist (__unhash_process). 1217 } while (t != tsk);
1023 * __exit_signal also acquires and releases the
1024 * siglock which results in the proper memory ordering
1025 * so that the list modifications are always visible
1026 * after the counters have been updated.
1027 *
1028 * If the counters have been updated by the second thread
1029 * but the thread has not yet been removed from the list
1030 * then the other branch will be executing which will
1031 * block on tasklist_lock until the exit handling of the
1032 * other task is finished.
1033 *
1034 * This also implies that the sighand->siglock cannot
1035 * be held by another processor. So we can also
1036 * skip acquiring that lock.
1037 */
1038 utime = cputime_add(current->signal->utime, current->utime);
1039 stime = cputime_add(current->signal->utime, current->stime);
1040 cutime = current->signal->cutime;
1041 cstime = current->signal->cstime;
1042 } else
1043#endif
1044 {
1045 1218
1046 /* Process with multiple threads */ 1219 cutime = tsk->signal->cutime;
1047 struct task_struct *tsk = current; 1220 cstime = tsk->signal->cstime;
1048 struct task_struct *t; 1221 spin_unlock_irq(&tsk->sighand->siglock);
1049 1222
1050 read_lock(&tasklist_lock);
1051 utime = tsk->signal->utime;
1052 stime = tsk->signal->stime;
1053 t = tsk;
1054 do {
1055 utime = cputime_add(utime, t->utime);
1056 stime = cputime_add(stime, t->stime);
1057 t = next_thread(t);
1058 } while (t != tsk);
1059
1060 /*
1061 * While we have tasklist_lock read-locked, no dying thread
1062 * can be updating current->signal->[us]time. Instead,
1063 * we got their counts included in the live thread loop.
1064 * However, another thread can come in right now and
1065 * do a wait call that updates current->signal->c[us]time.
1066 * To make sure we always see that pair updated atomically,
1067 * we take the siglock around fetching them.
1068 */
1069 spin_lock_irq(&tsk->sighand->siglock);
1070 cutime = tsk->signal->cutime;
1071 cstime = tsk->signal->cstime;
1072 spin_unlock_irq(&tsk->sighand->siglock);
1073 read_unlock(&tasklist_lock);
1074 }
1075 tmp.tms_utime = cputime_to_clock_t(utime); 1223 tmp.tms_utime = cputime_to_clock_t(utime);
1076 tmp.tms_stime = cputime_to_clock_t(stime); 1224 tmp.tms_stime = cputime_to_clock_t(stime);
1077 tmp.tms_cutime = cputime_to_clock_t(cutime); 1225 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1224,24 +1372,35 @@ asmlinkage long sys_getsid(pid_t pid)
1224asmlinkage long sys_setsid(void) 1372asmlinkage long sys_setsid(void)
1225{ 1373{
1226 struct task_struct *group_leader = current->group_leader; 1374 struct task_struct *group_leader = current->group_leader;
1227 struct pid *pid; 1375 pid_t session;
1228 int err = -EPERM; 1376 int err = -EPERM;
1229 1377
1230 down(&tty_sem); 1378 mutex_lock(&tty_mutex);
1231 write_lock_irq(&tasklist_lock); 1379 write_lock_irq(&tasklist_lock);
1232 1380
1233 pid = find_pid(PIDTYPE_PGID, group_leader->pid); 1381 /* Fail if I am already a session leader */
1234 if (pid) 1382 if (group_leader->signal->leader)
1383 goto out;
1384
1385 session = group_leader->pid;
1386 /* Fail if a process group id already exists that equals the
1387 * proposed session id.
1388 *
1389 * Don't check if session id == 1 because kernel threads use this
1390 * session id and so the check will always fail and make it so
1391 * init cannot successfully call setsid.
1392 */
1393 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
1235 goto out; 1394 goto out;
1236 1395
1237 group_leader->signal->leader = 1; 1396 group_leader->signal->leader = 1;
1238 __set_special_pids(group_leader->pid, group_leader->pid); 1397 __set_special_pids(session, session);
1239 group_leader->signal->tty = NULL; 1398 group_leader->signal->tty = NULL;
1240 group_leader->signal->tty_old_pgrp = 0; 1399 group_leader->signal->tty_old_pgrp = 0;
1241 err = process_group(group_leader); 1400 err = process_group(group_leader);
1242out: 1401out:
1243 write_unlock_irq(&tasklist_lock); 1402 write_unlock_irq(&tasklist_lock);
1244 up(&tty_sem); 1403 mutex_unlock(&tty_mutex);
1245 return err; 1404 return err;
1246} 1405}
1247 1406
@@ -1375,7 +1534,7 @@ static void groups_sort(struct group_info *group_info)
1375/* a simple bsearch */ 1534/* a simple bsearch */
1376int groups_search(struct group_info *group_info, gid_t grp) 1535int groups_search(struct group_info *group_info, gid_t grp)
1377{ 1536{
1378 int left, right; 1537 unsigned int left, right;
1379 1538
1380 if (!group_info) 1539 if (!group_info)
1381 return 0; 1540 return 0;
@@ -1383,7 +1542,7 @@ int groups_search(struct group_info *group_info, gid_t grp)
1383 left = 0; 1542 left = 0;
1384 right = group_info->ngroups; 1543 right = group_info->ngroups;
1385 while (left < right) { 1544 while (left < right) {
1386 int mid = (left+right)/2; 1545 unsigned int mid = (left+right)/2;
1387 int cmp = grp - GROUP_AT(group_info, mid); 1546 int cmp = grp - GROUP_AT(group_info, mid);
1388 if (cmp > 0) 1547 if (cmp > 0)
1389 left = mid + 1; 1548 left = mid + 1;
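
The only change in groups_search() is switching the bsearch indices to unsigned int, so that (left+right)/2 and the ngroups bound are computed in the same unsigned domain as the group count. The same search, lifted into a stand-alone user-space program for illustration; explicit comparisons replace the gid subtraction to keep the example self-contained:

#include <stdio.h>

typedef unsigned int gid_ex;	/* stand-in for kernel gid_t */

static int grp_search(const gid_ex *groups, unsigned int ngroups, gid_ex grp)
{
	unsigned int left = 0, right = ngroups;

	while (left < right) {
		unsigned int mid = (left + right) / 2;

		if (grp > groups[mid])
			left = mid + 1;
		else if (grp < groups[mid])
			right = mid;
		else
			return 1;
	}
	return 0;
}

int main(void)
{
	gid_ex groups[] = { 4, 20, 24, 100, 1000 };	/* must be sorted */

	/* prints "1 0": 24 is a member, 25 is not */
	printf("%d %d\n", grp_search(groups, 5, 24), grp_search(groups, 5, 25));
	return 0;
}
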
@@ -1433,7 +1592,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1433 return -EINVAL; 1592 return -EINVAL;
1434 1593
1435 /* no need to grab task_lock here; it cannot change */ 1594 /* no need to grab task_lock here; it cannot change */
1436 get_group_info(current->group_info);
1437 i = current->group_info->ngroups; 1595 i = current->group_info->ngroups;
1438 if (gidsetsize) { 1596 if (gidsetsize) {
1439 if (i > gidsetsize) { 1597 if (i > gidsetsize) {
@@ -1446,7 +1604,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1446 } 1604 }
1447 } 1605 }
1448out: 1606out:
1449 put_group_info(current->group_info);
1450 return i; 1607 return i;
1451} 1608}
1452 1609
@@ -1487,9 +1644,7 @@ int in_group_p(gid_t grp)
1487{ 1644{
1488 int retval = 1; 1645 int retval = 1;
1489 if (grp != current->fsgid) { 1646 if (grp != current->fsgid) {
1490 get_group_info(current->group_info);
1491 retval = groups_search(current->group_info, grp); 1647 retval = groups_search(current->group_info, grp);
1492 put_group_info(current->group_info);
1493 } 1648 }
1494 return retval; 1649 return retval;
1495} 1650}
@@ -1500,9 +1655,7 @@ int in_egroup_p(gid_t grp)
1500{ 1655{
1501 int retval = 1; 1656 int retval = 1;
1502 if (grp != current->egid) { 1657 if (grp != current->egid) {
1503 get_group_info(current->group_info);
1504 retval = groups_search(current->group_info, grp); 1658 retval = groups_search(current->group_info, grp);
1505 put_group_info(current->group_info);
1506 } 1659 }
1507 return retval; 1660 return retval;
1508} 1661}
@@ -1630,20 +1783,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1630asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1783asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1631{ 1784{
1632 struct rlimit new_rlim, *old_rlim; 1785 struct rlimit new_rlim, *old_rlim;
1786 unsigned long it_prof_secs;
1633 int retval; 1787 int retval;
1634 1788
1635 if (resource >= RLIM_NLIMITS) 1789 if (resource >= RLIM_NLIMITS)
1636 return -EINVAL; 1790 return -EINVAL;
1637 if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1791 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1638 return -EFAULT; 1792 return -EFAULT;
1639 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1793 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1640 return -EINVAL; 1794 return -EINVAL;
1641 old_rlim = current->signal->rlim + resource; 1795 old_rlim = current->signal->rlim + resource;
1642 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1796 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1643 !capable(CAP_SYS_RESOURCE)) 1797 !capable(CAP_SYS_RESOURCE))
1644 return -EPERM; 1798 return -EPERM;
1645 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1799 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
1646 return -EPERM; 1800 return -EPERM;
1647 1801
1648 retval = security_task_setrlimit(resource, &new_rlim); 1802 retval = security_task_setrlimit(resource, &new_rlim);
1649 if (retval) 1803 if (retval)
@@ -1653,19 +1807,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1653 *old_rlim = new_rlim; 1807 *old_rlim = new_rlim;
1654 task_unlock(current->group_leader); 1808 task_unlock(current->group_leader);
1655 1809
1656 if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && 1810 if (resource != RLIMIT_CPU)
1657 (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 1811 goto out;
1658 new_rlim.rlim_cur <= cputime_to_secs( 1812
1659 current->signal->it_prof_expires))) { 1813 /*
1660 cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); 1814 * RLIMIT_CPU handling. Note that the kernel fails to return an error
1815 * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
1816 * very long-standing error, and fixing it now risks breakage of
1817 * applications, so we live with it
1818 */
1819 if (new_rlim.rlim_cur == RLIM_INFINITY)
1820 goto out;
1821
1822 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
1823 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1824 unsigned long rlim_cur = new_rlim.rlim_cur;
1825 cputime_t cputime;
1826
1827 if (rlim_cur == 0) {
1828 /*
1829 * The caller is asking for an immediate RLIMIT_CPU
1830 * expiry. But we use the zero value to mean "it was
1831 * never set". So let's cheat and make it one second
1832 * instead
1833 */
1834 rlim_cur = 1;
1835 }
1836 cputime = secs_to_cputime(rlim_cur);
1661 read_lock(&tasklist_lock); 1837 read_lock(&tasklist_lock);
1662 spin_lock_irq(&current->sighand->siglock); 1838 spin_lock_irq(&current->sighand->siglock);
1663 set_process_cpu_timer(current, CPUCLOCK_PROF, 1839 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1664 &cputime, NULL);
1665 spin_unlock_irq(&current->sighand->siglock); 1840 spin_unlock_irq(&current->sighand->siglock);
1666 read_unlock(&tasklist_lock); 1841 read_unlock(&tasklist_lock);
1667 } 1842 }
1668 1843out:
1669 return 0; 1844 return 0;
1670} 1845}
1671 1846
@@ -1677,9 +1852,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1677 * a lot simpler! (Which we're not doing right now because we're not 1852 * a lot simpler! (Which we're not doing right now because we're not
1678 * measuring them yet). 1853 * measuring them yet).
1679 * 1854 *
1680 * This expects to be called with tasklist_lock read-locked or better,
1681 * and the siglock not locked. It may momentarily take the siglock.
1682 *
1683 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1855 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1684 * races with threads incrementing their own counters. But since word 1856 * races with threads incrementing their own counters. But since word
1685 * reads are atomic, we either get new values or old values and we don't 1857 * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1859,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1687 * the c* fields from p->signal from races with exit.c updating those 1859 * the c* fields from p->signal from races with exit.c updating those
1688 * fields when reaping, so a sample either gets all the additions of a 1860 * fields when reaping, so a sample either gets all the additions of a
1689 * given child after it's reaped, or none so this sample is before reaping. 1861 * given child after it's reaped, or none so this sample is before reaping.
1862 *
1863 * tasklist_lock locking optimisation:
1864 * If we are current and single threaded, we do not need to take the tasklist
1865 * lock or the siglock. No one else can take our signal_struct away,
1866 * no one else can reap the children to update signal->c* counters, and
1867 * no one else can race with the signal-> fields.
1868 * If we do not take the tasklist_lock, the signal-> fields could be read
1869 * out of order while another thread was just exiting. So we place a
1870 * read memory barrier when we avoid the lock. On the writer side,
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases
1872 * the siglock spinlock after updating the signal-> fields.
1873 *
1874 * We don't really need the siglock when we access the non c* fields
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded
1876 * case, since we take the tasklist lock for read and the non c* signal->
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 *
1690 */ 1881 */
1691 1882
1692static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1883static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
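
The locking comment added above leans on a classic publish/consume pairing: __exit_signal() updates the signal counters and then drops the siglock (the unlock orders the stores), while the lockless single-threaded reader issues a read barrier before looking at the fields. A generic sketch of that pairing with made-up data; the struct, the flag and the explicit smp_wmb() stand in for the real code, which gets its write ordering from spin_unlock():

#include <asm/system.h>	/* smp_wmb()/smp_rmb() in this tree */

struct sample_stats {
	unsigned long utime;
	unsigned long stime;
	int published;		/* hypothetical "fields are final" flag */
};

static void writer(struct sample_stats *s, unsigned long u, unsigned long st)
{
	s->utime = u;
	s->stime = st;
	smp_wmb();		/* order the field updates before the flag */
	s->published = 1;
}

static int reader(const struct sample_stats *s,
		  unsigned long *u, unsigned long *st)
{
	if (!s->published)
		return 0;
	smp_rmb();		/* order the flag check before the field reads */
	*u = s->utime;
	*st = s->stime;
	return 1;
}
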
@@ -1694,13 +1885,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1694 struct task_struct *t; 1885 struct task_struct *t;
1695 unsigned long flags; 1886 unsigned long flags;
1696 cputime_t utime, stime; 1887 cputime_t utime, stime;
1888 int need_lock = 0;
1697 1889
1698 memset((char *) r, 0, sizeof *r); 1890 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero;
1699 1892
1700 if (unlikely(!p->signal)) 1893 if (p != current || !thread_group_empty(p))
1701 return; 1894 need_lock = 1;
1702 1895
1703 utime = stime = cputime_zero; 1896 if (need_lock) {
1897 read_lock(&tasklist_lock);
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1704 1905
1705 switch (who) { 1906 switch (who) {
1706 case RUSAGE_BOTH: 1907 case RUSAGE_BOTH:
@@ -1740,6 +1941,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1740 BUG(); 1941 BUG();
1741 } 1942 }
1742 1943
1944 if (need_lock)
1945 read_unlock(&tasklist_lock);
1743 cputime_to_timeval(utime, &r->ru_utime); 1946 cputime_to_timeval(utime, &r->ru_utime);
1744 cputime_to_timeval(stime, &r->ru_stime); 1947 cputime_to_timeval(stime, &r->ru_stime);
1745} 1948}
@@ -1747,9 +1950,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1747int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1950int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1748{ 1951{
1749 struct rusage r; 1952 struct rusage r;
1750 read_lock(&tasklist_lock);
1751 k_getrusage(p, who, &r); 1953 k_getrusage(p, who, &r);
1752 read_unlock(&tasklist_lock);
1753 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1954 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1754} 1955}
1755 1956
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 17313b99e5..5433195040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);
@@ -104,6 +108,8 @@ cond_syscall(sys_setreuid16);
104cond_syscall(sys_setuid16); 108cond_syscall(sys_setuid16);
105cond_syscall(sys_vm86old); 109cond_syscall(sys_vm86old);
106cond_syscall(sys_vm86); 110cond_syscall(sys_vm86);
111cond_syscall(compat_sys_ipc);
112cond_syscall(compat_sys_sysctl);
107 113
108/* arch-specific weak syscall entries */ 114/* arch-specific weak syscall entries */
109cond_syscall(sys_pciconfig_read); 115cond_syscall(sys_pciconfig_read);
@@ -114,3 +120,15 @@ cond_syscall(sys32_sysctl);
114cond_syscall(ppc_rtas); 120cond_syscall(ppc_rtas);
115cond_syscall(sys_spu_run); 121cond_syscall(sys_spu_run);
116cond_syscall(sys_spu_create); 122cond_syscall(sys_spu_create);
123
124/* mmu depending weak syscall entries */
125cond_syscall(sys_mprotect);
126cond_syscall(sys_msync);
127cond_syscall(sys_mlock);
128cond_syscall(sys_munlock);
129cond_syscall(sys_mlockall);
130cond_syscall(sys_munlockall);
131cond_syscall(sys_mincore);
132cond_syscall(sys_madvise);
133cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages);
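
Each cond_syscall() entry, including the new robust-futex and nommu ones, binds the syscall symbol to a weak alias of sys_ni_syscall() so the syscall table still links when the real implementation is configured out; callers then just get -ENOSYS. The exact macro is arch-specific, so what follows shows only the underlying linker trick as a user-space program; every name ending in _demo is made up:

#include <stdio.h>
#include <errno.h>

long sys_ni_syscall_demo(void)
{
	return -ENOSYS;
}

/* Stands in for a syscall whose real implementation was configured out. */
long sys_set_robust_list_demo(void)
	__attribute__((weak, alias("sys_ni_syscall_demo")));

int main(void)
{
	/* prints -ENOSYS (-38 on Linux) */
	printf("%ld\n", sys_set_robust_list_demo());
	return 0;
}
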
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71dd6f62ef..e82726faee 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,13 +44,14 @@
44#include <linux/limits.h> 44#include <linux/limits.h>
45#include <linux/dcache.h> 45#include <linux/dcache.h>
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/nfs_fs.h>
48#include <linux/acpi.h>
47 49
48#include <asm/uaccess.h> 50#include <asm/uaccess.h>
49#include <asm/processor.h> 51#include <asm/processor.h>
50 52
51#ifdef CONFIG_ROOT_NFS 53extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
52#include <linux/nfs_fs.h> 54 void __user *buffer, size_t *lenp, loff_t *ppos);
53#endif
54 55
55#if defined(CONFIG_SYSCTL) 56#if defined(CONFIG_SYSCTL)
56 57
@@ -126,7 +127,9 @@ extern int sysctl_hz_timer;
126extern int acct_parm[]; 127extern int acct_parm[];
127#endif 128#endif
128 129
129int randomize_va_space = 1; 130#ifdef CONFIG_IA64
131extern int no_unaligned_warning;
132#endif
130 133
131static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 134static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
132 ctl_table *, void **); 135 ctl_table *, void **);
@@ -640,6 +643,7 @@ static ctl_table kern_table[] = {
640 .proc_handler = &proc_dointvec, 643 .proc_handler = &proc_dointvec,
641 }, 644 },
642#endif 645#endif
646#if defined(CONFIG_MMU)
643 { 647 {
644 .ctl_name = KERN_RANDOMIZE, 648 .ctl_name = KERN_RANDOMIZE,
645 .procname = "randomize_va_space", 649 .procname = "randomize_va_space",
@@ -648,6 +652,7 @@ static ctl_table kern_table[] = {
648 .mode = 0644, 652 .mode = 0644,
649 .proc_handler = &proc_dointvec, 653 .proc_handler = &proc_dointvec,
650 }, 654 },
655#endif
651#if defined(CONFIG_S390) && defined(CONFIG_SMP) 656#if defined(CONFIG_S390) && defined(CONFIG_SMP)
652 { 657 {
653 .ctl_name = KERN_SPIN_RETRY, 658 .ctl_name = KERN_SPIN_RETRY,
@@ -658,6 +663,26 @@ static ctl_table kern_table[] = {
658 .proc_handler = &proc_dointvec, 663 .proc_handler = &proc_dointvec,
659 }, 664 },
660#endif 665#endif
666#ifdef CONFIG_ACPI_SLEEP
667 {
668 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
669 .procname = "acpi_video_flags",
670 .data = &acpi_video_flags,
671 .maxlen = sizeof (unsigned long),
672 .mode = 0644,
673 .proc_handler = &proc_doulongvec_minmax,
674 },
675#endif
676#ifdef CONFIG_IA64
677 {
678 .ctl_name = KERN_IA64_UNALIGNED,
679 .procname = "ignore-unaligned-usertrap",
680 .data = &no_unaligned_warning,
681 .maxlen = sizeof (int),
682 .mode = 0644,
683 .proc_handler = &proc_dointvec,
684 },
685#endif
661 { .ctl_name = 0 } 686 { .ctl_name = 0 }
662}; 687};
663 688
@@ -717,18 +742,18 @@ static ctl_table vm_table[] = {
717 { 742 {
718 .ctl_name = VM_DIRTY_WB_CS, 743 .ctl_name = VM_DIRTY_WB_CS,
719 .procname = "dirty_writeback_centisecs", 744 .procname = "dirty_writeback_centisecs",
720 .data = &dirty_writeback_centisecs, 745 .data = &dirty_writeback_interval,
721 .maxlen = sizeof(dirty_writeback_centisecs), 746 .maxlen = sizeof(dirty_writeback_interval),
722 .mode = 0644, 747 .mode = 0644,
723 .proc_handler = &dirty_writeback_centisecs_handler, 748 .proc_handler = &dirty_writeback_centisecs_handler,
724 }, 749 },
725 { 750 {
726 .ctl_name = VM_DIRTY_EXPIRE_CS, 751 .ctl_name = VM_DIRTY_EXPIRE_CS,
727 .procname = "dirty_expire_centisecs", 752 .procname = "dirty_expire_centisecs",
728 .data = &dirty_expire_centisecs, 753 .data = &dirty_expire_interval,
729 .maxlen = sizeof(dirty_expire_centisecs), 754 .maxlen = sizeof(dirty_expire_interval),
730 .mode = 0644, 755 .mode = 0644,
731 .proc_handler = &proc_dointvec, 756 .proc_handler = &proc_dointvec_userhz_jiffies,
732 }, 757 },
733 { 758 {
734 .ctl_name = VM_NR_PDFLUSH_THREADS, 759 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -823,9 +848,8 @@ static ctl_table vm_table[] = {
823 .data = &laptop_mode, 848 .data = &laptop_mode,
824 .maxlen = sizeof(laptop_mode), 849 .maxlen = sizeof(laptop_mode),
825 .mode = 0644, 850 .mode = 0644,
826 .proc_handler = &proc_dointvec, 851 .proc_handler = &proc_dointvec_jiffies,
827 .strategy = &sysctl_intvec, 852 .strategy = &sysctl_jiffies,
828 .extra1 = &zero,
829 }, 853 },
830 { 854 {
831 .ctl_name = VM_BLOCK_DUMP, 855 .ctl_name = VM_BLOCK_DUMP,
@@ -921,7 +945,7 @@ static ctl_table fs_table[] = {
921 .data = &files_stat, 945 .data = &files_stat,
922 .maxlen = 3*sizeof(int), 946 .maxlen = 3*sizeof(int),
923 .mode = 0444, 947 .mode = 0444,
924 .proc_handler = &proc_dointvec, 948 .proc_handler = &proc_nr_files,
925 }, 949 },
926 { 950 {
927 .ctl_name = FS_MAXFILE, 951 .ctl_name = FS_MAXFILE,
@@ -2029,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2029 int write, void *data) 2053 int write, void *data)
2030{ 2054{
2031 if (write) { 2055 if (write) {
2056 if (*lvalp > LONG_MAX / HZ)
2057 return 1;
2032 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); 2058 *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
2033 } else { 2059 } else {
2034 int val = *valp; 2060 int val = *valp;
@@ -2050,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2050 int write, void *data) 2076 int write, void *data)
2051{ 2077{
2052 if (write) { 2078 if (write) {
2079 if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
2080 return 1;
2053 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); 2081 *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
2054 } else { 2082 } else {
2055 int val = *valp; 2083 int val = *valp;
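
Both conversion helpers now refuse a write whose seconds value would overflow once multiplied by HZ (or scaled by USER_HZ), returning 1 so the sysctl write fails instead of storing a wrapped jiffies count. The guard in isolation, with made-up values and 250 standing in for HZ:

#include <stdio.h>
#include <limits.h>

static int secs_to_ticks(unsigned long secs, unsigned long hz, long *out)
{
	if (secs > LONG_MAX / hz)
		return 1;		/* would overflow, reject the write */
	*out = (long)(secs * hz);
	return 0;
}

int main(void)
{
	long ticks;

	printf("%d\n", secs_to_ticks(60, 250, &ticks));		/* 0: accepted */
	printf("%d\n", secs_to_ticks(LONG_MAX, 250, &ticks));	/* 1: rejected */
	return 0;
}
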
diff --git a/kernel/time.c b/kernel/time.c
index 804539165d..b00ddc71ce 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 202 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
203} 203}
204 204
205long pps_offset; /* pps time offset (us) */
206long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */
207
208long pps_freq; /* frequency offset (scaled ppm) */
209long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
210
211long pps_valid = PPS_VALID; /* pps signal watchdog counter */
212
213int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
214
215long pps_jitcnt; /* jitter limit exceeded */
216long pps_calcnt; /* calibration intervals */
217long pps_errcnt; /* calibration errors */
218long pps_stbcnt; /* stability limit exceeded */
219
220/* hook for a loadable hardpps kernel module */
221void (*hardpps_ptr)(struct timeval *);
222
223/* we call this to notify the arch when the clock is being 205/* we call this to notify the arch when the clock is being
224 * controlled. If no such arch routine, do nothing. 206 * controlled. If no such arch routine, do nothing.
225 */ 207 */
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc)
279 result = -EINVAL; 261 result = -EINVAL;
280 goto leave; 262 goto leave;
281 } 263 }
282 time_freq = txc->freq - pps_freq; 264 time_freq = txc->freq;
283 } 265 }
284 266
285 if (txc->modes & ADJ_MAXERROR) { 267 if (txc->modes & ADJ_MAXERROR) {
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc)
312 if ((time_next_adjust = txc->offset) == 0) 294 if ((time_next_adjust = txc->offset) == 0)
313 time_adjust = 0; 295 time_adjust = 0;
314 } 296 }
315 else if ( time_status & (STA_PLL | STA_PPSTIME) ) { 297 else if (time_status & STA_PLL) {
316 ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == 298 ltemp = txc->offset;
317 (STA_PPSTIME | STA_PPSSIGNAL) ?
318 pps_offset : txc->offset;
319 299
320 /* 300 /*
321 * Scale the phase adjustment and 301 * Scale the phase adjustment and
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc)
356 } 336 }
357 time_freq = min(time_freq, time_tolerance); 337 time_freq = min(time_freq, time_tolerance);
358 time_freq = max(time_freq, -time_tolerance); 338 time_freq = max(time_freq, -time_tolerance);
359 } /* STA_PLL || STA_PPSTIME */ 339 } /* STA_PLL */
360 } /* txc->modes & ADJ_OFFSET */ 340 } /* txc->modes & ADJ_OFFSET */
361 if (txc->modes & ADJ_TICK) { 341 if (txc->modes & ADJ_TICK) {
362 tick_usec = txc->tick; 342 tick_usec = txc->tick;
363 tick_nsec = TICK_USEC_TO_NSEC(tick_usec); 343 tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
364 } 344 }
365 } /* txc->modes */ 345 } /* txc->modes */
366leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 346leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
367 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
368 && (time_status & STA_PPSSIGNAL) == 0)
369 /* p. 24, (b) */
370 || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
371 == (STA_PPSTIME|STA_PPSJITTER))
372 /* p. 24, (c) */
373 || ((time_status & STA_PPSFREQ) != 0
374 && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
375 /* p. 24, (d) */
376 result = TIME_ERROR; 347 result = TIME_ERROR;
377 348
378 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 349 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
@@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
380 else { 351 else {
381 txc->offset = shift_right(time_offset, SHIFT_UPDATE); 352 txc->offset = shift_right(time_offset, SHIFT_UPDATE);
382 } 353 }
383 txc->freq = time_freq + pps_freq; 354 txc->freq = time_freq;
384 txc->maxerror = time_maxerror; 355 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 356 txc->esterror = time_esterror;
386 txc->status = time_status; 357 txc->status = time_status;
@@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
388 txc->precision = time_precision; 359 txc->precision = time_precision;
389 txc->tolerance = time_tolerance; 360 txc->tolerance = time_tolerance;
390 txc->tick = tick_usec; 361 txc->tick = tick_usec;
391 txc->ppsfreq = pps_freq; 362
392 txc->jitter = pps_jitter >> PPS_AVG; 363 /* PPS is not implemented, so these are zero */
393 txc->shift = pps_shift; 364 txc->ppsfreq = 0;
394 txc->stabil = pps_stabil; 365 txc->jitter = 0;
395 txc->jitcnt = pps_jitcnt; 366 txc->shift = 0;
396 txc->calcnt = pps_calcnt; 367 txc->stabil = 0;
397 txc->errcnt = pps_errcnt; 368 txc->jitcnt = 0;
398 txc->stbcnt = pps_stbcnt; 369 txc->calcnt = 0;
370 txc->errcnt = 0;
371 txc->stbcnt = 0;
399 write_sequnlock_irq(&xtime_lock); 372 write_sequnlock_irq(&xtime_lock);
400 do_gettimeofday(&txc->time); 373 do_gettimeofday(&txc->time);
401 notify_arch_cmos_timer(); 374 notify_arch_cmos_timer();
@@ -437,7 +410,7 @@ EXPORT_SYMBOL(current_kernel_time);
437 * current_fs_time - Return FS time 410 * current_fs_time - Return FS time
438 * @sb: Superblock. 411 * @sb: Superblock.
439 * 412 *
440 * Return the current time truncated to the time granuality supported by 413 * Return the current time truncated to the time granularity supported by
441 * the fs. 414 * the fs.
442 */ 415 */
443struct timespec current_fs_time(struct super_block *sb) 416struct timespec current_fs_time(struct super_block *sb)
@@ -448,11 +421,11 @@ struct timespec current_fs_time(struct super_block *sb)
448EXPORT_SYMBOL(current_fs_time); 421EXPORT_SYMBOL(current_fs_time);
449 422
450/** 423/**
451 * timespec_trunc - Truncate timespec to a granuality 424 * timespec_trunc - Truncate timespec to a granularity
452 * @t: Timespec 425 * @t: Timespec
453 * @gran: Granuality in ns. 426 * @gran: Granularity in ns.
454 * 427 *
455 * Truncate a timespec to a granuality. gran must be smaller than a second. 428 * Truncate a timespec to a granularity. gran must be smaller than a second.
456 * Always rounds down. 429 * Always rounds down.
457 * 430 *
458 * This function should be only used for timestamps returned by 431 * This function should be only used for timestamps returned by
@@ -637,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
637 * 610 *
638 * Returns the timespec representation of the nsec parameter. 611 * Returns the timespec representation of the nsec parameter.
639 */ 612 */
640struct timespec ns_to_timespec(const nsec_t nsec) 613struct timespec ns_to_timespec(const s64 nsec)
641{ 614{
642 struct timespec ts; 615 struct timespec ts;
643 616
@@ -657,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
657 * 630 *
658 * Returns the timeval representation of the nsec parameter. 631 * Returns the timeval representation of the nsec parameter.
659 */ 632 */
660struct timeval ns_to_timeval(const nsec_t nsec) 633struct timeval ns_to_timeval(const s64 nsec)
661{ 634{
662 struct timespec ts = ns_to_timespec(nsec); 635 struct timespec ts = ns_to_timespec(nsec);
663 struct timeval tv; 636 struct timeval tv;
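
The kernel/time.c hunks above strip the old PPS discipline out of do_adjtimex(): freq no longer carries a pps_freq contribution, the STA_PPS* branches are gone, and the PPS statistics returned to userspace are hard-coded to zero. A minimal userspace sketch (not part of the patch) that queries the NTP state through adjtimex(2) and prints the fields affected here; after this change the second line should show all zeros:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex txc = { .modes = 0 };      /* read-only query */
        int state = adjtimex(&txc);

        if (state == -1) {
                perror("adjtimex");
                return 1;
        }
        printf("state=%d freq=%ld status=0x%x\n", state, txc.freq, txc.status);
        printf("ppsfreq=%ld jitter=%ld shift=%d stabil=%ld jitcnt=%ld\n",
               txc.ppsfreq, txc.jitter, txc.shift, txc.stabil, txc.jitcnt);
        return 0;
}
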
diff --git a/kernel/timer.c b/kernel/timer.c
index b9dad39946..9e49deed46 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
54/* 54/*
55 * per-CPU timer vector definitions: 55 * per-CPU timer vector definitions:
56 */ 56 */
57
58#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) 57#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
59#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) 58#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
60#define TVN_SIZE (1 << TVN_BITS) 59#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
62#define TVN_MASK (TVN_SIZE - 1) 61#define TVN_MASK (TVN_SIZE - 1)
63#define TVR_MASK (TVR_SIZE - 1) 62#define TVR_MASK (TVR_SIZE - 1)
64 63
65struct timer_base_s {
66 spinlock_t lock;
67 struct timer_list *running_timer;
68};
69
70typedef struct tvec_s { 64typedef struct tvec_s {
71 struct list_head vec[TVN_SIZE]; 65 struct list_head vec[TVN_SIZE];
72} tvec_t; 66} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
76} tvec_root_t; 70} tvec_root_t;
77 71
78struct tvec_t_base_s { 72struct tvec_t_base_s {
79 struct timer_base_s t_base; 73 spinlock_t lock;
74 struct timer_list *running_timer;
80 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
81 tvec_root_t tv1; 76 tvec_root_t tv1;
82 tvec_t tv2; 77 tvec_t tv2;
@@ -86,13 +81,16 @@ struct tvec_t_base_s {
86} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
87 82
88typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
89static DEFINE_PER_CPU(tvec_base_t, tvec_bases); 84
85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
90 88
91static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
92 struct timer_list *timer) 90 struct timer_list *timer)
93{ 91{
94#ifdef CONFIG_SMP 92#ifdef CONFIG_SMP
95 base->t_base.running_timer = timer; 93 base->running_timer = timer;
96#endif 94#endif
97} 95}
98 96
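
The hunks above replace the per-CPU tvec_base_t structure with a per-CPU pointer that is statically initialised to point at boot_tvec_bases, so the boot CPU has a usable timer base before the per-CPU areas and the allocators exist (secondary CPUs get theirs allocated later in init_timers_cpu()). A hedged, self-contained sketch of that pattern; struct my_base and its users are hypothetical, only the DEFINE_PER_CPU/__get_cpu_var idiom mirrors the patch:

#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct my_base {
        spinlock_t lock;
        unsigned long stamp;
};

/* static instance, usable before any allocator is initialised */
static struct my_base boot_base = { .lock = SPIN_LOCK_UNLOCKED };
static DEFINE_PER_CPU(struct my_base *, my_bases) = { &boot_base };

static void touch_local_base(void)
{
        /* note: dereference the per-CPU pointer, not &per_cpu(...) */
        struct my_base *base = __get_cpu_var(my_bases);

        spin_lock(&base->lock);
        base->stamp = jiffies;
        spin_unlock(&base->lock);
}
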
@@ -138,15 +136,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
138 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
139} 137}
140 138
141typedef struct timer_base_s timer_base_t;
142/*
143 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
144 * at compile time, and we need timer->base to lock the timer.
145 */
146timer_base_t __init_timer_base
147 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
148EXPORT_SYMBOL(__init_timer_base);
149
150/*** 139/***
151 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
152 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
@@ -157,7 +146,7 @@ EXPORT_SYMBOL(__init_timer_base);
157void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
158{ 147{
159 timer->entry.next = NULL; 148 timer->entry.next = NULL;
160 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
161} 150}
162EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
163 152
@@ -173,7 +162,7 @@ static inline void detach_timer(struct timer_list *timer,
173} 162}
174 163
175/* 164/*
176 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock 165 * We are using hashed locking: holding per_cpu(tvec_bases).lock
177 * means that all timers which are tied to this base via timer->base are 166 * means that all timers which are tied to this base via timer->base are
178 * locked, and the base itself is locked too. 167 * locked, and the base itself is locked too.
179 * 168 *
@@ -184,10 +173,10 @@ static inline void detach_timer(struct timer_list *timer,
184 * possible to set timer->base = NULL and drop the lock: the timer remains 173 * possible to set timer->base = NULL and drop the lock: the timer remains
185 * locked. 174 * locked.
186 */ 175 */
187static timer_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
188 unsigned long *flags) 177 unsigned long *flags)
189{ 178{
190 timer_base_t *base; 179 tvec_base_t *base;
191 180
192 for (;;) { 181 for (;;) {
193 base = timer->base; 182 base = timer->base;
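
The hunk above only shows the first lines of lock_timer_base(); the comment it touches describes the full retry loop. A sketch of that loop as it presumably looks after the conversion to tvec_base_t, reconstructed from the 2.6.16-era source, so treat the details as an approximation rather than the exact patched code:

static tvec_base_t *lock_timer_base(struct timer_list *timer,
                                    unsigned long *flags)
{
        tvec_base_t *base;

        for (;;) {
                base = timer->base;
                if (likely(base != NULL)) {
                        spin_lock_irqsave(&base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;    /* still ours, return locked */
                        /* the timer migrated while we were taking the lock */
                        spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}
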
@@ -204,8 +193,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
204 193
205int __mod_timer(struct timer_list *timer, unsigned long expires) 194int __mod_timer(struct timer_list *timer, unsigned long expires)
206{ 195{
207 timer_base_t *base; 196 tvec_base_t *base, *new_base;
208 tvec_base_t *new_base;
209 unsigned long flags; 197 unsigned long flags;
210 int ret = 0; 198 int ret = 0;
211 199
@@ -218,9 +206,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
218 ret = 1; 206 ret = 1;
219 } 207 }
220 208
221 new_base = &__get_cpu_var(tvec_bases); 209 new_base = __get_cpu_var(tvec_bases);
222 210
223 if (base != &new_base->t_base) { 211 if (base != new_base) {
224 /* 212 /*
225 * We are trying to schedule the timer on the local CPU. 213 * We are trying to schedule the timer on the local CPU.
226 * However we can't change timer's base while it is running, 214 * However we can't change timer's base while it is running,
@@ -228,21 +216,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
228 * handler yet has not finished. This also guarantees that 216 * handler yet has not finished. This also guarantees that
229 * the timer is serialized wrt itself. 217 * the timer is serialized wrt itself.
230 */ 218 */
231 if (unlikely(base->running_timer == timer)) { 219 if (likely(base->running_timer != timer)) {
232 /* The timer remains on a former base */
233 new_base = container_of(base, tvec_base_t, t_base);
234 } else {
235 /* See the comment in lock_timer_base() */ 220 /* See the comment in lock_timer_base() */
236 timer->base = NULL; 221 timer->base = NULL;
237 spin_unlock(&base->lock); 222 spin_unlock(&base->lock);
238 spin_lock(&new_base->t_base.lock); 223 base = new_base;
239 timer->base = &new_base->t_base; 224 spin_lock(&base->lock);
225 timer->base = base;
240 } 226 }
241 } 227 }
242 228
243 timer->expires = expires; 229 timer->expires = expires;
244 internal_add_timer(new_base, timer); 230 internal_add_timer(base, timer);
245 spin_unlock_irqrestore(&new_base->t_base.lock, flags); 231 spin_unlock_irqrestore(&base->lock, flags);
246 232
247 return ret; 233 return ret;
248} 234}
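
These __mod_timer() hunks only change the internal base handling; the external timer API is untouched. For context, a minimal driver-style usage sketch of that API as it stood in this era (the names here are illustrative only):

#include <linux/jiffies.h>
#include <linux/timer.h>

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
        /* runs in softirq context once the timeout expires */
}

static void my_start(void)
{
        init_timer(&my_timer);
        my_timer.function = my_timer_fn;
        my_timer.data = 0;
        mod_timer(&my_timer, jiffies + HZ);     /* fire in about one second */
}

static void my_stop(void)
{
        del_timer_sync(&my_timer);              /* waits for a running handler */
}
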
@@ -258,14 +244,14 @@ EXPORT_SYMBOL(__mod_timer);
258 */ 244 */
259void add_timer_on(struct timer_list *timer, int cpu) 245void add_timer_on(struct timer_list *timer, int cpu)
260{ 246{
261 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 247 tvec_base_t *base = per_cpu(tvec_bases, cpu);
262 unsigned long flags; 248 unsigned long flags;
263 249
264 BUG_ON(timer_pending(timer) || !timer->function); 250 BUG_ON(timer_pending(timer) || !timer->function);
265 spin_lock_irqsave(&base->t_base.lock, flags); 251 spin_lock_irqsave(&base->lock, flags);
266 timer->base = &base->t_base; 252 timer->base = base;
267 internal_add_timer(base, timer); 253 internal_add_timer(base, timer);
268 spin_unlock_irqrestore(&base->t_base.lock, flags); 254 spin_unlock_irqrestore(&base->lock, flags);
269} 255}
270 256
271 257
@@ -318,7 +304,7 @@ EXPORT_SYMBOL(mod_timer);
318 */ 304 */
319int del_timer(struct timer_list *timer) 305int del_timer(struct timer_list *timer)
320{ 306{
321 timer_base_t *base; 307 tvec_base_t *base;
322 unsigned long flags; 308 unsigned long flags;
323 int ret = 0; 309 int ret = 0;
324 310
@@ -345,7 +331,7 @@ EXPORT_SYMBOL(del_timer);
345 */ 331 */
346int try_to_del_timer_sync(struct timer_list *timer) 332int try_to_del_timer_sync(struct timer_list *timer)
347{ 333{
348 timer_base_t *base; 334 tvec_base_t *base;
349 unsigned long flags; 335 unsigned long flags;
350 int ret = -1; 336 int ret = -1;
351 337
@@ -409,7 +395,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
409 struct timer_list *tmp; 395 struct timer_list *tmp;
410 396
411 tmp = list_entry(curr, struct timer_list, entry); 397 tmp = list_entry(curr, struct timer_list, entry);
412 BUG_ON(tmp->base != &base->t_base); 398 BUG_ON(tmp->base != base);
413 curr = curr->next; 399 curr = curr->next;
414 internal_add_timer(base, tmp); 400 internal_add_timer(base, tmp);
415 } 401 }
@@ -431,7 +417,7 @@ static inline void __run_timers(tvec_base_t *base)
431{ 417{
432 struct timer_list *timer; 418 struct timer_list *timer;
433 419
434 spin_lock_irq(&base->t_base.lock); 420 spin_lock_irq(&base->lock);
435 while (time_after_eq(jiffies, base->timer_jiffies)) { 421 while (time_after_eq(jiffies, base->timer_jiffies)) {
436 struct list_head work_list = LIST_HEAD_INIT(work_list); 422 struct list_head work_list = LIST_HEAD_INIT(work_list);
437 struct list_head *head = &work_list; 423 struct list_head *head = &work_list;
@@ -457,7 +443,7 @@ static inline void __run_timers(tvec_base_t *base)
457 443
458 set_running_timer(base, timer); 444 set_running_timer(base, timer);
459 detach_timer(timer, 1); 445 detach_timer(timer, 1);
460 spin_unlock_irq(&base->t_base.lock); 446 spin_unlock_irq(&base->lock);
461 { 447 {
462 int preempt_count = preempt_count(); 448 int preempt_count = preempt_count();
463 fn(data); 449 fn(data);
@@ -470,11 +456,11 @@ static inline void __run_timers(tvec_base_t *base)
470 BUG(); 456 BUG();
471 } 457 }
472 } 458 }
473 spin_lock_irq(&base->t_base.lock); 459 spin_lock_irq(&base->lock);
474 } 460 }
475 } 461 }
476 set_running_timer(base, NULL); 462 set_running_timer(base, NULL);
477 spin_unlock_irq(&base->t_base.lock); 463 spin_unlock_irq(&base->lock);
478} 464}
479 465
480#ifdef CONFIG_NO_IDLE_HZ 466#ifdef CONFIG_NO_IDLE_HZ
@@ -489,11 +475,23 @@ unsigned long next_timer_interrupt(void)
489 struct list_head *list; 475 struct list_head *list;
490 struct timer_list *nte; 476 struct timer_list *nte;
491 unsigned long expires; 477 unsigned long expires;
478 unsigned long hr_expires = MAX_JIFFY_OFFSET;
479 ktime_t hr_delta;
492 tvec_t *varray[4]; 480 tvec_t *varray[4];
493 int i, j; 481 int i, j;
494 482
495 base = &__get_cpu_var(tvec_bases); 483 hr_delta = hrtimer_get_next_event();
496 spin_lock(&base->t_base.lock); 484 if (hr_delta.tv64 != KTIME_MAX) {
485 struct timespec tsdelta;
486 tsdelta = ktime_to_timespec(hr_delta);
487 hr_expires = timespec_to_jiffies(&tsdelta);
488 if (hr_expires < 3)
489 return hr_expires + jiffies;
490 }
491 hr_expires += jiffies;
492
493 base = __get_cpu_var(tvec_bases);
494 spin_lock(&base->lock);
497 expires = base->timer_jiffies + (LONG_MAX >> 1); 495 expires = base->timer_jiffies + (LONG_MAX >> 1);
498 list = NULL; 496 list = NULL;
499 497
@@ -541,7 +539,27 @@ found:
541 expires = nte->expires; 539 expires = nte->expires;
542 } 540 }
543 } 541 }
544 spin_unlock(&base->t_base.lock); 542 spin_unlock(&base->lock);
543
544 /*
545 * It can happen that other CPUs service timer IRQs and increment
546 * jiffies, but we have not yet got a local timer tick to process
547 * the timer wheels. In that case, the expiry time can be before
548 * jiffies, but since the high-resolution timer here is relative to
549 * jiffies, the default expression when high-resolution timers are
550 * not active,
551 *
552 * time_before(MAX_JIFFY_OFFSET + jiffies, expires)
553 *
554 * would falsely evaluate to true. If that is the case, just
555 * return jiffies so that we can immediately fire the local timer
556 */
557 if (time_before(expires, jiffies))
558 return jiffies;
559
560 if (time_before(hr_expires, expires))
561 return hr_expires;
562
545 return expires; 563 return expires;
546} 564}
547#endif 565#endif
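
next_timer_interrupt() still returns an absolute jiffies value, now clamped so it is never in the past and never later than the next pending hrtimer. A hedged sketch of how a CONFIG_NO_IDLE_HZ arch idle path of that era might consume it; the surrounding function is hypothetical:

static unsigned long my_idle_sleep_ticks(void)
{
        unsigned long next = next_timer_interrupt();

        /*
         * 'next' is absolute and, thanks to the clamp added above, never
         * before the current jiffies value, so this cannot underflow.
         */
        return next - jiffies;
}
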
@@ -680,18 +698,9 @@ static void second_overflow(void)
680 698
681 /* 699 /*
682 * Compute the frequency estimate and additional phase adjustment due 700 * Compute the frequency estimate and additional phase adjustment due
683 * to frequency error for the next second. When the PPS signal is 701 * to frequency error for the next second.
684 * engaged, gnaw on the watchdog counter and update the frequency
685 * computed by the pll and the PPS signal.
686 */ 702 */
687 pps_valid++; 703 ltemp = time_freq;
688 if (pps_valid == PPS_VALID) { /* PPS signal lost */
689 pps_jitter = MAXTIME;
690 pps_stabil = MAXFREQ;
691 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
692 STA_PPSWANDER | STA_PPSERROR);
693 }
694 ltemp = time_freq + pps_freq;
695 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); 704 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
696 705
697#if HZ == 100 706#if HZ == 100
@@ -717,12 +726,16 @@ static void second_overflow(void)
717#endif 726#endif
718} 727}
719 728
720/* in the NTP reference this is called "hardclock()" */ 729/*
721static void update_wall_time_one_tick(void) 730 * Returns how many microseconds we need to add to xtime this tick
731 * in doing an adjustment requested with adjtime.
732 */
733static long adjtime_adjustment(void)
722{ 734{
723 long time_adjust_step, delta_nsec; 735 long time_adjust_step;
724 736
725 if ((time_adjust_step = time_adjust) != 0 ) { 737 time_adjust_step = time_adjust;
738 if (time_adjust_step) {
726 /* 739 /*
727 * We are doing an adjtime thing. Prepare time_adjust_step to 740 * We are doing an adjtime thing. Prepare time_adjust_step to
728 * be within bounds. Note that a positive time_adjust means we 741 * be within bounds. Note that a positive time_adjust means we
@@ -733,10 +746,19 @@ static void update_wall_time_one_tick(void)
733 */ 746 */
734 time_adjust_step = min(time_adjust_step, (long)tickadj); 747 time_adjust_step = min(time_adjust_step, (long)tickadj);
735 time_adjust_step = max(time_adjust_step, (long)-tickadj); 748 time_adjust_step = max(time_adjust_step, (long)-tickadj);
749 }
750 return time_adjust_step;
751}
736 752
753/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void)
755{
756 long time_adjust_step, delta_nsec;
757
758 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step)
737 /* Reduce by this step the amount of time left */ 760 /* Reduce by this step the amount of time left */
738 time_adjust -= time_adjust_step; 761 time_adjust -= time_adjust_step;
739 }
740 delta_nsec = tick_nsec + time_adjust_step * 1000; 762 delta_nsec = tick_nsec + time_adjust_step * 1000;
741 /* 763 /*
742 * Advance the phase, once it gets to one microsecond, then 764 * Advance the phase, once it gets to one microsecond, then
@@ -759,6 +781,22 @@ static void update_wall_time_one_tick(void)
759} 781}
760 782
761/* 783/*
784 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
788 * bits to the right of the binary point.
789 * This function has no side-effects.
790 */
791u64 current_tick_length(void)
792{
793 long delta_nsec;
794
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
797}
798
799/*
762 * Using a loop looks inefficient, but "ticks" is 800 * Using a loop looks inefficient, but "ticks" is
763 * usually just one (we shouldn't be losing ticks, 801 * usually just one (we shouldn't be losing ticks,
764 * we're doing this this way mainly for interrupt 802 * we're doing this this way mainly for interrupt
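
current_tick_length() reports the current tick in fixed-point nanoseconds with SHIFT_SCALE - 10 fractional bits (SHIFT_SCALE comes from <linux/timex.h>). A hypothetical consumer, not part of the patch, would convert back to whole nanoseconds like this; with no adjtime() in progress the result is roughly NSEC_PER_SEC / HZ:

static inline u64 my_tick_length_ns(void)
{
        /* drop the SHIFT_SCALE - 10 fractional bits */
        return current_tick_length() >> (SHIFT_SCALE - 10);
}
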
@@ -804,7 +842,7 @@ void update_process_times(int user_tick)
804 */ 842 */
805static unsigned long count_active_tasks(void) 843static unsigned long count_active_tasks(void)
806{ 844{
807 return (nr_running() + nr_uninterruptible()) * FIXED_1; 845 return nr_active() * FIXED_1;
808} 846}
809 847
810/* 848/*
@@ -856,7 +894,7 @@ EXPORT_SYMBOL(xtime_lock);
856 */ 894 */
857static void run_timer_softirq(struct softirq_action *h) 895static void run_timer_softirq(struct softirq_action *h)
858{ 896{
859 tvec_base_t *base = &__get_cpu_var(tvec_bases); 897 tvec_base_t *base = __get_cpu_var(tvec_bases);
860 898
861 hrtimer_run_queues(); 899 hrtimer_run_queues();
862 if (time_after_eq(jiffies, base->timer_jiffies)) 900 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -869,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h)
869void run_local_timers(void) 907void run_local_timers(void)
870{ 908{
871 raise_softirq(TIMER_SOFTIRQ); 909 raise_softirq(TIMER_SOFTIRQ);
910 softlockup_tick();
872} 911}
873 912
874/* 913/*
@@ -896,8 +935,9 @@ static inline void update_times(void)
896void do_timer(struct pt_regs *regs) 935void do_timer(struct pt_regs *regs)
897{ 936{
898 jiffies_64++; 937 jiffies_64++;
938 /* prevent loading jiffies before storing new jiffies_64 value. */
939 barrier();
899 update_times(); 940 update_times();
900 softlockup_tick(regs);
901} 941}
902 942
903#ifdef __ARCH_WANT_SYS_ALARM 943#ifdef __ARCH_WANT_SYS_ALARM
@@ -908,19 +948,7 @@ void do_timer(struct pt_regs *regs)
908 */ 948 */
909asmlinkage unsigned long sys_alarm(unsigned int seconds) 949asmlinkage unsigned long sys_alarm(unsigned int seconds)
910{ 950{
911 struct itimerval it_new, it_old; 951 return alarm_setitimer(seconds);
912 unsigned int oldalarm;
913
914 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
915 it_new.it_value.tv_sec = seconds;
916 it_new.it_value.tv_usec = 0;
917 do_setitimer(ITIMER_REAL, &it_new, &it_old);
918 oldalarm = it_old.it_value.tv_sec;
919 /* ehhh.. We can't return 0 if we have an alarm pending.. */
920 /* And we'd better return too much than too little anyway */
921 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
922 oldalarm++;
923 return oldalarm;
924} 952}
925 953
926#endif 954#endif
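
sys_alarm() now delegates to alarm_setitimer(), which presumably keeps the behaviour the deleted block implemented by hand: program ITIMER_REAL and report the previous alarm rounded up, so a pending alarm never reads back as zero. The userspace-visible semantics are unchanged, as in this small illustrative program:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        unsigned int left;

        alarm(10);              /* schedule SIGALRM in 10 seconds */
        sleep(3);
        left = alarm(0);        /* cancel it and read the time remaining */
        printf("about %u seconds were left\n", left);   /* ~7, rounded up */
        return 0;
}
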
@@ -1209,13 +1237,41 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1209 return 0; 1237 return 0;
1210} 1238}
1211 1239
1212static void __devinit init_timers_cpu(int cpu) 1240static int __devinit init_timers_cpu(int cpu)
1213{ 1241{
1214 int j; 1242 int j;
1215 tvec_base_t *base; 1243 tvec_base_t *base;
1244 static char __devinitdata tvec_base_done[NR_CPUS];
1245
1246 if (!tvec_base_done[cpu]) {
1247 static char boot_done;
1216 1248
1217 base = &per_cpu(tvec_bases, cpu); 1249 if (boot_done) {
1218 spin_lock_init(&base->t_base.lock); 1250 /*
1251 * The APs use this path later in boot
1252 */
1253 base = kmalloc_node(sizeof(*base), GFP_KERNEL,
1254 cpu_to_node(cpu));
1255 if (!base)
1256 return -ENOMEM;
1257 memset(base, 0, sizeof(*base));
1258 per_cpu(tvec_bases, cpu) = base;
1259 } else {
1260 /*
1261 * This is for the boot CPU - we use compile-time
1262 * static initialisation because per-cpu memory isn't
1263 * ready yet and because the memory allocators are not
1264 * initialised either.
1265 */
1266 boot_done = 1;
1267 base = &boot_tvec_bases;
1268 }
1269 tvec_base_done[cpu] = 1;
1270 } else {
1271 base = per_cpu(tvec_bases, cpu);
1272 }
1273
1274 spin_lock_init(&base->lock);
1219 for (j = 0; j < TVN_SIZE; j++) { 1275 for (j = 0; j < TVN_SIZE; j++) {
1220 INIT_LIST_HEAD(base->tv5.vec + j); 1276 INIT_LIST_HEAD(base->tv5.vec + j);
1221 INIT_LIST_HEAD(base->tv4.vec + j); 1277 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1226,6 +1282,7 @@ static void __devinit init_timers_cpu(int cpu)
1226 INIT_LIST_HEAD(base->tv1.vec + j); 1282 INIT_LIST_HEAD(base->tv1.vec + j);
1227 1283
1228 base->timer_jiffies = jiffies; 1284 base->timer_jiffies = jiffies;
1285 return 0;
1229} 1286}
1230 1287
1231#ifdef CONFIG_HOTPLUG_CPU 1288#ifdef CONFIG_HOTPLUG_CPU
@@ -1236,7 +1293,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1236 while (!list_empty(head)) { 1293 while (!list_empty(head)) {
1237 timer = list_entry(head->next, struct timer_list, entry); 1294 timer = list_entry(head->next, struct timer_list, entry);
1238 detach_timer(timer, 0); 1295 detach_timer(timer, 0);
1239 timer->base = &new_base->t_base; 1296 timer->base = new_base;
1240 internal_add_timer(new_base, timer); 1297 internal_add_timer(new_base, timer);
1241 } 1298 }
1242} 1299}
@@ -1248,15 +1305,15 @@ static void __devinit migrate_timers(int cpu)
1248 int i; 1305 int i;
1249 1306
1250 BUG_ON(cpu_online(cpu)); 1307 BUG_ON(cpu_online(cpu));
1251 old_base = &per_cpu(tvec_bases, cpu); 1308 old_base = per_cpu(tvec_bases, cpu);
1252 new_base = &get_cpu_var(tvec_bases); 1309 new_base = get_cpu_var(tvec_bases);
1253 1310
1254 local_irq_disable(); 1311 local_irq_disable();
1255 spin_lock(&new_base->t_base.lock); 1312 spin_lock(&new_base->lock);
1256 spin_lock(&old_base->t_base.lock); 1313 spin_lock(&old_base->lock);
1314
1315 BUG_ON(old_base->running_timer);
1257 1316
1258 if (old_base->t_base.running_timer)
1259 BUG();
1260 for (i = 0; i < TVR_SIZE; i++) 1317 for (i = 0; i < TVR_SIZE; i++)
1261 migrate_timer_list(new_base, old_base->tv1.vec + i); 1318 migrate_timer_list(new_base, old_base->tv1.vec + i);
1262 for (i = 0; i < TVN_SIZE; i++) { 1319 for (i = 0; i < TVN_SIZE; i++) {
@@ -1266,20 +1323,21 @@ static void __devinit migrate_timers(int cpu)
1266 migrate_timer_list(new_base, old_base->tv5.vec + i); 1323 migrate_timer_list(new_base, old_base->tv5.vec + i);
1267 } 1324 }
1268 1325
1269 spin_unlock(&old_base->t_base.lock); 1326 spin_unlock(&old_base->lock);
1270 spin_unlock(&new_base->t_base.lock); 1327 spin_unlock(&new_base->lock);
1271 local_irq_enable(); 1328 local_irq_enable();
1272 put_cpu_var(tvec_bases); 1329 put_cpu_var(tvec_bases);
1273} 1330}
1274#endif /* CONFIG_HOTPLUG_CPU */ 1331#endif /* CONFIG_HOTPLUG_CPU */
1275 1332
1276static int __devinit timer_cpu_notify(struct notifier_block *self, 1333static int timer_cpu_notify(struct notifier_block *self,
1277 unsigned long action, void *hcpu) 1334 unsigned long action, void *hcpu)
1278{ 1335{
1279 long cpu = (long)hcpu; 1336 long cpu = (long)hcpu;
1280 switch(action) { 1337 switch(action) {
1281 case CPU_UP_PREPARE: 1338 case CPU_UP_PREPARE:
1282 init_timers_cpu(cpu); 1339 if (init_timers_cpu(cpu) < 0)
1340 return NOTIFY_BAD;
1283 break; 1341 break;
1284#ifdef CONFIG_HOTPLUG_CPU 1342#ifdef CONFIG_HOTPLUG_CPU
1285 case CPU_DEAD: 1343 case CPU_DEAD:
@@ -1292,7 +1350,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1292 return NOTIFY_OK; 1350 return NOTIFY_OK;
1293} 1351}
1294 1352
1295static struct notifier_block __devinitdata timers_nb = { 1353static struct notifier_block timers_nb = {
1296 .notifier_call = timer_cpu_notify, 1354 .notifier_call = timer_cpu_notify,
1297}; 1355};
1298 1356
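
Because init_timers_cpu() can now fail (kmalloc_node() for a secondary CPU may return NULL), the notifier propagates that as NOTIFY_BAD, which vetoes the CPU before it is brought online. A hedged sketch of the same CPU-hotplug callback pattern with hypothetical helpers:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_prepare_cpu(long cpu) { return 0; }       /* stub: allocate per-CPU state */
static void my_release_cpu(long cpu) { }                /* stub: free it again */

static int my_cpu_notify(struct notifier_block *self,
                         unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        switch (action) {
        case CPU_UP_PREPARE:
                if (my_prepare_cpu(cpu) < 0)
                        return NOTIFY_BAD;      /* abort the bring-up */
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                my_release_cpu(cpu);
                break;
#endif
        }
        return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
        .notifier_call = my_cpu_notify,
};
/* registered during boot with register_cpu_notifier(&my_cpu_nb) */
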
@@ -1307,8 +1365,8 @@ void __init init_timers(void)
1307 1365
1308#ifdef CONFIG_TIME_INTERPOLATION 1366#ifdef CONFIG_TIME_INTERPOLATION
1309 1367
1310struct time_interpolator *time_interpolator; 1368struct time_interpolator *time_interpolator __read_mostly;
1311static struct time_interpolator *time_interpolator_list; 1369static struct time_interpolator *time_interpolator_list __read_mostly;
1312static DEFINE_SPINLOCK(time_interpolator_lock); 1370static DEFINE_SPINLOCK(time_interpolator_lock);
1313 1371
1314static inline u64 time_interpolator_get_cycles(unsigned int src) 1372static inline u64 time_interpolator_get_cycles(unsigned int src)
@@ -1322,10 +1380,10 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1322 return x(); 1380 return x();
1323 1381
1324 case TIME_SOURCE_MMIO64 : 1382 case TIME_SOURCE_MMIO64 :
1325 return readq((void __iomem *) time_interpolator->addr); 1383 return readq_relaxed((void __iomem *)time_interpolator->addr);
1326 1384
1327 case TIME_SOURCE_MMIO32 : 1385 case TIME_SOURCE_MMIO32 :
1328 return readl((void __iomem *) time_interpolator->addr); 1386 return readl_relaxed((void __iomem *)time_interpolator->addr);
1329 1387
1330 default: return get_cycles(); 1388 default: return get_cycles();
1331 } 1389 }
@@ -1422,7 +1480,7 @@ static void time_interpolator_update(long delta_nsec)
1422 */ 1480 */
1423 if (jiffies % INTERPOLATOR_ADJUST == 0) 1481 if (jiffies % INTERPOLATOR_ADJUST == 0)
1424 { 1482 {
1425 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC) 1483 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1426 time_interpolator->nsec_per_cyc--; 1484 time_interpolator->nsec_per_cyc--;
1427 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) 1485 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1428 time_interpolator->nsec_per_cyc++; 1486 time_interpolator->nsec_per_cyc++;
@@ -1446,8 +1504,7 @@ register_time_interpolator(struct time_interpolator *ti)
1446 unsigned long flags; 1504 unsigned long flags;
1447 1505
1448 /* Sanity check */ 1506 /* Sanity check */
1449 if (ti->frequency == 0 || ti->mask == 0) 1507 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1450 BUG();
1451 1508
1452 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; 1509 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1453 spin_lock(&time_interpolator_lock); 1510 spin_lock(&time_interpolator_lock);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index aa25605027..187e2a4238 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -20,43 +20,67 @@
20 20
21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) 21asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
22{ 22{
23 return sys_chown(filename, low2highuid(user), low2highgid(group)); 23 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
24 /* avoid REGPARM breakage on x86: */
25 prevent_tail_call(ret);
26 return ret;
24} 27}
25 28
26asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) 29asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
27{ 30{
28 return sys_lchown(filename, low2highuid(user), low2highgid(group)); 31 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
32 /* avoid REGPARM breakage on x86: */
33 prevent_tail_call(ret);
34 return ret;
29} 35}
30 36
31asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) 37asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
32{ 38{
33 return sys_fchown(fd, low2highuid(user), low2highgid(group)); 39 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
40 /* avoid REGPARM breakage on x86: */
41 prevent_tail_call(ret);
42 return ret;
34} 43}
35 44
36asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) 45asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
37{ 46{
38 return sys_setregid(low2highgid(rgid), low2highgid(egid)); 47 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
48 /* avoid REGPARM breakage on x86: */
49 prevent_tail_call(ret);
50 return ret;
39} 51}
40 52
41asmlinkage long sys_setgid16(old_gid_t gid) 53asmlinkage long sys_setgid16(old_gid_t gid)
42{ 54{
43 return sys_setgid(low2highgid(gid)); 55 long ret = sys_setgid(low2highgid(gid));
56 /* avoid REGPARM breakage on x86: */
57 prevent_tail_call(ret);
58 return ret;
44} 59}
45 60
46asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) 61asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
47{ 62{
48 return sys_setreuid(low2highuid(ruid), low2highuid(euid)); 63 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
64 /* avoid REGPARM breakage on x86: */
65 prevent_tail_call(ret);
66 return ret;
49} 67}
50 68
51asmlinkage long sys_setuid16(old_uid_t uid) 69asmlinkage long sys_setuid16(old_uid_t uid)
52{ 70{
53 return sys_setuid(low2highuid(uid)); 71 long ret = sys_setuid(low2highuid(uid));
72 /* avoid REGPARM breakage on x86: */
73 prevent_tail_call(ret);
74 return ret;
54} 75}
55 76
56asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) 77asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
57{ 78{
58 return sys_setresuid(low2highuid(ruid), low2highuid(euid), 79 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
59 low2highuid(suid)); 80 low2highuid(suid));
81 /* avoid REGPARM breakage on x86: */
82 prevent_tail_call(ret);
83 return ret;
60} 84}
61 85
62asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 86asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
72 96
73asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) 97asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
74{ 98{
75 return sys_setresgid(low2highgid(rgid), low2highgid(egid), 99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
76 low2highgid(sgid)); 100 low2highgid(sgid));
101 /* avoid REGPARM breakage on x86: */
102 prevent_tail_call(ret);
103 return ret;
77} 104}
78 105
79asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
89 116
90asmlinkage long sys_setfsuid16(old_uid_t uid) 117asmlinkage long sys_setfsuid16(old_uid_t uid)
91{ 118{
92 return sys_setfsuid(low2highuid(uid)); 119 long ret = sys_setfsuid(low2highuid(uid));
120 /* avoid REGPARM breakage on x86: */
121 prevent_tail_call(ret);
122 return ret;
93} 123}
94 124
95asmlinkage long sys_setfsgid16(old_gid_t gid) 125asmlinkage long sys_setfsgid16(old_gid_t gid)
96{ 126{
97 return sys_setfsgid(low2highgid(gid)); 127 long ret = sys_setfsgid(low2highgid(gid));
128 /* avoid REGPARM breakage on x86: */
129 prevent_tail_call(ret);
130 return ret;
98} 131}
99 132
100static int groups16_to_user(old_gid_t __user *grouplist, 133static int groups16_to_user(old_gid_t __user *grouplist,
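
Every 16-bit wrapper above is rewritten from a tail call into call-then-return with prevent_tail_call(ret) in between. On i386 built with -mregparm=3 the asmlinkage callee takes its arguments from the caller's stack frame, and a tail call would let it scribble over the syscall registers saved there; forcing the wrapper to keep its own frame avoids that. The macro is arch-defined; its rough shape is an empty asm that makes the return value look live after the inner call (this is a hedged reconstruction, not the exact header definition):

/* approximate shape of the i386 definition; other arches make it a no-op */
#define prevent_tail_call(ret)  __asm__ ("" : "=r" (ret) : "0" (ret))
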
diff --git a/kernel/user.c b/kernel/user.c
index d9deae43a9..2116642f42 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -105,15 +105,19 @@ void free_uid(struct user_struct *up)
105{ 105{
106 unsigned long flags; 106 unsigned long flags;
107 107
108 if (!up)
109 return;
110
108 local_irq_save(flags); 111 local_irq_save(flags);
109 if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { 112 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
110 uid_hash_remove(up); 113 uid_hash_remove(up);
114 spin_unlock_irqrestore(&uidhash_lock, flags);
111 key_put(up->uid_keyring); 115 key_put(up->uid_keyring);
112 key_put(up->session_keyring); 116 key_put(up->session_keyring);
113 kmem_cache_free(uid_cachep, up); 117 kmem_cache_free(uid_cachep, up);
114 spin_unlock(&uidhash_lock); 118 } else {
119 local_irq_restore(flags);
115 } 120 }
116 local_irq_restore(flags);
117} 121}
118 122
119struct user_struct * alloc_uid(uid_t uid) 123struct user_struct * alloc_uid(uid_t uid)
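
free_uid() now bails out early on NULL and uses the atomic_dec_and_lock() idiom, so the hash lock is taken only when the reference count really hits zero and the key/slab frees happen after the lock is dropped (interrupt state is handled by hand here because free_uid() may be called with interrupts disabled). A generic, self-contained sketch of that idiom with hypothetical types:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct my_obj {
        atomic_t count;
        struct list_head node;
};

static DEFINE_SPINLOCK(my_lock);

static void my_put(struct my_obj *obj)
{
        if (!obj)
                return;

        /* takes my_lock only if the count actually dropped to zero */
        if (atomic_dec_and_lock(&obj->count, &my_lock)) {
                list_del(&obj->node);           /* unhash while still locked */
                spin_unlock(&my_lock);
                kfree(obj);                     /* free outside the lock */
        }
}
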
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c7..880fb415a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
27#include <linux/cpu.h> 27#include <linux/cpu.h>
28#include <linux/notifier.h> 28#include <linux/notifier.h>
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h>
30 31
31/* 32/*
32 * The per-CPU workqueue (if single thread, we always use the first 33 * The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
476} 477}
477EXPORT_SYMBOL(cancel_rearming_delayed_work); 478EXPORT_SYMBOL(cancel_rearming_delayed_work);
478 479
480/**
481 * execute_in_process_context - reliably execute the routine with user context
482 * @fn: the function to execute
483 * @data: data to pass to the function
484 * @ew: guaranteed storage for the execute work structure (must
485 * be available when the work executes)
486 *
487 * Executes the function immediately if process context is available,
488 * otherwise schedules the function for delayed execution.
489 *
490 * Returns: 0 - function was executed
491 * 1 - function was scheduled for execution
492 */
493int execute_in_process_context(void (*fn)(void *data), void *data,
494 struct execute_work *ew)
495{
496 if (!in_interrupt()) {
497 fn(data);
498 return 0;
499 }
500
501 INIT_WORK(&ew->work, fn, data);
502 schedule_work(&ew->work);
503
504 return 1;
505}
506EXPORT_SYMBOL_GPL(execute_in_process_context);
507
479int keventd_up(void) 508int keventd_up(void)
480{ 509{
481 return keventd_wq != NULL; 510 return keventd_wq != NULL;
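
execute_in_process_context() gives callers a one-liner for "run this now if I may sleep, otherwise punt it to keventd". A hedged usage sketch; the device structure and its teardown are hypothetical, only the helper's signature comes from the hunk above:

#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_dev {
        struct execute_work ew;         /* must stay valid until the work runs */
        /* ... */
};

static void my_dev_release(void *data)
{
        struct my_dev *dev = data;

        /* final teardown that may sleep would go here */
        kfree(dev);
}

static void my_dev_put_final(struct my_dev *dev)
{
        /* runs the release immediately in process context,
         * or schedules it through dev->ew from interrupt context */
        execute_in_process_context(my_dev_release, dev, &dev->ew);
}

Note the storage requirement spelled out in the kerneldoc: since the work may run later, the execute_work must outlive the call, which is why it is embedded in the object being released here.
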
@@ -518,7 +547,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
518} 547}
519 548
520/* We're holding the cpucontrol mutex here */ 549/* We're holding the cpucontrol mutex here */
521static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 550static int workqueue_cpu_callback(struct notifier_block *nfb,
522 unsigned long action, 551 unsigned long action,
523 void *hcpu) 552 void *hcpu)
524{ 553{