path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile           |    3
-rw-r--r--  kernel/acct.c             |  117
-rw-r--r--  kernel/audit.c            |  205
-rw-r--r--  kernel/audit.h            |   61
-rw-r--r--  kernel/auditfilter.c      |  899
-rw-r--r--  kernel/auditsc.c          |  649
-rw-r--r--  kernel/compat.c           |   30
-rw-r--r--  kernel/cpu.c              |   10
-rw-r--r--  kernel/cpuset.c           |   42
-rw-r--r--  kernel/exit.c             |   29
-rw-r--r--  kernel/fork.c             |   21
-rw-r--r--  kernel/futex.c            |    8
-rw-r--r--  kernel/hrtimer.c          |   19
-rw-r--r--  kernel/intermodule.c      |  184
-rw-r--r--  kernel/irq/handle.c       |    5
-rw-r--r--  kernel/irq/migration.c    |    4
-rw-r--r--  kernel/irq/proc.c         |    3
-rw-r--r--  kernel/irq/spurious.c     |   12
-rw-r--r--  kernel/kexec.c            |    6
-rw-r--r--  kernel/kprobes.c          |   58
-rw-r--r--  kernel/ksysfs.c           |   19
-rw-r--r--  kernel/kthread.c          |   61
-rw-r--r--  kernel/module.c           |   18
-rw-r--r--  kernel/mutex-debug.c      |   12
-rw-r--r--  kernel/mutex-debug.h      |   25
-rw-r--r--  kernel/mutex.c            |   21
-rw-r--r--  kernel/mutex.h            |    6
-rw-r--r--  kernel/posix-cpu-timers.c |   48
-rw-r--r--  kernel/power/Kconfig      |    9
-rw-r--r--  kernel/power/disk.c       |    2
-rw-r--r--  kernel/power/main.c       |    8
-rw-r--r--  kernel/power/power.h      |    2
-rw-r--r--  kernel/power/snapshot.c   |  148
-rw-r--r--  kernel/power/swsusp.c     |   20
-rw-r--r--  kernel/printk.c           |   80
-rw-r--r--  kernel/ptrace.c           |   23
-rw-r--r--  kernel/rcupdate.c         |   13
-rw-r--r--  kernel/sched.c            |   29
-rw-r--r--  kernel/signal.c           |   37
-rw-r--r--  kernel/softirq.c          |    2
-rw-r--r--  kernel/softlockup.c       |    4
-rw-r--r--  kernel/stop_machine.c     |   17
-rw-r--r--  kernel/sys.c              |   80
-rw-r--r--  kernel/sys_ni.c           |    2
-rw-r--r--  kernel/sysctl.c           |   37
-rw-r--r--  kernel/time.c             |    2
-rw-r--r--  kernel/time/Makefile      |    1
-rw-r--r--  kernel/time/clocksource.c |  349
-rw-r--r--  kernel/time/jiffies.c     |   73
-rw-r--r--  kernel/timer.c            |  428
-rw-r--r--  kernel/unwind.c           |  918
-rw-r--r--  kernel/user.c             |    4
-rw-r--r--  kernel/workqueue.c        |   34
53 files changed, 3997 insertions(+), 900 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..752bd7d383af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o
 
+obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
@@ -20,8 +21,8 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..368c4f03fe0e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(0, old_acct);
+		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
 		spin_lock(&acct_globals.lock);
 	}
@@ -419,16 +419,15 @@ static u32 encode_float(u64 value)
 /*
  * do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
 {
+	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
 	mm_segment_t fs;
-	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
-	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->utime,
-						 current->signal->utime));
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->stime,
-						 current->signal->stime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	ac.ac_flag = 0;
-	if (current->flags & PF_FORKNOEXEC)
-		ac.ac_flag |= AFORK;
-	if (current->flags & PF_SUPERPRIV)
-		ac.ac_flag |= ASU;
-	if (current->flags & PF_DUMPCORE)
-		ac.ac_flag |= ACORE;
-	if (current->flags & PF_SIGNALED)
-		ac.ac_flag |= AXSIG;
-
-	vsize = 0;
-	if (current->mm) {
-		struct vm_area_struct *vma;
-		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
-			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
-		up_read(&current->mm->mmap_sem);
-	}
-	vsize = vsize / 1024;
-	ac.ac_mem = encode_comp_t(vsize);
+	spin_lock(&current->sighand->siglock);
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac.ac_flag = pacct->ac_flag;
+	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac.ac_exitcode = pacct->ac_exitcode;
+	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->min_flt);
-	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
-	ac.ac_exitcode = exitcode;
 
 	/*
 	 * Kernel segment override to datasegment and write it
@@ -546,12 +520,63 @@ static void do_acct_process(long exitcode, struct file *file)
 }
 
 /**
+ * acct_init_pacct - initialize a new pacct_struct
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+	memset(pacct, 0, sizeof(struct pacct_struct));
+	pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+	struct pacct_struct *pacct = &current->signal->pacct;
+	unsigned long vsize = 0;
+
+	if (group_dead && current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (group_dead)
+		pacct->ac_mem = vsize / 1024;
+	if (thread_group_leader(current)) {
+		pacct->ac_exitcode = exitcode;
+		if (current->flags & PF_FORKNOEXEC)
+			pacct->ac_flag |= AFORK;
+	}
+	if (current->flags & PF_SUPERPRIV)
+		pacct->ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		pacct->ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		pacct->ac_flag |= AXSIG;
+	pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+	pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+	pacct->ac_minflt += current->min_flt;
+	pacct->ac_majflt += current->maj_flt;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
  * acct_process - now just a wrapper around do_acct_process
  * @exitcode: task exit code
  *
  * handles process accounting for an exiting task
  */
-void acct_process(long exitcode)
+void acct_process()
 {
 	struct file *file = NULL;
 
@@ -570,7 +595,7 @@ void acct_process(long exitcode)
 	get_file(file);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(exitcode, file);
+	do_acct_process(file);
 	fput(file);
 }
 
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	if (tsk) {
-		tsk->acct_stimexpd = 0;
-		tsk->acct_rss_mem1 = 0;
-		tsk->acct_vm_mem1 = 0;
-	}
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
 }
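The acct.c changes above split BSD process accounting in two: acct_collect() folds each exiting thread's times and fault counts into the shared signal->pacct under siglock, and do_acct_process() later writes one record from that aggregate, which is why it no longer needs an exitcode argument. A minimal sketch of the resulting call pattern, assuming a caller shaped like do_exit() (hypothetical function; the real call sites live in kernel/exit.c, outside this diff):

/* Hypothetical caller illustrating the new two-step accounting API. */
static void example_exit_accounting(long code, int group_dead)
{
	/* every exiting thread contributes its utime/stime/fault counts */
	acct_collect(code, group_dead);

	/* one record per process, written once the whole group is dead */
	if (group_dead)
		acct_process();
}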
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..7dfac7031bd7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/selinux.h>
+#include <linux/inotify.h>
 
 #include "audit.h"
 
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
 /* The identity of the user shutting down the audit system. */
 uid_t audit_sig_uid = -1;
 pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
 /* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 			"audit_rate_limit=%d old=%d by auid=%u",
 			limit, old, loginuid);
 	audit_rate_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 			"audit_backlog_limit=%d old=%d by auid=%u",
 			limit, old, loginuid);
 	audit_backlog_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 			"audit_enabled=%d old=%d by auid=%u",
 			state, old, loginuid);
 	audit_enabled = state;
-	return old;
+	return 0;
 }
 
 static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 			"audit_failure=%d old=%d by auid=%u",
 			state, old, loginuid);
 	audit_failure = state;
-	return old;
+	return 0;
 }
 
 static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+}
+
+int audit_send_list(void *_dest)
+{
+	struct audit_netlink_list *dest = _dest;
+	int pid = dest->pid;
+	struct sk_buff *skb;
+
+	/* wait for parent to finish and send an ACK */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	while ((skb = __skb_dequeue(&dest->q)) != NULL)
+		netlink_unicast(audit_sock, skb, pid, 0);
+
+	kfree(dest);
+
 	return 0;
 }
 
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+				 int multi, void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int len = NLMSG_SPACE(size);
+	void *data;
+	int flags = multi ? NLM_F_MULTI : 0;
+	int t = done ? NLMSG_DONE : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	nlh = NLMSG_PUT(skb, pid, seq, t, size);
+	nlh->nlmsg_flags = flags;
+	data = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	return skb;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+	return NULL;
+}
+
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -383,29 +432,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 			 void *payload, int size)
 {
 	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	int len = NLMSG_SPACE(size);
-	void *data;
-	int flags = multi ? NLM_F_MULTI : 0;
-	int t = done ? NLMSG_DONE : type;
-
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-
-	nlh = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data = NLMSG_DATA(nlh);
-	memcpy(data, payload, size);
-
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
-
-nlmsg_failure:			/* Used by NLMSG_PUT */
-	if (skb)
-		kfree_skb(skb);
 }
 
 /*
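audit_send_reply() is now a thin wrapper; audit_make_reply() was split out so replies can also be built up front and queued, then streamed to userspace by a helper thread running audit_send_list(), which deliberately takes and drops audit_cmd_mutex so the ACK for the triggering request goes out first. A hedged sketch of that producer side (example_list_rules is hypothetical; the real consumer is the rule-listing code in auditfilter.c, and error handling is elided):

/* Sketch: queue replies on an audit_netlink_list and hand the queue
 * to a kthread that runs audit_send_list(). */
static int example_list_rules(int pid, int seq)
{
	struct audit_netlink_list *dest;
	struct sk_buff *skb;

	dest = kmalloc(sizeof(*dest), GFP_KERNEL);
	if (!dest)
		return -ENOMEM;
	dest->pid = pid;
	skb_queue_head_init(&dest->q);

	/* normally one skb per rule; here just the terminating message */
	skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
	if (skb)
		skb_queue_tail(&dest->q, skb);

	kthread_run(audit_send_list, dest, "audit_send_list");
	return 0;
}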
@@ -451,7 +484,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
-	struct audit_sig_info	sig_data;
+	struct audit_sig_info	*sig_data;
+	char			*ctx;
+	u32			len;
 
 	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
 	if (err)
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old = audit_pid;
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
-				int rc;
-				if ((rc = selinux_ctxid_to_string(
+				if ((err = selinux_ctxid_to_string(
 						sid, &ctx, &len)))
-					return rc;
+					return err;
 				else
 					audit_log(NULL, GFP_KERNEL,
 						AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_pid = status_get->pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-			audit_set_rate_limit(status_get->rate_limit,
+			err = audit_set_rate_limit(status_get->rate_limit,
 					     loginuid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
-			audit_set_backlog_limit(status_get->backlog_limit,
+			err = audit_set_backlog_limit(status_get->backlog_limit,
 					loginuid, sid);
 		break;
 	case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				 "user pid=%d uid=%u auid=%u",
 				 pid, uid, loginuid);
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
 				if (selinux_ctxid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		sig_data.uid = audit_sig_uid;
-		sig_data.pid = audit_sig_pid;
+		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		if (err)
+			return err;
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		memcpy(sig_data->ctx, ctx, len);
+		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
-				0, 0, &sig_data, sizeof(sig_data));
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
 		break;
 	default:
 		err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff  *skb;
 	unsigned int qlen;
 
-	mutex_lock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+#endif
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
+#ifdef CONFIG_AUDITSYSCALL
+	int i;
+#endif
+
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
 	selinux_audit_set_callback(&selinux_audit_rule_update);
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
 	return 0;
 }
 __initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
 		kfree_skb(ab->skb);
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (++audit_freelist_count > AUDIT_MAXFREE)
+	if (audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
-	else
+	else {
+		audit_freelist_count++;
 		list_add(&ab->list, &audit_freelist);
+	}
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1);		/* new string is twice the old string */
 }
 
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+			       const char *string)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb->tail;
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_unstrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
+ * @len: lenth of string (not including trailing null)
  * @string: string to be logged
 *
 * This code will escape a string that is passed to it if the string
 * contains a control character, unprintable character, double quote mark,
 * or a space. Unescaped strings will start and end with a double quote mark.
 * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
 */
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+					const char *string)
 {
 	const unsigned char *p = string;
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
-			audit_log_hex(ab, string, strlen(string));
-			return;
+			audit_log_hex(ab, string, len);
+			return string + len + 1;
 		}
 		p++;
 	}
-	audit_log_format(ab, "\"%s\"", string);
+	audit_log_n_string(ab, len, string);
+	return p + 1;
+}
+
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	return audit_log_n_untrustedstring(ab, strlen(string), string);
 }
 
 /* This is a helper-function to print the escaped d_path */
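The return value of the new audit_log_n_untrustedstring() points just past the logged string and its NUL terminator, which lets callers walk a buffer of consecutive NUL-terminated strings without re-scanning. A hedged sketch of that idiom (hypothetical helper; the escaping behavior is exactly the one described in the docstring above):

/* Log 'count' consecutive NUL-terminated strings packed in buf. */
static void example_log_strings(struct audit_buffer *ab, const char *buf,
				size_t buflen, int count)
{
	const char *p = buf;
	int i;

	for (i = 0; i < count && p < buf + buflen; i++)
		/* returns a pointer just past this string's terminator */
		p = audit_log_n_untrustedstring(ab, strlen(p), p);
}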
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..8323e4132a33 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
-#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
+#include <linux/skbuff.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+	atomic_t		count;	/* reference count */
+	char			*path;	/* insertion path */
+	dev_t			dev;	/* associated superblock device */
+	unsigned long		ino;	/* associated inode number */
+	struct audit_parent	*parent; /* associated parent */
+	struct list_head	wlist;	/* entry in parent->watches list */
+	struct list_head	rules;	/* associated rules */
+};
+
 struct audit_field {
 	u32			type;
 	u32			val;
@@ -70,6 +82,9 @@ struct audit_krule {
 	u32			buflen;	/* for data alloc on list rules */
 	u32			field_count;
 	struct audit_field	*fields;
+	struct audit_field	*inode_f; /* quick access to an inode field */
+	struct audit_watch	*watch;	/* associated watch */
+	struct list_head	rlist;	/* entry in audit_watch.rules list */
 };
 
 struct audit_entry {
@@ -78,15 +93,53 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
-
 extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 
+#define AUDIT_INODE_BUCKETS	32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+	return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+				    int *dirlen);
+extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
 extern void		    audit_send_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 extern void		    audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
 
+struct audit_netlink_list {
+	int pid;
+	struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+	if (unlikely(audit_pid && t->tgid == audit_pid))
+		__audit_signal_info(sig, t);
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+					    struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
#endif
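audit_hash_ino() works because AUDIT_INODE_BUCKETS is a power of two, so the mask is a cheap modulo; inode-keyed rules land in audit_inode_hash[] buckets instead of one long list. A hedged sketch of the read-side lookup this enables (illustrative only; the real walker is audit_filter_inodes() in auditsc.c, and rule-field matching is elided):

/* Check whether any rule hashes to this inode's bucket, under RCU. */
static int example_inode_has_rules(unsigned long ino)
{
	struct audit_entry *e;
	int h = audit_hash_ino((u32)ino);
	int found = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(e, &audit_inode_hash[h], list) {
		found = 1;	/* a real caller would match e->rule here */
		break;
	}
	rcu_read_unlock();
	return found;
}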
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..4c99d2c586ed 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
 #include <linux/selinux.h>
 #include "audit.h"
 
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * 	Synchronizes writes and blocking reads of audit's filterlist
+ * 	data.  Rcu is used to traverse the filterlist and access
+ * 	contents of structs audit_entry, audit_watch and opaque
+ * 	selinux rules during filtering.  If modified, these structures
+ * 	must be copied and replace their counterparts in the filterlist.
+ * 	An audit_parent struct is not accessed during filtering, so may
+ * 	be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * 	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * 	audit_remove_watch().  Additionally, an audit_watch may exist
+ * 	temporarily to assist in searching existing filter data.  Each
+ * 	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+	struct list_head	ilist;	/* entry in inotify registration list */
+	struct list_head	watches; /* associated watches */
+	struct inotify_watch	wdata;	/* inotify watch data */
+	unsigned		flags;	/* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
 	LIST_HEAD_INIT(audit_filter_list[1]),
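The locking-model comment above is the heart of this rewrite: filtering traverses the lists lock-free under RCU, while writers copy an entry, swap the copy in under audit_filter_mutex, and free the original only after a grace period. In sketch form (hypothetical helpers; audit_update_watch() later in this file is the real instance of the writer pattern):

/* Reader: traverse a filter list without taking audit_filter_mutex. */
static void example_reader(struct list_head *head)
{
	struct audit_entry *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, head, list) {
		/* inspect e->rule; never modify it here */
	}
	rcu_read_unlock();
}

/* Writer: copy-and-replace, then defer the free past all readers. */
static void example_writer(struct list_head *head, struct audit_entry *old,
			   struct audit_entry *new)
{
	mutex_lock(&audit_filter_mutex);
	list_del_rcu(&old->list);
	list_add_rcu(&new->list, head);
	mutex_unlock(&audit_filter_mutex);
	call_rcu(&old->rcu, audit_free_rule_rcu);
}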
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
+
+	/* some rules don't have associated watches */
+	if (e->rule.watch)
+		audit_put_watch(e->rule.watch);
 	if (e->rule.fields)
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
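The helpers above pin down the reference-counting rules stated earlier: a watch is created with one reference (the list's), each krule takes another via audit_get_watch(), and audit_remove_watch() drops the creation reference. As an annotated walkthrough, inferred from the code above for one watch attached to one rule:

/*
 * audit_init_watch(path)      count = 1   creation/list reference
 * audit_get_watch(watch)      count = 2   krule->watch reference
 *    ... watch in service ...
 * audit_put_watch(watch)      count = 1   rule torn down (audit_free_rule)
 * audit_remove_watch(watch)   count = 0   list drops it; kfree()
 *
 * audit_put_watch() WARNs if parent or rules are still live at zero,
 * catching leaked references early.
 */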
@@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
60 audit_free_rule(e); 150 audit_free_rule(e);
61} 151}
62 152
153/* Initialize a parent watch entry. */
154static struct audit_parent *audit_init_parent(struct nameidata *ndp)
155{
156 struct audit_parent *parent;
157 s32 wd;
158
159 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
160 if (unlikely(!parent))
161 return ERR_PTR(-ENOMEM);
162
163 INIT_LIST_HEAD(&parent->watches);
164 parent->flags = 0;
165
166 inotify_init_watch(&parent->wdata);
167 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
168 get_inotify_watch(&parent->wdata);
169 wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
170 AUDIT_IN_WATCH);
171 if (wd < 0) {
172 audit_free_parent(&parent->wdata);
173 return ERR_PTR(wd);
174 }
175
176 return parent;
177}
178
179/* Initialize a watch entry. */
180static struct audit_watch *audit_init_watch(char *path)
181{
182 struct audit_watch *watch;
183
184 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
185 if (unlikely(!watch))
186 return ERR_PTR(-ENOMEM);
187
188 INIT_LIST_HEAD(&watch->rules);
189 atomic_set(&watch->count, 1);
190 watch->path = path;
191 watch->dev = (dev_t)-1;
192 watch->ino = (unsigned long)-1;
193
194 return watch;
195}
196
63/* Initialize an audit filterlist entry. */ 197/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count) 198static inline struct audit_entry *audit_init_entry(u32 field_count)
65{ 199{
@@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
107 return str; 241 return str;
108} 242}
109 243
244/* Translate an inode field to kernel respresentation. */
245static inline int audit_to_inode(struct audit_krule *krule,
246 struct audit_field *f)
247{
248 if (krule->listnr != AUDIT_FILTER_EXIT ||
249 krule->watch || krule->inode_f)
250 return -EINVAL;
251
252 krule->inode_f = f;
253 return 0;
254}
255
256/* Translate a watch string to kernel respresentation. */
257static int audit_to_watch(struct audit_krule *krule, char *path, int len,
258 u32 op)
259{
260 struct audit_watch *watch;
261
262 if (!audit_ih)
263 return -EOPNOTSUPP;
264
265 if (path[0] != '/' || path[len-1] == '/' ||
266 krule->listnr != AUDIT_FILTER_EXIT ||
267 op & ~AUDIT_EQUAL ||
268 krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
269 return -EINVAL;
270
271 watch = audit_init_watch(path);
272 if (unlikely(IS_ERR(watch)))
273 return PTR_ERR(watch);
274
275 audit_get_watch(watch);
276 krule->watch = watch;
277
278 return 0;
279}
280
110/* Common user-space to kernel rule translation. */ 281/* Common user-space to kernel rule translation. */
111static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) 282static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
112{ 283{
@@ -128,8 +299,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
128#endif 299#endif
129 ; 300 ;
130 } 301 }
131 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && 302 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
132 rule->action != AUDIT_ALWAYS) 303 printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
304 goto exit_err;
305 }
306 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
133 goto exit_err; 307 goto exit_err;
134 if (rule->field_count > AUDIT_MAX_FIELDS) 308 if (rule->field_count > AUDIT_MAX_FIELDS)
135 goto exit_err; 309 goto exit_err;
@@ -158,6 +332,7 @@ exit_err:
158static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 332static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
159{ 333{
160 struct audit_entry *entry; 334 struct audit_entry *entry;
335 struct audit_field *f;
161 int err = 0; 336 int err = 0;
162 int i; 337 int i;
163 338
@@ -172,14 +347,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 347 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i]; 348 f->val = rule->values[i];
174 349
175 if (f->type & AUDIT_UNUSED_BITS || 350 err = -EINVAL;
176 f->type == AUDIT_SE_USER || 351 switch(f->type) {
177 f->type == AUDIT_SE_ROLE || 352 default:
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free; 353 goto exit_free;
354 case AUDIT_PID:
355 case AUDIT_UID:
356 case AUDIT_EUID:
357 case AUDIT_SUID:
358 case AUDIT_FSUID:
359 case AUDIT_GID:
360 case AUDIT_EGID:
361 case AUDIT_SGID:
362 case AUDIT_FSGID:
363 case AUDIT_LOGINUID:
364 case AUDIT_PERS:
365 case AUDIT_ARCH:
366 case AUDIT_MSGTYPE:
367 case AUDIT_DEVMAJOR:
368 case AUDIT_DEVMINOR:
369 case AUDIT_EXIT:
370 case AUDIT_SUCCESS:
371 case AUDIT_ARG0:
372 case AUDIT_ARG1:
373 case AUDIT_ARG2:
374 case AUDIT_ARG3:
375 break;
376 case AUDIT_INODE:
377 err = audit_to_inode(&entry->rule, f);
378 if (err)
379 goto exit_free;
380 break;
183 } 381 }
184 382
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 383 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +394,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
196 } 394 }
197 } 395 }
198 396
397 f = entry->rule.inode_f;
398 if (f) {
399 switch(f->op) {
400 case AUDIT_NOT_EQUAL:
401 entry->rule.inode_f = NULL;
402 case AUDIT_EQUAL:
403 break;
404 default:
405 goto exit_free;
406 }
407 }
408
199exit_nofree: 409exit_nofree:
200 return entry; 410 return entry;
201 411
@@ -210,6 +420,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
210{ 420{
211 int err = 0; 421 int err = 0;
212 struct audit_entry *entry; 422 struct audit_entry *entry;
423 struct audit_field *f;
213 void *bufp; 424 void *bufp;
214 size_t remain = datasz - sizeof(struct audit_rule_data); 425 size_t remain = datasz - sizeof(struct audit_rule_data);
215 int i; 426 int i;
@@ -235,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
235 f->se_str = NULL; 446 f->se_str = NULL;
236 f->se_rule = NULL; 447 f->se_rule = NULL;
237 switch(f->type) { 448 switch(f->type) {
449 case AUDIT_PID:
450 case AUDIT_UID:
451 case AUDIT_EUID:
452 case AUDIT_SUID:
453 case AUDIT_FSUID:
454 case AUDIT_GID:
455 case AUDIT_EGID:
456 case AUDIT_SGID:
457 case AUDIT_FSGID:
458 case AUDIT_LOGINUID:
459 case AUDIT_PERS:
460 case AUDIT_ARCH:
461 case AUDIT_MSGTYPE:
462 case AUDIT_PPID:
463 case AUDIT_DEVMAJOR:
464 case AUDIT_DEVMINOR:
465 case AUDIT_EXIT:
466 case AUDIT_SUCCESS:
467 case AUDIT_ARG0:
468 case AUDIT_ARG1:
469 case AUDIT_ARG2:
470 case AUDIT_ARG3:
471 break;
238 case AUDIT_SE_USER: 472 case AUDIT_SE_USER:
239 case AUDIT_SE_ROLE: 473 case AUDIT_SE_ROLE:
240 case AUDIT_SE_TYPE: 474 case AUDIT_SE_TYPE:
@@ -260,6 +494,37 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
260 } else 494 } else
261 f->se_str = str; 495 f->se_str = str;
262 break; 496 break;
497 case AUDIT_WATCH:
498 str = audit_unpack_string(&bufp, &remain, f->val);
499 if (IS_ERR(str))
500 goto exit_free;
501 entry->rule.buflen += f->val;
502
503 err = audit_to_watch(&entry->rule, str, f->val, f->op);
504 if (err) {
505 kfree(str);
506 goto exit_free;
507 }
508 break;
509 case AUDIT_INODE:
510 err = audit_to_inode(&entry->rule, f);
511 if (err)
512 goto exit_free;
513 break;
514 default:
515 goto exit_free;
516 }
517 }
518
519 f = entry->rule.inode_f;
520 if (f) {
521 switch(f->op) {
522 case AUDIT_NOT_EQUAL:
523 entry->rule.inode_f = NULL;
524 case AUDIT_EQUAL:
525 break;
526 default:
527 goto exit_free;
263 } 528 }
264 } 529 }
265 530
@@ -291,7 +556,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
291 556
292 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 557 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
293 if (unlikely(!rule)) 558 if (unlikely(!rule))
294 return ERR_PTR(-ENOMEM); 559 return NULL;
295 memset(rule, 0, sizeof(*rule)); 560 memset(rule, 0, sizeof(*rule));
296 561
297 rule->flags = krule->flags | krule->listnr; 562 rule->flags = krule->flags | krule->listnr;
@@ -322,7 +587,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
322 587
323 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); 588 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
324 if (unlikely(!data)) 589 if (unlikely(!data))
325 return ERR_PTR(-ENOMEM); 590 return NULL;
326 memset(data, 0, sizeof(*data)); 591 memset(data, 0, sizeof(*data));
327 592
328 data->flags = krule->flags | krule->listnr; 593 data->flags = krule->flags | krule->listnr;
@@ -343,6 +608,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
343 data->buflen += data->values[i] = 608 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str); 609 audit_pack_string(&bufp, f->se_str);
345 break; 610 break;
611 case AUDIT_WATCH:
612 data->buflen += data->values[i] =
613 audit_pack_string(&bufp, krule->watch->path);
614 break;
346 default: 615 default:
347 data->values[i] = f->val; 616 data->values[i] = f->val;
348 } 617 }
@@ -378,6 +647,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 647 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1; 648 return 1;
380 break; 649 break;
650 case AUDIT_WATCH:
651 if (strcmp(a->watch->path, b->watch->path))
652 return 1;
653 break;
381 default: 654 default:
382 if (a->fields[i].val != b->fields[i].val) 655 if (a->fields[i].val != b->fields[i].val)
383 return 1; 656 return 1;
@@ -391,6 +664,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
391 return 0; 664 return 0;
392} 665}
393 666
667/* Duplicate the given audit watch. The new watch's rules list is initialized
668 * to an empty list and wlist is undefined. */
669static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
670{
671 char *path;
672 struct audit_watch *new;
673
674 path = kstrdup(old->path, GFP_KERNEL);
675 if (unlikely(!path))
676 return ERR_PTR(-ENOMEM);
677
678 new = audit_init_watch(path);
679 if (unlikely(IS_ERR(new))) {
680 kfree(path);
681 goto out;
682 }
683
684 new->dev = old->dev;
685 new->ino = old->ino;
686 get_inotify_watch(&old->parent->wdata);
687 new->parent = old->parent;
688
689out:
690 return new;
691}
692
394/* Duplicate selinux field information. The se_rule is opaque, so must be 693/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */ 694 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df, 695static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -422,8 +721,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
422/* Duplicate an audit rule. This will be a deep copy with the exception 721/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields 722 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old 723 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */ 724 * rule with the new rule in the filterlist, then free the old rule.
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old) 725 * The rlist element is undefined; list manipulations are handled apart from
726 * the initial copy. */
727static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
728 struct audit_watch *watch)
427{ 729{
428 u32 fcount = old->field_count; 730 u32 fcount = old->field_count;
429 struct audit_entry *entry; 731 struct audit_entry *entry;
@@ -442,6 +744,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 744 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i]; 745 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen; 746 new->buflen = old->buflen;
747 new->inode_f = old->inode_f;
748 new->watch = NULL;
445 new->field_count = old->field_count; 749 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); 750 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447 751
@@ -463,68 +767,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
463 } 767 }
464 } 768 }
465 769
770 if (watch) {
771 audit_get_watch(watch);
772 new->watch = watch;
773 }
774
466 return entry; 775 return entry;
467} 776}
468 777
469/* Add rule to given filterlist if not a duplicate. Protected by 778/* Update inode info in audit rules based on filesystem event. */
470 * audit_netlink_mutex. */ 779static void audit_update_watch(struct audit_parent *parent,
780 const char *dname, dev_t dev,
781 unsigned long ino, unsigned invalidating)
782{
783 struct audit_watch *owatch, *nwatch, *nextw;
784 struct audit_krule *r, *nextr;
785 struct audit_entry *oentry, *nentry;
786 struct audit_buffer *ab;
787
788 mutex_lock(&audit_filter_mutex);
789 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
790 if (audit_compare_dname_path(dname, owatch->path, NULL))
791 continue;
792
793 /* If the update involves invalidating rules, do the inode-based
794 * filtering now, so we don't omit records. */
795 if (invalidating &&
796 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
797 audit_set_auditable(current->audit_context);
798
799 nwatch = audit_dupe_watch(owatch);
800 if (unlikely(IS_ERR(nwatch))) {
801 mutex_unlock(&audit_filter_mutex);
802 audit_panic("error updating watch, skipping");
803 return;
804 }
805 nwatch->dev = dev;
806 nwatch->ino = ino;
807
808 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
809
810 oentry = container_of(r, struct audit_entry, rule);
811 list_del(&oentry->rule.rlist);
812 list_del_rcu(&oentry->list);
813
814 nentry = audit_dupe_rule(&oentry->rule, nwatch);
815 if (unlikely(IS_ERR(nentry)))
816 audit_panic("error updating watch, removing");
817 else {
818 int h = audit_hash_ino((u32)ino);
819 list_add(&nentry->rule.rlist, &nwatch->rules);
820 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
821 }
822
823 call_rcu(&oentry->rcu, audit_free_rule_rcu);
824 }
825
826 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
827 audit_log_format(ab, "audit updated rules specifying watch=");
828 audit_log_untrustedstring(ab, owatch->path);
829 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
830 audit_log_end(ab);
831
832 audit_remove_watch(owatch);
833 goto add_watch_to_parent; /* event applies to a single watch */
834 }
835 mutex_unlock(&audit_filter_mutex);
836 return;
837
838add_watch_to_parent:
839 list_add(&nwatch->wlist, &parent->watches);
840 mutex_unlock(&audit_filter_mutex);
841 return;
842}
843
844/* Remove all watches & rules associated with a parent that is going away. */
845static void audit_remove_parent_watches(struct audit_parent *parent)
846{
847 struct audit_watch *w, *nextw;
848 struct audit_krule *r, *nextr;
849 struct audit_entry *e;
850
851 mutex_lock(&audit_filter_mutex);
852 parent->flags |= AUDIT_PARENT_INVALID;
853 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
854 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
855 e = container_of(r, struct audit_entry, rule);
856 list_del(&r->rlist);
857 list_del_rcu(&e->list);
858 call_rcu(&e->rcu, audit_free_rule_rcu);
859
860 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
861 "audit implicitly removed rule from list=%d\n",
862 AUDIT_FILTER_EXIT);
863 }
864 audit_remove_watch(w);
865 }
866 mutex_unlock(&audit_filter_mutex);
867}
868
869/* Unregister inotify watches for parents on in_list.
870 * Generates an IN_IGNORED event. */
871static void audit_inotify_unregister(struct list_head *in_list)
872{
873 struct audit_parent *p, *n;
874
875 list_for_each_entry_safe(p, n, in_list, ilist) {
876 list_del(&p->ilist);
877 inotify_rm_watch(audit_ih, &p->wdata);
878 /* the put matching the get in audit_do_del_rule() */
879 put_inotify_watch(&p->wdata);
880 }
881}
882
883/* Find an existing audit rule.
884 * Caller must hold audit_filter_mutex to prevent stale rule data. */
885static struct audit_entry *audit_find_rule(struct audit_entry *entry,
886 struct list_head *list)
887{
888 struct audit_entry *e, *found = NULL;
889 int h;
890
891 if (entry->rule.watch) {
892 /* we don't know the inode number, so must walk entire hash */
893 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
894 list = &audit_inode_hash[h];
895 list_for_each_entry(e, list, list)
896 if (!audit_compare_rule(&entry->rule, &e->rule)) {
897 found = e;
898 goto out;
899 }
900 }
901 goto out;
902 }
903
904 list_for_each_entry(e, list, list)
905 if (!audit_compare_rule(&entry->rule, &e->rule)) {
906 found = e;
907 goto out;
908 }
909
910out:
911 return found;
912}
913
914/* Get path information necessary for adding watches. */
915static int audit_get_nd(char *path, struct nameidata **ndp,
916 struct nameidata **ndw)
917{
918 struct nameidata *ndparent, *ndwatch;
919 int err;
920
921 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
922 if (unlikely(!ndparent))
923 return -ENOMEM;
924
925 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
926 if (unlikely(!ndwatch)) {
927 kfree(ndparent);
928 return -ENOMEM;
929 }
930
931 err = path_lookup(path, LOOKUP_PARENT, ndparent);
932 if (err) {
933 kfree(ndparent);
934 kfree(ndwatch);
935 return err;
936 }
937
938 err = path_lookup(path, 0, ndwatch);
939 if (err) {
940 kfree(ndwatch);
941 ndwatch = NULL;
942 }
943
944 *ndp = ndparent;
945 *ndw = ndwatch;
946
947 return 0;
948}
949
950/* Release resources used for watch path information. */
951static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
952{
953 if (ndp) {
954 path_release(ndp);
955 kfree(ndp);
956 }
957 if (ndw) {
958 path_release(ndw);
959 kfree(ndw);
960 }
961}
962
963/* Associate the given rule with an existing parent inotify_watch.
964 * Caller must hold audit_filter_mutex. */
965static void audit_add_to_parent(struct audit_krule *krule,
966 struct audit_parent *parent)
967{
968 struct audit_watch *w, *watch = krule->watch;
969 int watch_found = 0;
970
971 list_for_each_entry(w, &parent->watches, wlist) {
972 if (strcmp(watch->path, w->path))
973 continue;
974
975 watch_found = 1;
976
977 /* put krule's and initial refs to temporary watch */
978 audit_put_watch(watch);
979 audit_put_watch(watch);
980
981 audit_get_watch(w);
982 krule->watch = watch = w;
983 break;
984 }
985
986 if (!watch_found) {
987 get_inotify_watch(&parent->wdata);
988 watch->parent = parent;
989
990 list_add(&watch->wlist, &parent->watches);
991 }
992 list_add(&krule->rlist, &watch->rules);
993}
994
995/* Find a matching watch entry, or add this one.
996 * Caller must hold audit_filter_mutex. */
997static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
998 struct nameidata *ndw)
999{
1000 struct audit_watch *watch = krule->watch;
1001 struct inotify_watch *i_watch;
1002 struct audit_parent *parent;
1003 int ret = 0;
1004
1005 /* update watch filter fields */
1006 if (ndw) {
1007 watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
1008 watch->ino = ndw->dentry->d_inode->i_ino;
1009 }
1010
1011 /* The audit_filter_mutex must not be held during inotify calls because
1012 * we hold it during inotify event callback processing. If an existing
1013 * inotify watch is found, inotify_find_watch() grabs a reference before
1014 * returning.
1015 */
1016 mutex_unlock(&audit_filter_mutex);
1017
1018 if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
1019 parent = audit_init_parent(ndp);
1020 if (IS_ERR(parent)) {
1021 /* caller expects mutex locked */
1022 mutex_lock(&audit_filter_mutex);
1023 return PTR_ERR(parent);
1024 }
1025 } else
1026 parent = container_of(i_watch, struct audit_parent, wdata);
1027
1028 mutex_lock(&audit_filter_mutex);
1029
1030 /* parent was moved before we took audit_filter_mutex */
1031 if (parent->flags & AUDIT_PARENT_INVALID)
1032 ret = -ENOENT;
1033 else
1034 audit_add_to_parent(krule, parent);
1035
1036 /* match get in audit_init_parent or inotify_find_watch */
1037 put_inotify_watch(&parent->wdata);
1038 return ret;
1039}
1040
1041/* Add rule to given filterlist if not a duplicate. */
471static inline int audit_add_rule(struct audit_entry *entry, 1042static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list) 1043 struct list_head *list)
473{ 1044{
474 struct audit_entry *e; 1045 struct audit_entry *e;
1046 struct audit_field *inode_f = entry->rule.inode_f;
1047 struct audit_watch *watch = entry->rule.watch;
1048 struct nameidata *ndp, *ndw;
1049 int h, err, putnd_needed = 0;
1050
1051 if (inode_f) {
1052 h = audit_hash_ino(inode_f->val);
1053 list = &audit_inode_hash[h];
1054 }
475 1055
476 /* Do not use the _rcu iterator here, since this is the only 1056 mutex_lock(&audit_filter_mutex);
477 * addition routine. */ 1057 e = audit_find_rule(entry, list);
478 list_for_each_entry(e, list, list) { 1058 mutex_unlock(&audit_filter_mutex);
479 if (!audit_compare_rule(&entry->rule, &e->rule)) 1059 if (e) {
480 return -EEXIST; 1060 err = -EEXIST;
1061 goto error;
1062 }
1063
1064 /* Avoid calling path_lookup under audit_filter_mutex. */
1065 if (watch) {
1066 err = audit_get_nd(watch->path, &ndp, &ndw);
1067 if (err)
1068 goto error;
1069 putnd_needed = 1;
1070 }
1071
1072 mutex_lock(&audit_filter_mutex);
1073 if (watch) {
1074 /* audit_filter_mutex is dropped and re-taken during this call */
1075 err = audit_add_watch(&entry->rule, ndp, ndw);
1076 if (err) {
1077 mutex_unlock(&audit_filter_mutex);
1078 goto error;
1079 }
1080 h = audit_hash_ino((u32)watch->ino);
1081 list = &audit_inode_hash[h];
481 } 1082 }
482 1083
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1084 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list); 1085 list_add_rcu(&entry->list, list);
1086 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
485 } else { 1087 } else {
486 list_add_tail_rcu(&entry->list, list); 1088 list_add_tail_rcu(&entry->list, list);
487 } 1089 }
1090 mutex_unlock(&audit_filter_mutex);
488 1091
489 return 0; 1092 if (putnd_needed)
1093 audit_put_nd(ndp, ndw);
1094
1095 return 0;
1096
1097error:
1098 if (putnd_needed)
1099 audit_put_nd(ndp, ndw);
1100 if (watch)
1101 audit_put_watch(watch); /* tmp watch, matches initial get */
1102 return err;
490} 1103}
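
Rules carrying an inode filter or a watch are filed in audit_inode_hash buckets rather than the flat filter lists, so exit-time filtering only scans rules that can possibly match a collected inode. audit_hash_ino() itself is defined in audit.h and not shown in this hunk; the sketch below assumes the usual power-of-two mask over AUDIT_INODE_BUCKETS (32 in this tree), which should be treated as an assumption:

#include <stdio.h>

#define INODE_BUCKETS 32 /* assumed value of AUDIT_INODE_BUCKETS */

static unsigned hash_ino(unsigned ino)
{
	/* power-of-two mask, assumed to match audit_hash_ino() */
	return ino & (INODE_BUCKETS - 1);
}

int main(void)
{
	unsigned inos[] = { 2, 33, 4096, 4097 };

	for (unsigned i = 0; i < sizeof(inos) / sizeof(inos[0]); i++)
		printf("ino %u -> bucket %u\n", inos[i], hash_ino(inos[i]));
	return 0;
}
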
491 1104
492/* Remove an existing rule from filterlist. Protected by 1105/* Remove an existing rule from filterlist. */
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry, 1106static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list) 1107 struct list_head *list)
496{ 1108{
497 struct audit_entry *e; 1109 struct audit_entry *e;
1110 struct audit_field *inode_f = entry->rule.inode_f;
1111 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1112 LIST_HEAD(inotify_list);
1113 int h, ret = 0;
1114
1115 if (inode_f) {
1116 h = audit_hash_ino(inode_f->val);
1117 list = &audit_inode_hash[h];
1118 }
498 1119
499 /* Do not use the _rcu iterator here, since this is the only 1120 mutex_lock(&audit_filter_mutex);
500 * deletion routine. */ 1121 e = audit_find_rule(entry, list);
501 list_for_each_entry(e, list, list) { 1122 if (!e) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) { 1123 mutex_unlock(&audit_filter_mutex);
503 list_del_rcu(&e->list); 1124 ret = -ENOENT;
504 call_rcu(&e->rcu, audit_free_rule_rcu); 1125 goto out;
505 return 0; 1126 }
1127
1128 watch = e->rule.watch;
1129 if (watch) {
1130 struct audit_parent *parent = watch->parent;
1131
1132 list_del(&e->rule.rlist);
1133
1134 if (list_empty(&watch->rules)) {
1135 audit_remove_watch(watch);
1136
1137 if (list_empty(&parent->watches)) {
1138 /* Put parent on the inotify un-registration
1139 * list. Grab a reference before releasing
1140 * audit_filter_mutex, to be released in
1141 * audit_inotify_unregister(). */
1142 list_add(&parent->ilist, &inotify_list);
1143 get_inotify_watch(&parent->wdata);
1144 }
506 } 1145 }
507 } 1146 }
508 return -ENOENT; /* No matching rule */ 1147
1148 list_del_rcu(&e->list);
1149 call_rcu(&e->rcu, audit_free_rule_rcu);
1150
1151 mutex_unlock(&audit_filter_mutex);
1152
1153 if (!list_empty(&inotify_list))
1154 audit_inotify_unregister(&inotify_list);
1155
1156out:
1157 if (tmp_watch)
1158 audit_put_watch(tmp_watch); /* match initial get */
1159
1160 return ret;
509} 1161}
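
Parents that lose their last watch are only collected onto a stack-local list while audit_filter_mutex is held; audit_inotify_unregister() runs after the unlock because unregistration can block. The collect-then-process idiom in plain C with pthreads:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *registered;

static void del_all(void)
{
	struct node *defer = NULL;

	pthread_mutex_lock(&lock);
	while (registered) {               /* unhook under the lock... */
		struct node *n = registered;
		registered = n->next;
		n->next = defer;
		defer = n;
	}
	pthread_mutex_unlock(&lock);

	while (defer) {                    /* ...but do the slow teardown outside it */
		struct node *n = defer;
		defer = n->next;
		printf("unregistering %d\n", n->id);
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = registered;
		registered = n;
	}
	del_all();
	return 0;
}
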
510 1162
511/* List rules using struct audit_rule. Exists for backward 1163/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */ 1164 * compatibility with userspace. */
513static int audit_list(void *_dest) 1165static void audit_list(int pid, int seq, struct sk_buff_head *q)
514{ 1166{
515 int pid, seq; 1167 struct sk_buff *skb;
516 int *dest = _dest;
517 struct audit_entry *entry; 1168 struct audit_entry *entry;
518 int i; 1169 int i;
519 1170
520 pid = dest[0]; 1171 /* This is a blocking read, so use audit_filter_mutex instead of an rcu
521 seq = dest[1]; 1172 * iterator to sync with list writers. */
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1173 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) { 1174 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule; 1175 struct audit_rule *rule;
@@ -532,33 +1177,41 @@ static int audit_list(void *_dest)
532 rule = audit_krule_to_rule(&entry->rule); 1177 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule)) 1178 if (unlikely(!rule))
534 break; 1179 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, 1180 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule)); 1181 rule, sizeof(*rule));
1182 if (skb)
1183 skb_queue_tail(q, skb);
537 kfree(rule); 1184 kfree(rule);
538 } 1185 }
539 } 1186 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); 1187 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
541 1188 list_for_each_entry(entry, &audit_inode_hash[i], list) {
542 mutex_unlock(&audit_netlink_mutex); 1189 struct audit_rule *rule;
543 return 0; 1190
1191 rule = audit_krule_to_rule(&entry->rule);
1192 if (unlikely(!rule))
1193 break;
1194 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1195 rule, sizeof(*rule));
1196 if (skb)
1197 skb_queue_tail(q, skb);
1198 kfree(rule);
1199 }
1200 }
1201 skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
1202 if (skb)
1203 skb_queue_tail(q, skb);
544} 1204}
545 1205
546/* List rules using struct audit_rule_data. */ 1206/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest) 1207static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
548{ 1208{
549 int pid, seq; 1209 struct sk_buff *skb;
550 int *dest = _dest;
551 struct audit_entry *e; 1210 struct audit_entry *e;
552 int i; 1211 int i;
553 1212
554 pid = dest[0]; 1213 /* This is a blocking read, so use audit_filter_mutex instead of an rcu
555 seq = dest[1]; 1214 * iterator to sync with list writers. */
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1215 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) { 1216 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data; 1217 struct audit_rule_data *data;
@@ -566,15 +1219,30 @@ static int audit_list_rules(void *_dest)
566 data = audit_krule_to_data(&e->rule); 1219 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data)) 1220 if (unlikely(!data))
568 break; 1221 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1222 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data)); 1223 data, sizeof(*data) + data->buflen);
1224 if (skb)
1225 skb_queue_tail(q, skb);
571 kfree(data); 1226 kfree(data);
572 } 1227 }
573 } 1228 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 1229 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
1230 list_for_each_entry(e, &audit_inode_hash[i], list) {
1231 struct audit_rule_data *data;
575 1232
576 mutex_unlock(&audit_netlink_mutex); 1233 data = audit_krule_to_data(&e->rule);
577 return 0; 1234 if (unlikely(!data))
1235 break;
1236 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1237 data, sizeof(*data) + data->buflen);
1238 if (skb)
1239 skb_queue_tail(q, skb);
1240 kfree(data);
1241 }
1242 }
1243 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
1244 if (skb)
1245 skb_queue_tail(q, skb);
578} 1246}
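
Both listing paths now assemble the complete reply queue under audit_filter_mutex and hand it off to a separate sender (kthread_run(audit_send_list, ...) in the caller), instead of letting a kthread re-take locks while transmitting. A userspace analogue of queue-then-hand-off; the message type and names are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct msg { struct msg *next; int seq; };

static void *send_list(void *arg)
{
	struct msg *q = arg;

	while (q) {                       /* drain with no locks held */
		struct msg *m = q;
		q = m->next;
		printf("sending reply seq=%d\n", m->seq);
		free(m);
	}
	return NULL;
}

int main(void)
{
	struct msg *head = NULL, **tail = &head;
	pthread_t tid;

	/* in the kernel this assembly happens under audit_filter_mutex */
	for (int i = 0; i < 3; i++) {
		struct msg *m = malloc(sizeof(*m));
		m->seq = i;
		m->next = NULL;
		*tail = m;
		tail = &m->next;
	}

	pthread_create(&tid, NULL, send_list, head); /* cf. kthread_run(audit_send_list, ...) */
	pthread_join(tid, NULL);
	return 0;
}
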
579 1247
580/** 1248/**
@@ -592,7 +1260,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid) 1260 size_t datasz, uid_t loginuid, u32 sid)
593{ 1261{
594 struct task_struct *tsk; 1262 struct task_struct *tsk;
595 int *dest; 1263 struct audit_netlink_list *dest;
596 int err = 0; 1264 int err = 0;
597 struct audit_entry *entry; 1265 struct audit_entry *entry;
598 1266
@@ -605,18 +1273,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
605 * happen if we're actually running in the context of auditctl 1273 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */ 1274 * trying to _send_ the stuff */
607 1275
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL); 1276 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
609 if (!dest) 1277 if (!dest)
610 return -ENOMEM; 1278 return -ENOMEM;
611 dest[0] = pid; 1279 dest->pid = pid;
612 dest[1] = seq; 1280 skb_queue_head_init(&dest->q);
613 1281
1282 mutex_lock(&audit_filter_mutex);
614 if (type == AUDIT_LIST) 1283 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list"); 1284 audit_list(pid, seq, &dest->q);
616 else 1285 else
617 tsk = kthread_run(audit_list_rules, dest, 1286 audit_list_rules(pid, seq, &dest->q);
618 "audit_list_rules"); 1287 mutex_unlock(&audit_filter_mutex);
1288
1289 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
619 if (IS_ERR(tsk)) { 1290 if (IS_ERR(tsk)) {
1291 skb_queue_purge(&dest->q);
620 kfree(dest); 1292 kfree(dest);
621 err = PTR_ERR(tsk); 1293 err = PTR_ERR(tsk);
622 } 1294 }
@@ -632,6 +1304,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
632 1304
633 err = audit_add_rule(entry, 1305 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]); 1306 &audit_filter_list[entry->rule.listnr]);
1307
635 if (sid) { 1308 if (sid) {
636 char *ctx = NULL; 1309 char *ctx = NULL;
637 u32 len; 1310 u32 len;
@@ -712,7 +1385,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
712 return 0; 1385 return 0;
713} 1386}
714 1387
1388/* Compare given dentry name with last component in given path,
1389 * a return of 0 indicates a match. */
1390int audit_compare_dname_path(const char *dname, const char *path,
1391 int *dirlen)
1392{
1393 int dlen, plen;
1394 const char *p;
715 1395
1396 if (!dname || !path)
1397 return 1;
1398
1399 dlen = strlen(dname);
1400 plen = strlen(path);
1401 if (plen < dlen)
1402 return 1;
1403
1404 /* disregard trailing slashes */
1405 p = path + plen - 1;
1406 while ((*p == '/') && (p > path))
1407 p--;
1408
1409 /* find last path component */
1410 p = p - dlen + 1;
1411 if (p < path)
1412 return 1;
1413 else if (p > path) {
1414 if (*--p != '/')
1415 return 1;
1416 else
1417 p++;
1418 }
1419
1420 /* return length of path's directory component */
1421 if (dirlen)
1422 *dirlen = p - path;
1423 return strncmp(p, dname, dlen);
1424}
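
The helper above disregards trailing slashes, locates the last path component, and reports the directory prefix length through *dirlen. Below is a direct userspace port with a few test vectors; a return of 0 means the component matches dname:

#include <stdio.h>
#include <string.h>

static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
	int dlen, plen;
	const char *p;

	if (!dname || !path)
		return 1;

	dlen = strlen(dname);
	plen = strlen(path);
	if (plen < dlen)
		return 1;

	/* disregard trailing slashes */
	p = path + plen - 1;
	while ((*p == '/') && (p > path))
		p--;

	/* find last path component */
	p = p - dlen + 1;
	if (p < path)
		return 1;
	else if (p > path) {
		if (*--p != '/')
			return 1;
		else
			p++;
	}

	/* report length of path's directory component */
	if (dirlen)
		*dirlen = p - path;
	return strncmp(p, dname, dlen);
}

int main(void)
{
	int dirlen = 0;

	printf("%d\n", compare_dname_path("passwd", "/etc/passwd", &dirlen));   /* 0 */
	printf("dirlen=%d\n", dirlen);                                          /* 5 */
	printf("%d\n", compare_dname_path("passwd", "/etc/passwd//", &dirlen)); /* 0 */
	printf("%d\n", compare_dname_path("sswd", "/etc/passwd", &dirlen));     /* nonzero */
	return 0;
}
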
716 1425
717static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1426static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule, 1427 struct audit_krule *rule,
@@ -744,7 +1453,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
744 } 1453 }
745 switch (rule->action) { 1454 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 1455 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 1456 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 } 1457 }
750 return 1; 1458 return 1;
@@ -826,32 +1534,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
826int selinux_audit_rule_update(void) 1534int selinux_audit_rule_update(void)
827{ 1535{
828 struct audit_entry *entry, *n, *nentry; 1536 struct audit_entry *entry, *n, *nentry;
1537 struct audit_watch *watch;
829 int i, err = 0; 1538 int i, err = 0;
830 1539
831 /* audit_netlink_mutex synchronizes the writers */ 1540 /* audit_filter_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex); 1541 mutex_lock(&audit_filter_mutex);
833 1542
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1543 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1544 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule)) 1545 if (!audit_rule_has_selinux(&entry->rule))
837 continue; 1546 continue;
838 1547
839 nentry = audit_dupe_rule(&entry->rule); 1548 watch = entry->rule.watch;
1549 nentry = audit_dupe_rule(&entry->rule, watch);
840 if (unlikely(IS_ERR(nentry))) { 1550 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the 1551 /* save the first error encountered for the
842 * return value */ 1552 * return value */
843 if (!err) 1553 if (!err)
844 err = PTR_ERR(nentry); 1554 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters"); 1555 audit_panic("error updating selinux filters");
1556 if (watch)
1557 list_del(&entry->rule.rlist);
846 list_del_rcu(&entry->list); 1558 list_del_rcu(&entry->list);
847 } else { 1559 } else {
1560 if (watch) {
1561 list_add(&nentry->rule.rlist,
1562 &watch->rules);
1563 list_del(&entry->rule.rlist);
1564 }
848 list_replace_rcu(&entry->list, &nentry->list); 1565 list_replace_rcu(&entry->list, &nentry->list);
849 } 1566 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu); 1567 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 } 1568 }
852 } 1569 }
853 1570
854 mutex_unlock(&audit_netlink_mutex); 1571 mutex_unlock(&audit_filter_mutex);
855 1572
856 return err; 1573 return err;
857} 1574}
1575
1576/* Update watch data in audit rules based on inotify events. */
1577void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1578 u32 cookie, const char *dname, struct inode *inode)
1579{
1580 struct audit_parent *parent;
1581
1582 parent = container_of(i_watch, struct audit_parent, wdata);
1583
1584 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1585 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1586 inode->i_ino, 0);
1587 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1588 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1589 /* inotify automatically removes the watch and sends IN_IGNORED */
1590 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1591 audit_remove_parent_watches(parent);
1592 /* inotify does not remove the watch, so remove it manually */
1593 else if (mask & IN_MOVE_SELF) {
1594 audit_remove_parent_watches(parent);
1595 inotify_remove_watch_locked(audit_ih, i_watch);
1596 } else if (mask & IN_IGNORED)
1597 put_inotify_watch(i_watch);
1598}
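
The dispatch above mirrors what a userspace inotify consumer sees: IN_CREATE/IN_MOVED_TO mean a child appeared under the parent, IN_DELETE/IN_MOVED_FROM that one vanished, IN_DELETE_SELF/IN_UNMOUNT that the parent itself is gone (inotify drops the watch and queues IN_IGNORED), and IN_MOVE_SELF that the watch must be removed by hand. A runnable demonstration using the same masks; run it against /tmp or a directory given on the command line, then create or delete a file there:

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(int argc, char **argv)
{
	const char *dir = argc > 1 ? argv[1] : "/tmp";
	char buf[4096];
	int fd = inotify_init();

	if (fd < 0 || inotify_add_watch(fd, dir,
			IN_CREATE | IN_MOVED_TO | IN_DELETE | IN_MOVED_FROM |
			IN_DELETE_SELF | IN_UNMOUNT | IN_MOVE_SELF) < 0) {
		perror("inotify");
		return 1;
	}

	ssize_t len = read(fd, buf, sizeof(buf));   /* blocks until an event fires */
	if (len <= 0)
		return 1;

	for (char *p = buf; p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		if (ev->mask & (IN_CREATE | IN_MOVED_TO))
			printf("child appeared: %s\n", ev->len ? ev->name : "?");
		else if (ev->mask & (IN_DELETE | IN_MOVED_FROM))
			printf("child vanished: %s\n", ev->len ? ev->name : "?");
		else if (ev->mask & (IN_DELETE_SELF | IN_UNMOUNT | IN_MOVE_SELF))
			printf("parent itself went away or moved\n");

		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}
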
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..9ebd96fda295 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation 6 * Copyright (C) 2005, 2006 IBM Corporation
7 * All Rights Reserved. 7 * All Rights Reserved.
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
29 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
30 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
31 * 31 *
32 * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
33 * 2006.
34 *
32 * The support of additional filter rules compares (>, <, >=, <=) was 35 * The support of additional filter rules compares (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. 36 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 * 37 *
@@ -49,6 +52,7 @@
49#include <linux/module.h> 52#include <linux/module.h>
50#include <linux/mount.h> 53#include <linux/mount.h>
51#include <linux/socket.h> 54#include <linux/socket.h>
55#include <linux/mqueue.h>
52#include <linux/audit.h> 56#include <linux/audit.h>
53#include <linux/personality.h> 57#include <linux/personality.h>
54#include <linux/time.h> 58#include <linux/time.h>
@@ -59,6 +63,8 @@
59#include <linux/list.h> 63#include <linux/list.h>
60#include <linux/tty.h> 64#include <linux/tty.h>
61#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h>
67#include <linux/syscalls.h>
62 68
63#include "audit.h" 69#include "audit.h"
64 70
@@ -76,6 +82,9 @@ extern int audit_enabled;
76 * path_lookup. */ 82 * path_lookup. */
77#define AUDIT_NAMES_RESERVED 7 83#define AUDIT_NAMES_RESERVED 7
78 84
85/* Indicates that audit should log the full pathname. */
86#define AUDIT_NAME_FULL -1
87
79/* When fs/namei.c:getname() is called, we store the pointer in name and 88/* When fs/namei.c:getname() is called, we store the pointer in name and
80 * we don't let putname() free it (instead we free all of the saved 89 * we don't let putname() free it (instead we free all of the saved
81 * pointers at syscall exit time). 90 * pointers at syscall exit time).
@@ -83,8 +92,9 @@ extern int audit_enabled;
83 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 92 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
84struct audit_names { 93struct audit_names {
85 const char *name; 94 const char *name;
95 int name_len; /* number of name's characters to log */
96 unsigned name_put; /* call __putname() for this name */
86 unsigned long ino; 97 unsigned long ino;
87 unsigned long pino;
88 dev_t dev; 98 dev_t dev;
89 umode_t mode; 99 umode_t mode;
90 uid_t uid; 100 uid_t uid;
@@ -100,6 +110,33 @@ struct audit_aux_data {
100 110
101#define AUDIT_AUX_IPCPERM 0 111#define AUDIT_AUX_IPCPERM 0
102 112
113struct audit_aux_data_mq_open {
114 struct audit_aux_data d;
115 int oflag;
116 mode_t mode;
117 struct mq_attr attr;
118};
119
120struct audit_aux_data_mq_sendrecv {
121 struct audit_aux_data d;
122 mqd_t mqdes;
123 size_t msg_len;
124 unsigned int msg_prio;
125 struct timespec abs_timeout;
126};
127
128struct audit_aux_data_mq_notify {
129 struct audit_aux_data d;
130 mqd_t mqdes;
131 struct sigevent notification;
132};
133
134struct audit_aux_data_mq_getsetattr {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 struct mq_attr mqstat;
138};
139
103struct audit_aux_data_ipcctl { 140struct audit_aux_data_ipcctl {
104 struct audit_aux_data d; 141 struct audit_aux_data d;
105 struct ipc_perm p; 142 struct ipc_perm p;
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl {
110 u32 osid; 147 u32 osid;
111}; 148};
112 149
150struct audit_aux_data_execve {
151 struct audit_aux_data d;
152 int argc;
153 int envc;
154 char mem[0];
155};
156
113struct audit_aux_data_socketcall { 157struct audit_aux_data_socketcall {
114 struct audit_aux_data d; 158 struct audit_aux_data d;
115 int nargs; 159 int nargs;
@@ -148,7 +192,7 @@ struct audit_context {
148 struct audit_aux_data *aux; 192 struct audit_aux_data *aux;
149 193
150 /* Save things to print about task_struct */ 194 /* Save things to print about task_struct */
151 pid_t pid; 195 pid_t pid, ppid;
152 uid_t uid, euid, suid, fsuid; 196 uid_t uid, euid, suid, fsuid;
153 gid_t gid, egid, sgid, fsgid; 197 gid_t gid, egid, sgid, fsgid;
154 unsigned long personality; 198 unsigned long personality;
@@ -160,12 +204,13 @@ struct audit_context {
160#endif 204#endif
161}; 205};
162 206
163 207/* Determine if any context name data matches a rule's watch data */
164/* Compare a task_struct with an audit_rule. Return 1 on match, 0 208/* Compare a task_struct with an audit_rule. Return 1 on match, 0
165 * otherwise. */ 209 * otherwise. */
166static int audit_filter_rules(struct task_struct *tsk, 210static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_krule *rule, 211 struct audit_krule *rule,
168 struct audit_context *ctx, 212 struct audit_context *ctx,
213 struct audit_names *name,
169 enum audit_state *state) 214 enum audit_state *state)
170{ 215{
171 int i, j, need_sid = 1; 216 int i, j, need_sid = 1;
@@ -179,6 +224,10 @@ static int audit_filter_rules(struct task_struct *tsk,
179 case AUDIT_PID: 224 case AUDIT_PID:
180 result = audit_comparator(tsk->pid, f->op, f->val); 225 result = audit_comparator(tsk->pid, f->op, f->val);
181 break; 226 break;
227 case AUDIT_PPID:
228 if (ctx)
229 result = audit_comparator(ctx->ppid, f->op, f->val);
230 break;
182 case AUDIT_UID: 231 case AUDIT_UID:
183 result = audit_comparator(tsk->uid, f->op, f->val); 232 result = audit_comparator(tsk->uid, f->op, f->val);
184 break; 233 break;
@@ -224,7 +273,10 @@ static int audit_filter_rules(struct task_struct *tsk,
224 } 273 }
225 break; 274 break;
226 case AUDIT_DEVMAJOR: 275 case AUDIT_DEVMAJOR:
227 if (ctx) { 276 if (name)
277 result = audit_comparator(MAJOR(name->dev),
278 f->op, f->val);
279 else if (ctx) {
228 for (j = 0; j < ctx->name_count; j++) { 280 for (j = 0; j < ctx->name_count; j++) {
229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 281 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
230 ++result; 282 ++result;
@@ -234,7 +286,10 @@ static int audit_filter_rules(struct task_struct *tsk,
234 } 286 }
235 break; 287 break;
236 case AUDIT_DEVMINOR: 288 case AUDIT_DEVMINOR:
237 if (ctx) { 289 if (name)
290 result = audit_comparator(MINOR(name->dev),
291 f->op, f->val);
292 else if (ctx) {
238 for (j = 0; j < ctx->name_count; j++) { 293 for (j = 0; j < ctx->name_count; j++) {
239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 294 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
240 ++result; 295 ++result;
@@ -244,16 +299,22 @@ static int audit_filter_rules(struct task_struct *tsk,
244 } 299 }
245 break; 300 break;
246 case AUDIT_INODE: 301 case AUDIT_INODE:
247 if (ctx) { 302 if (name)
303 result = (name->ino == f->val);
304 else if (ctx) {
248 for (j = 0; j < ctx->name_count; j++) { 305 for (j = 0; j < ctx->name_count; j++) {
249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) || 306 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
251 ++result; 307 ++result;
252 break; 308 break;
253 } 309 }
254 } 310 }
255 } 311 }
256 break; 312 break;
313 case AUDIT_WATCH:
314 if (name && rule->watch->ino != (unsigned long)-1)
315 result = (name->dev == rule->watch->dev &&
316 name->ino == rule->watch->ino);
317 break;
257 case AUDIT_LOGINUID: 318 case AUDIT_LOGINUID:
258 result = 0; 319 result = 0;
259 if (ctx) 320 if (ctx)
@@ -294,7 +355,6 @@ static int audit_filter_rules(struct task_struct *tsk,
294 } 355 }
295 switch (rule->action) { 356 switch (rule->action) {
296 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 357 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
297 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
298 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 358 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
299 } 359 }
300 return 1; 360 return 1;
@@ -311,7 +371,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
311 371
312 rcu_read_lock(); 372 rcu_read_lock();
313 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 373 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
314 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { 374 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
315 rcu_read_unlock(); 375 rcu_read_unlock();
316 return state; 376 return state;
317 } 377 }
@@ -341,8 +401,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
341 int bit = AUDIT_BIT(ctx->major); 401 int bit = AUDIT_BIT(ctx->major);
342 402
343 list_for_each_entry_rcu(e, list, list) { 403 list_for_each_entry_rcu(e, list, list) {
344 if ((e->rule.mask[word] & bit) == bit 404 if ((e->rule.mask[word] & bit) == bit &&
345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 405 audit_filter_rules(tsk, &e->rule, ctx, NULL,
406 &state)) {
407 rcu_read_unlock();
408 return state;
409 }
410 }
411 }
412 rcu_read_unlock();
413 return AUDIT_BUILD_CONTEXT;
414}
415
416/* At syscall exit time, this filter is called if any audit_names[] have been
417 * collected during syscall processing. We only check rules in sublists at hash
418 * buckets applicable to the inode numbers in audit_names[].
419 * Regarding audit_state, the same rules apply as for audit_filter_syscall().
420 */
421enum audit_state audit_filter_inodes(struct task_struct *tsk,
422 struct audit_context *ctx)
423{
424 int i;
425 struct audit_entry *e;
426 enum audit_state state;
427
428 if (audit_pid && tsk->tgid == audit_pid)
429 return AUDIT_DISABLED;
430
431 rcu_read_lock();
432 for (i = 0; i < ctx->name_count; i++) {
433 int word = AUDIT_WORD(ctx->major);
434 int bit = AUDIT_BIT(ctx->major);
435 struct audit_names *n = &ctx->names[i];
436 int h = audit_hash_ino((u32)n->ino);
437 struct list_head *list = &audit_inode_hash[h];
438
439 if (list_empty(list))
440 continue;
441
442 list_for_each_entry_rcu(e, list, list) {
443 if ((e->rule.mask[word] & bit) == bit &&
444 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
346 rcu_read_unlock(); 445 rcu_read_unlock();
347 return state; 446 return state;
348 } 447 }
@@ -352,6 +451,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
352 return AUDIT_BUILD_CONTEXT; 451 return AUDIT_BUILD_CONTEXT;
353} 452}
354 453
454void audit_set_auditable(struct audit_context *ctx)
455{
456 ctx->auditable = 1;
457}
458
355static inline struct audit_context *audit_get_context(struct task_struct *tsk, 459static inline struct audit_context *audit_get_context(struct task_struct *tsk,
356 int return_valid, 460 int return_valid,
357 int return_code) 461 int return_code)
@@ -365,12 +469,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
365 469
366 if (context->in_syscall && !context->auditable) { 470 if (context->in_syscall && !context->auditable) {
367 enum audit_state state; 471 enum audit_state state;
472
368 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); 473 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
474 if (state == AUDIT_RECORD_CONTEXT) {
475 context->auditable = 1;
476 goto get_context;
477 }
478
479 state = audit_filter_inodes(tsk, context);
369 if (state == AUDIT_RECORD_CONTEXT) 480 if (state == AUDIT_RECORD_CONTEXT)
370 context->auditable = 1; 481 context->auditable = 1;
482
371 } 483 }
372 484
485get_context:
373 context->pid = tsk->pid; 486 context->pid = tsk->pid;
487 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
374 context->uid = tsk->uid; 488 context->uid = tsk->uid;
375 context->gid = tsk->gid; 489 context->gid = tsk->gid;
376 context->euid = tsk->euid; 490 context->euid = tsk->euid;
@@ -413,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context)
413#endif 527#endif
414 528
415 for (i = 0; i < context->name_count; i++) { 529 for (i = 0; i < context->name_count; i++) {
416 if (context->names[i].name) 530 if (context->names[i].name && context->names[i].name_put)
417 __putname(context->names[i].name); 531 __putname(context->names[i].name);
418 } 532 }
419 context->name_count = 0; 533 context->name_count = 0;
@@ -606,7 +720,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
606 tty = "(none)"; 720 tty = "(none)";
607 audit_log_format(ab, 721 audit_log_format(ab,
608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 722 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
609 " pid=%d auid=%u uid=%u gid=%u" 723 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
610 " euid=%u suid=%u fsuid=%u" 724 " euid=%u suid=%u fsuid=%u"
611 " egid=%u sgid=%u fsgid=%u tty=%s", 725 " egid=%u sgid=%u fsgid=%u tty=%s",
612 context->argv[0], 726 context->argv[0],
@@ -614,6 +728,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
614 context->argv[2], 728 context->argv[2],
615 context->argv[3], 729 context->argv[3],
616 context->name_count, 730 context->name_count,
731 context->ppid,
617 context->pid, 732 context->pid,
618 context->loginuid, 733 context->loginuid,
619 context->uid, 734 context->uid,
@@ -630,11 +745,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
630 continue; /* audit_panic has been called */ 745 continue; /* audit_panic has been called */
631 746
632 switch (aux->type) { 747 switch (aux->type) {
748 case AUDIT_MQ_OPEN: {
749 struct audit_aux_data_mq_open *axi = (void *)aux;
750 audit_log_format(ab,
751 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
752 "mq_msgsize=%ld mq_curmsgs=%ld",
753 axi->oflag, axi->mode, axi->attr.mq_flags,
754 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
755 axi->attr.mq_curmsgs);
756 break; }
757
758 case AUDIT_MQ_SENDRECV: {
759 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
760 audit_log_format(ab,
761 "mqdes=%d msg_len=%zd msg_prio=%u "
762 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
763 axi->mqdes, axi->msg_len, axi->msg_prio,
764 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
765 break; }
766
767 case AUDIT_MQ_NOTIFY: {
768 struct audit_aux_data_mq_notify *axi = (void *)aux;
769 audit_log_format(ab,
770 "mqdes=%d sigev_signo=%d",
771 axi->mqdes,
772 axi->notification.sigev_signo);
773 break; }
774
775 case AUDIT_MQ_GETSETATTR: {
776 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
777 audit_log_format(ab,
778 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
779 "mq_curmsgs=%ld ",
780 axi->mqdes,
781 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
782 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
783 break; }
784
633 case AUDIT_IPC: { 785 case AUDIT_IPC: {
634 struct audit_aux_data_ipcctl *axi = (void *)aux; 786 struct audit_aux_data_ipcctl *axi = (void *)aux;
635 audit_log_format(ab, 787 audit_log_format(ab,
636 " qbytes=%lx iuid=%u igid=%u mode=%x", 788 "ouid=%u ogid=%u mode=%x",
637 axi->qbytes, axi->uid, axi->gid, axi->mode); 789 axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) { 790 if (axi->osid != 0) {
639 char *ctx = NULL; 791 char *ctx = NULL;
640 u32 len; 792 u32 len;
@@ -652,19 +804,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
652 case AUDIT_IPC_SET_PERM: { 804 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux; 805 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab, 806 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", 807 "qbytes=%lx ouid=%u ogid=%u mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode); 808 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) { 809 break; }
658 char *ctx = NULL; 810
659 u32 len; 811 case AUDIT_EXECVE: {
660 if (selinux_ctxid_to_string( 812 struct audit_aux_data_execve *axi = (void *)aux;
661 axi->osid, &ctx, &len)) { 813 int i;
662 audit_log_format(ab, " osid=%u", 814 const char *p;
663 axi->osid); 815 for (i = 0, p = axi->mem; i < axi->argc; i++) {
664 call_panic = 1; 816 audit_log_format(ab, "a%d=", i);
665 } else 817 p = audit_log_untrustedstring(ab, p);
666 audit_log_format(ab, " obj=%s", ctx); 818 audit_log_format(ab, "\n");
667 kfree(ctx);
668 } 819 }
669 break; } 820 break; }
670 821
@@ -700,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
700 } 851 }
701 } 852 }
702 for (i = 0; i < context->name_count; i++) { 853 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino; 854 struct audit_names *n = &context->names[i];
704 unsigned long pino = context->names[i].pino;
705 855
706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 856 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
707 if (!ab) 857 if (!ab)
@@ -709,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
709 859
710 audit_log_format(ab, "item=%d", i); 860 audit_log_format(ab, "item=%d", i);
711 861
712 audit_log_format(ab, " name="); 862 if (n->name) {
713 if (context->names[i].name) 863 switch(n->name_len) {
714 audit_log_untrustedstring(ab, context->names[i].name); 864 case AUDIT_NAME_FULL:
715 else 865 /* log the full path */
716 audit_log_format(ab, "(null)"); 866 audit_log_format(ab, " name=");
717 867 audit_log_untrustedstring(ab, n->name);
718 if (pino != (unsigned long)-1) 868 break;
719 audit_log_format(ab, " parent=%lu", pino); 869 case 0:
720 if (ino != (unsigned long)-1) 870 /* name was specified as a relative path and the
721 audit_log_format(ab, " inode=%lu", ino); 871 * directory component is the cwd */
722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) 872 audit_log_d_path(ab, " name=", context->pwd,
723 audit_log_format(ab, " dev=%02x:%02x mode=%#o" 873 context->pwdmnt);
724 " ouid=%u ogid=%u rdev=%02x:%02x", 874 break;
725 MAJOR(context->names[i].dev), 875 default:
726 MINOR(context->names[i].dev), 876 /* log the name's directory component */
727 context->names[i].mode, 877 audit_log_format(ab, " name=");
728 context->names[i].uid, 878 audit_log_n_untrustedstring(ab, n->name_len,
729 context->names[i].gid, 879 n->name);
730 MAJOR(context->names[i].rdev), 880 }
731 MINOR(context->names[i].rdev)); 881 } else
732 if (context->names[i].osid != 0) { 882 audit_log_format(ab, " name=(null)");
883
884 if (n->ino != (unsigned long)-1) {
885 audit_log_format(ab, " inode=%lu"
886 " dev=%02x:%02x mode=%#o"
887 " ouid=%u ogid=%u rdev=%02x:%02x",
888 n->ino,
889 MAJOR(n->dev),
890 MINOR(n->dev),
891 n->mode,
892 n->uid,
893 n->gid,
894 MAJOR(n->rdev),
895 MINOR(n->rdev));
896 }
897 if (n->osid != 0) {
733 char *ctx = NULL; 898 char *ctx = NULL;
734 u32 len; 899 u32 len;
735 if (selinux_ctxid_to_string( 900 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) { 901 n->osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u", 902 audit_log_format(ab, " osid=%u", n->osid);
738 context->names[i].osid);
739 call_panic = 2; 903 call_panic = 2;
740 } else 904 } else
741 audit_log_format(ab, " obj=%s", ctx); 905 audit_log_format(ab, " obj=%s", ctx);
@@ -908,11 +1072,11 @@ void audit_syscall_exit(int valid, long return_code)
908 * Add a name to the list of audit names for this context. 1072 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname(). 1073 * Called from fs/namei.c:getname().
910 */ 1074 */
911void audit_getname(const char *name) 1075void __audit_getname(const char *name)
912{ 1076{
913 struct audit_context *context = current->audit_context; 1077 struct audit_context *context = current->audit_context;
914 1078
915 if (!context || IS_ERR(name) || !name) 1079 if (IS_ERR(name) || !name)
916 return; 1080 return;
917 1081
918 if (!context->in_syscall) { 1082 if (!context->in_syscall) {
@@ -925,6 +1089,8 @@ void audit_getname(const char *name)
925 } 1089 }
926 BUG_ON(context->name_count >= AUDIT_NAMES); 1090 BUG_ON(context->name_count >= AUDIT_NAMES);
927 context->names[context->name_count].name = name; 1091 context->names[context->name_count].name = name;
1092 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
1093 context->names[context->name_count].name_put = 1;
928 context->names[context->name_count].ino = (unsigned long)-1; 1094 context->names[context->name_count].ino = (unsigned long)-1;
929 ++context->name_count; 1095 ++context->name_count;
930 if (!context->pwd) { 1096 if (!context->pwd) {
@@ -991,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
991 * audit_inode - store the inode and device from a lookup 1157 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited 1158 * @name: name being audited
993 * @inode: inode being audited 1159 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 * 1160 *
996 * Called from fs/namei.c:path_lookup(). 1161 * Called from fs/namei.c:path_lookup().
997 */ 1162 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags) 1163void __audit_inode(const char *name, const struct inode *inode)
999{ 1164{
1000 int idx; 1165 int idx;
1001 struct audit_context *context = current->audit_context; 1166 struct audit_context *context = current->audit_context;
@@ -1021,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1021 ++context->ino_count; 1186 ++context->ino_count;
1022#endif 1187#endif
1023 } 1188 }
1189 context->names[idx].ino = inode->i_ino;
1024 context->names[idx].dev = inode->i_sb->s_dev; 1190 context->names[idx].dev = inode->i_sb->s_dev;
1025 context->names[idx].mode = inode->i_mode; 1191 context->names[idx].mode = inode->i_mode;
1026 context->names[idx].uid = inode->i_uid; 1192 context->names[idx].uid = inode->i_uid;
1027 context->names[idx].gid = inode->i_gid; 1193 context->names[idx].gid = inode->i_gid;
1028 context->names[idx].rdev = inode->i_rdev; 1194 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode); 1195 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1038} 1196}
1039 1197
1040/** 1198/**
@@ -1056,51 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
1056{ 1214{
1057 int idx; 1215 int idx;
1058 struct audit_context *context = current->audit_context; 1216 struct audit_context *context = current->audit_context;
1217 const char *found_name = NULL;
1218 int dirlen = 0;
1059 1219
1060 if (!context->in_syscall) 1220 if (!context->in_syscall)
1061 return; 1221 return;
1062 1222
1063 /* determine matching parent */ 1223 /* determine matching parent */
1064 if (dname) 1224 if (!dname)
1065 for (idx = 0; idx < context->name_count; idx++) 1225 goto update_context;
1066 if (context->names[idx].pino == pino) { 1226 for (idx = 0; idx < context->name_count; idx++)
1067 const char *n; 1227 if (context->names[idx].ino == pino) {
1068 const char *name = context->names[idx].name; 1228 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname); 1229
1070 int nlen = name ? strlen(name) : 0; 1230 if (!name)
1071 1231 continue;
1072 if (nlen < dlen) 1232
1073 continue; 1233 if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
1074 1234 context->names[idx].name_len = dirlen;
1075 /* disregard trailing slashes */ 1235 found_name = name;
1076 n = name + nlen - 1; 1236 break;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 } 1237 }
1238 }
1094 1239
1095 /* catch-all in case match not found */ 1240update_context:
1096 idx = context->name_count++; 1241 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG 1242#if AUDIT_DEBUG
1100 context->ino_count++; 1243 context->ino_count++;
1101#endif 1244#endif
1245 /* Re-use the name belonging to the slot for a matching parent directory.
1246 * All names for this context are relinquished in audit_free_names() */
1247 context->names[idx].name = found_name;
1248 context->names[idx].name_len = AUDIT_NAME_FULL;
1249 context->names[idx].name_put = 0; /* don't call __putname() */
1102 1250
1103update_context:
1104 if (inode) { 1251 if (inode) {
1105 context->names[idx].ino = inode->i_ino; 1252 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev; 1253 context->names[idx].dev = inode->i_sb->s_dev;
@@ -1109,7 +1256,8 @@ update_context:
1109 context->names[idx].gid = inode->i_gid; 1256 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev; 1257 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode); 1258 audit_inode_context(idx, inode);
1112 } 1259 } else
1260 context->names[idx].ino = (unsigned long)-1;
1113} 1261}
1114 1262
1115/** 1263/**
@@ -1142,18 +1290,23 @@ void auditsc_get_stamp(struct audit_context *ctx,
1142 */ 1290 */
1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1291int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1144{ 1292{
1145 if (task->audit_context) { 1293 struct audit_context *context = task->audit_context;
1146 struct audit_buffer *ab; 1294
1147 1295 if (context) {
1148 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1296 /* Only log if audit is enabled */
1149 if (ab) { 1297 if (context->in_syscall) {
1150 audit_log_format(ab, "login pid=%d uid=%u " 1298 struct audit_buffer *ab;
1151 "old auid=%u new auid=%u", 1299
1152 task->pid, task->uid, 1300 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1153 task->audit_context->loginuid, loginuid); 1301 if (ab) {
1154 audit_log_end(ab); 1302 audit_log_format(ab, "login pid=%d uid=%u "
1303 "old auid=%u new auid=%u",
1304 task->pid, task->uid,
1305 context->loginuid, loginuid);
1306 audit_log_end(ab);
1307 }
1155 } 1308 }
1156 task->audit_context->loginuid = loginuid; 1309 context->loginuid = loginuid;
1157 } 1310 }
1158 return 0; 1311 return 0;
1159} 1312}
@@ -1170,16 +1323,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1170} 1323}
1171 1324
1172/** 1325/**
1173 * audit_ipc_obj - record audit data for ipc object 1326 * __audit_mq_open - record audit data for a POSIX MQ open
1174 * @ipcp: ipc permissions 1327 * @oflag: open flag
1328 * @mode: mode bits
1329 * @u_attr: queue attributes
1175 * 1330 *
1176 * Returns 0 for success or NULL context or < 0 on error. 1331 * Returns 0 for success or NULL context or < 0 on error.
1177 */ 1332 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp) 1333int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1179{ 1334{
1180 struct audit_aux_data_ipcctl *ax; 1335 struct audit_aux_data_mq_open *ax;
1336 struct audit_context *context = current->audit_context;
1337
1338 if (!audit_enabled)
1339 return 0;
1340
1341 if (likely(!context))
1342 return 0;
1343
1344 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1345 if (!ax)
1346 return -ENOMEM;
1347
1348 if (u_attr != NULL) {
1349 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
1350 kfree(ax);
1351 return -EFAULT;
1352 }
1353 } else
1354 memset(&ax->attr, 0, sizeof(ax->attr));
1355
1356 ax->oflag = oflag;
1357 ax->mode = mode;
1358
1359 ax->d.type = AUDIT_MQ_OPEN;
1360 ax->d.next = context->aux;
1361 context->aux = (void *)ax;
1362 return 0;
1363}
1364
1365/**
1366 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
1367 * @mqdes: MQ descriptor
1368 * @msg_len: Message length
1369 * @msg_prio: Message priority
1370 * @abs_timeout: Message timeout in absolute time
1371 *
1372 * Returns 0 for success or NULL context or < 0 on error.
1373 */
1374int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1375 const struct timespec __user *u_abs_timeout)
1376{
1377 struct audit_aux_data_mq_sendrecv *ax;
1378 struct audit_context *context = current->audit_context;
1379
1380 if (!audit_enabled)
1381 return 0;
1382
1383 if (likely(!context))
1384 return 0;
1385
1386 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1387 if (!ax)
1388 return -ENOMEM;
1389
1390 if (u_abs_timeout != NULL) {
1391 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1392 kfree(ax);
1393 return -EFAULT;
1394 }
1395 } else
1396 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1397
1398 ax->mqdes = mqdes;
1399 ax->msg_len = msg_len;
1400 ax->msg_prio = msg_prio;
1401
1402 ax->d.type = AUDIT_MQ_SENDRECV;
1403 ax->d.next = context->aux;
1404 context->aux = (void *)ax;
1405 return 0;
1406}
1407
1408/**
1409 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1410 * @mqdes: MQ descriptor
1411 * @msg_len: Message length
1412 * @msg_prio: Message priority
1413 * @abs_timeout: Message timeout in absolute time
1414 *
1415 * Returns 0 for success or NULL context or < 0 on error.
1416 */
1417int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
1418 unsigned int __user *u_msg_prio,
1419 const struct timespec __user *u_abs_timeout)
1420{
1421 struct audit_aux_data_mq_sendrecv *ax;
1422 struct audit_context *context = current->audit_context;
1423
1424 if (!audit_enabled)
1425 return 0;
1426
1427 if (likely(!context))
1428 return 0;
1429
1430 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1431 if (!ax)
1432 return -ENOMEM;
1433
1434 if (u_msg_prio != NULL) {
1435 if (get_user(ax->msg_prio, u_msg_prio)) {
1436 kfree(ax);
1437 return -EFAULT;
1438 }
1439 } else
1440 ax->msg_prio = 0;
1441
1442 if (u_abs_timeout != NULL) {
1443 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1444 kfree(ax);
1445 return -EFAULT;
1446 }
1447 } else
1448 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1449
1450 ax->mqdes = mqdes;
1451 ax->msg_len = msg_len;
1452
1453 ax->d.type = AUDIT_MQ_SENDRECV;
1454 ax->d.next = context->aux;
1455 context->aux = (void *)ax;
1456 return 0;
1457}
1458
1459/**
1460 * __audit_mq_notify - record audit data for a POSIX MQ notify
1461 * @mqdes: MQ descriptor
1462 * @u_notification: Notification event
1463 *
1464 * Returns 0 for success or NULL context or < 0 on error.
1465 */
1466
1467int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
1468{
1469 struct audit_aux_data_mq_notify *ax;
1470 struct audit_context *context = current->audit_context;
1471
1472 if (!audit_enabled)
1473 return 0;
1474
1475 if (likely(!context))
1476 return 0;
1477
1478 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1479 if (!ax)
1480 return -ENOMEM;
1481
1482 if (u_notification != NULL) {
1483 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
1484 kfree(ax);
1485 return -EFAULT;
1486 }
1487 } else
1488 memset(&ax->notification, 0, sizeof(ax->notification));
1489
1490 ax->mqdes = mqdes;
1491
1492 ax->d.type = AUDIT_MQ_NOTIFY;
1493 ax->d.next = context->aux;
1494 context->aux = (void *)ax;
1495 return 0;
1496}
1497
1498/**
1499 * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
1500 * @mqdes: MQ descriptor
1501 * @mqstat: MQ flags
1502 *
1503 * Returns 0 for success or NULL context or < 0 on error.
1504 */
1505int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
1506{
1507 struct audit_aux_data_mq_getsetattr *ax;
1181 struct audit_context *context = current->audit_context; 1508 struct audit_context *context = current->audit_context;
1182 1509
1510 if (!audit_enabled)
1511 return 0;
1512
1183 if (likely(!context)) 1513 if (likely(!context))
1184 return 0; 1514 return 0;
1185 1515
@@ -1187,6 +1517,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1187 if (!ax) 1517 if (!ax)
1188 return -ENOMEM; 1518 return -ENOMEM;
1189 1519
1520 ax->mqdes = mqdes;
1521 ax->mqstat = *mqstat;
1522
1523 ax->d.type = AUDIT_MQ_GETSETATTR;
1524 ax->d.next = context->aux;
1525 context->aux = (void *)ax;
1526 return 0;
1527}
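
These hooks fire from the POSIX message queue syscalls, so every field logged above can be exercised from userspace. A short program driving mq_timedsend/mq_timedreceive/mq_getattr (link with -lrt; the queue name is arbitrary):

#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 4, .mq_msgsize = 64 };
	struct timespec to;
	char buf[64];
	unsigned prio;

	mqd_t q = mq_open("/audit_demo", O_CREAT | O_RDWR, 0600, &attr);
	if (q == (mqd_t)-1) { perror("mq_open"); return 1; }

	clock_gettime(CLOCK_REALTIME, &to);
	to.tv_sec += 5;                               /* absolute timeout, as audited */

	mq_timedsend(q, "hello", 5, 7, &to);          /* -> AUDIT_MQ_SENDRECV record */
	mq_timedreceive(q, buf, sizeof(buf), &prio, &to);
	mq_getattr(q, &attr);                         /* -> AUDIT_MQ_GETSETATTR record */
	printf("prio=%u curmsgs=%ld\n", prio, attr.mq_curmsgs);

	mq_close(q);
	mq_unlink("/audit_demo");
	return 0;
}
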
1528
1529/**
1530 * audit_ipc_obj - record audit data for ipc object
1531 * @ipcp: ipc permissions
1532 *
1533 * Returns 0 for success or NULL context or < 0 on error.
1534 */
1535int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1536{
1537 struct audit_aux_data_ipcctl *ax;
1538 struct audit_context *context = current->audit_context;
1539
1540 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1541 if (!ax)
1542 return -ENOMEM;
1543
1190 ax->uid = ipcp->uid; 1544 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid; 1545 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode; 1546 ax->mode = ipcp->mode;
@@ -1204,17 +1558,15 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1204 * @uid: msgq user id 1558 * @uid: msgq user id
1205 * @gid: msgq group id 1559 * @gid: msgq group id
1206 * @mode: msgq mode (permissions) 1560 * @mode: msgq mode (permissions)
1561 * @ipcp: in-kernel IPC permissions
1207 * 1562 *
1208 * Returns 0 for success or NULL context or < 0 on error. 1563 * Returns 0 for success or NULL context or < 0 on error.
1209 */ 1564 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1565int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1211{ 1566{
1212 struct audit_aux_data_ipcctl *ax; 1567 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context; 1568 struct audit_context *context = current->audit_context;
1214 1569
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 1570 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1219 if (!ax) 1571 if (!ax)
1220 return -ENOMEM; 1572 return -ENOMEM;
@@ -1223,7 +1575,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1223 ax->uid = uid; 1575 ax->uid = uid;
1224 ax->gid = gid; 1576 ax->gid = gid;
1225 ax->mode = mode; 1577 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1227 1578
1228 ax->d.type = AUDIT_IPC_SET_PERM; 1579 ax->d.type = AUDIT_IPC_SET_PERM;
1229 ax->d.next = context->aux; 1580 ax->d.next = context->aux;
@@ -1231,6 +1582,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1231 return 0; 1582 return 0;
1232} 1583}
1233 1584
1585int audit_bprm(struct linux_binprm *bprm)
1586{
1587 struct audit_aux_data_execve *ax;
1588 struct audit_context *context = current->audit_context;
1589 unsigned long p, next;
1590 void *to;
1591
1592 if (likely(!audit_enabled || !context))
1593 return 0;
1594
1595 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
1596 GFP_KERNEL);
1597 if (!ax)
1598 return -ENOMEM;
1599
1600 ax->argc = bprm->argc;
1601 ax->envc = bprm->envc;
1602 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
1603 struct page *page = bprm->page[p / PAGE_SIZE];
1604 void *kaddr = kmap(page);
1605 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1606 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1607 to += next - p;
1608 kunmap(page);
1609 }
1610
1611 ax->d.type = AUDIT_EXECVE;
1612 ax->d.next = context->aux;
1613 context->aux = (void *)ax;
1614 return 0;
1615}
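
audit_bprm() snapshots the argument block that execve copied into the bprm pages, from bprm->p up to the top of MAX_ARG_PAGES, so the AUDIT_EXECVE record can later log each a<N>= value. The same NUL-packed argv layout is visible from userspace in /proc/self/cmdline:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/cmdline", "r");
	char buf[4096];
	size_t n;
	int argn = 0;

	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf), f);

	/* arguments are packed back to back, each terminated by '\0' */
	for (size_t i = 0; i < n; argn++) {
		printf("a%d=%s\n", argn, &buf[i]);
		while (i < n && buf[i] != '\0')
			i++;
		i++; /* skip the NUL */
	}
	fclose(f);
	return 0;
}
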
1616
1617
1234/** 1618/**
1235 * audit_socketcall - record audit data for sys_socketcall 1619 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args 1620 * @nargs: number of args
@@ -1325,19 +1709,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1325 * If the audit subsystem is being terminated, record the task (pid) 1709 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that. 1710 * and uid that is doing that.
1327 */ 1711 */
1328void audit_signal_info(int sig, struct task_struct *t) 1712void __audit_signal_info(int sig, struct task_struct *t)
1329{ 1713{
1330 extern pid_t audit_sig_pid; 1714 extern pid_t audit_sig_pid;
1331 extern uid_t audit_sig_uid; 1715 extern uid_t audit_sig_uid;
1332 1716 extern u32 audit_sig_sid;
1333 if (unlikely(audit_pid && t->tgid == audit_pid)) { 1717
1334 if (sig == SIGTERM || sig == SIGHUP) { 1718 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
1335 struct audit_context *ctx = current->audit_context; 1719 struct task_struct *tsk = current;
1336 audit_sig_pid = current->pid; 1720 struct audit_context *ctx = tsk->audit_context;
1337 if (ctx) 1721 audit_sig_pid = tsk->pid;
1338 audit_sig_uid = ctx->loginuid; 1722 if (ctx)
1339 else 1723 audit_sig_uid = ctx->loginuid;
1340 audit_sig_uid = current->uid; 1724 else
1341 } 1725 audit_sig_uid = tsk->uid;
1726 selinux_get_task_sid(tsk, &audit_sig_sid);
1342 } 1727 }
1343} 1728}
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..126dee9530aa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -729,17 +730,10 @@ void
729sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 730sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
730{ 731{
731 switch (_NSIG_WORDS) { 732 switch (_NSIG_WORDS) {
732#if defined (__COMPAT_ENDIAN_SWAP__)
733 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
734 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
735 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
736 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
737#else
738 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 733 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
739 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); 734 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
740 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); 735 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
741 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 736 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
742#endif
743 } 737 }
744} 738}
745 739
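
With the __COMPAT_ENDIAN_SWAP__ branch gone, every 64-bit signal word is composed the same way: compat word 2k supplies the low half and word 2k+1 the high half. A worked example (bit 8 is signal 9, bit 32 is signal 33):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t compat_sig[2] = { 0x00000100, 0x00000001 }; /* bit 8 low, bit 32 high */
	uint64_t word = compat_sig[0] | ((uint64_t)compat_sig[1] << 32);

	printf("sig[0] = %#018llx\n", (unsigned long long)word);
	return 0;
}
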
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
934 928
935 return ret; 929 return ret;
936} 930}
931
932#ifdef CONFIG_NUMA
933asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
934 compat_uptr_t __user *pages32,
935 const int __user *nodes,
936 int __user *status,
937 int flags)
938{
939 const void __user * __user *pages;
940 int i;
941
942 pages = compat_alloc_user_space(nr_pages * sizeof(void *));
943 for (i = 0; i < nr_pages; i++) {
944 compat_uptr_t p;
945
946 if (get_user(p, pages32 + i) ||
947 put_user(compat_ptr(p), pages + i))
948 return -EFAULT;
949 }
950 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
951}
952#endif
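
compat_sys_move_pages() widens each 32-bit user pointer into a native-width array on compat user space (compat_alloc_user_space) before chaining to sys_move_pages(), with compat_ptr() doing the per-entry conversion. The widening step itself, as a plain C illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pages32[3] = { 0x1000, 0x2000, 0x3000 }; /* 32-bit user addresses */
	void *pages[3];

	for (int i = 0; i < 3; i++)
		pages[i] = (void *)(uintptr_t)pages32[i];  /* cf. compat_ptr(p) */

	for (int i = 0; i < 3; i++)
		printf("pages[%d] = %p\n", i, pages[i]);
	return 0;
}
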
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..03dcd981846a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,10 +13,10 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpucontrol);
20 20
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 21static BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible)
30 30
31 if (lock_cpu_hotplug_owner != current) { 31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible) 32 if (interruptible)
33 ret = down_interruptible(&cpucontrol); 33 ret = mutex_lock_interruptible(&cpucontrol);
34 else 34 else
35 down(&cpucontrol); 35 mutex_lock(&cpucontrol);
36 } 36 }
37 37
38 /* 38 /*
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void)
56{ 56{
57 if (--lock_cpu_hotplug_depth == 0) { 57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL; 58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol); 59 mutex_unlock(&cpucontrol);
60 } 60 }
61} 61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
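
The cpucontrol conversion from a semaphore to a mutex keeps the surrounding owner/depth bookkeeping (only partly shown in these hunks), which lets the owning task re-enter lock_cpu_hotplug() without deadlocking; with a bare mutex that logic has to live outside the lock primitive. A pthread sketch of the idiom, unlocked owner reads and all, mirroring the kernel's own pattern:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpucontrol = PTHREAD_MUTEX_INITIALIZER;
static pthread_t owner;
static int depth;

static void lock_hotplug(void)
{
	/* only take the mutex if we are not already the owner */
	if (!depth || !pthread_equal(owner, pthread_self()))
		pthread_mutex_lock(&cpucontrol);
	owner = pthread_self();
	depth++;
}

static void unlock_hotplug(void)
{
	if (--depth == 0)
		pthread_mutex_unlock(&cpucontrol);
}

int main(void)
{
	lock_hotplug();
	lock_hotplug();    /* recursive re-entry by the owner is tolerated */
	unlock_hotplug();
	unlock_hotplug();
	printf("depth=%d\n", depth);
	return 0;
}
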
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..1535af3a912d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -41,6 +41,7 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/security.h>
44#include <linux/slab.h> 45#include <linux/slab.h>
45#include <linux/smp_lock.h> 46#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 47#include <linux/spinlock.h>
@@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
392 return 0; 393 return 0;
393} 394}
394 395
395static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, 396static int cpuset_get_sb(struct file_system_type *fs_type,
396 int flags, const char *unused_dev_name, 397 int flags, const char *unused_dev_name,
397 void *data) 398 void *data, struct vfsmount *mnt)
398{ 399{
399 return get_sb_single(fs_type, flags, data, cpuset_fill_super); 400 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
400} 401}
401 402
402static struct file_system_type cpuset_fs_type = { 403static struct file_system_type cpuset_fs_type = {
@@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1177 cpumask_t cpus; 1178 cpumask_t cpus;
1178 nodemask_t from, to; 1179 nodemask_t from, to;
1179 struct mm_struct *mm; 1180 struct mm_struct *mm;
1181 int retval;
1180 1182
1181 if (sscanf(pidbuf, "%d", &pid) != 1) 1183 if (sscanf(pidbuf, "%d", &pid) != 1)
1182 return -EIO; 1184 return -EIO;
@@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1205 get_task_struct(tsk); 1207 get_task_struct(tsk);
1206 } 1208 }
1207 1209
1210 retval = security_task_setscheduler(tsk, 0, NULL);
1211 if (retval) {
1212 put_task_struct(tsk);
1213 return retval;
1214 }
1215
1208 mutex_lock(&callback_mutex); 1216 mutex_lock(&callback_mutex);
1209 1217
1210 task_lock(tsk); 1218 task_lock(tsk);
@@ -2434,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void)
2434 */ 2442 */
2435static int proc_cpuset_show(struct seq_file *m, void *v) 2443static int proc_cpuset_show(struct seq_file *m, void *v)
2436{ 2444{
2445 struct pid *pid;
2437 struct task_struct *tsk; 2446 struct task_struct *tsk;
2438 char *buf; 2447 char *buf;
2439 int retval = 0; 2448 int retval;
2440 2449
2450 retval = -ENOMEM;
2441 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2451 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2442 if (!buf) 2452 if (!buf)
2443 return -ENOMEM; 2453 goto out;
2444 2454
2445 tsk = m->private; 2455 retval = -ESRCH;
2456 pid = m->private;
2457 tsk = get_pid_task(pid, PIDTYPE_PID);
2458 if (!tsk)
2459 goto out_free;
2460
2461 retval = -EINVAL;
2446 mutex_lock(&manage_mutex); 2462 mutex_lock(&manage_mutex);
2463
2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2464 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2448 if (retval < 0) 2465 if (retval < 0)
2449 goto out; 2466 goto out_unlock;
2450 seq_puts(m, buf); 2467 seq_puts(m, buf);
2451 seq_putc(m, '\n'); 2468 seq_putc(m, '\n');
2452out: 2469out_unlock:
2453 mutex_unlock(&manage_mutex); 2470 mutex_unlock(&manage_mutex);
2471 put_task_struct(tsk);
2472out_free:
2454 kfree(buf); 2473 kfree(buf);
2474out:
2455 return retval; 2475 return retval;
2456} 2476}
2457 2477
2458static int cpuset_open(struct inode *inode, struct file *file) 2478static int cpuset_open(struct inode *inode, struct file *file)
2459{ 2479{
2460 struct task_struct *tsk = PROC_I(inode)->task; 2480 struct pid *pid = PROC_I(inode)->pid;
2461 return single_open(file, proc_cpuset_show, tsk); 2481 return single_open(file, proc_cpuset_show, pid);
2462} 2482}
2463 2483
2464struct file_operations proc_cpuset_operations = { 2484struct file_operations proc_cpuset_operations = {
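The proc_cpuset_show() rework swaps the cached task_struct for a struct pid: the pid is resolved to a task only while the file is actually being read, and the temporary task reference is dropped on the way out. A sketch of that pattern for a hypothetical single_open()-based /proc file:

static int example_show(struct seq_file *m, void *v)
{
	struct pid *pid = m->private;
	struct task_struct *tsk;

	tsk = get_pid_task(pid, PIDTYPE_PID);	/* fails if the task exited */
	if (!tsk)
		return -ESRCH;

	seq_printf(m, "%d\n", tsk->pid);
	put_task_struct(tsk);			/* drop the temporary reference */
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, PROC_I(inode)->pid);
}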
diff --git a/kernel/exit.c b/kernel/exit.c
index e95b93282210..304ef637be6c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,6 +36,7 @@
36#include <linux/compat.h> 36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h> 37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */ 38#include <linux/audit.h> /* for audit_free() */
39#include <linux/resource.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -45,8 +46,6 @@
45extern void sem_exit (void); 46extern void sem_exit (void);
46extern struct task_struct *child_reaper; 47extern struct task_struct *child_reaper;
47 48
48int getrusage(struct task_struct *, int, struct rusage __user *);
49
50static void exit_mm(struct task_struct * tsk); 49static void exit_mm(struct task_struct * tsk);
51 50
52static void __unhash_process(struct task_struct *p) 51static void __unhash_process(struct task_struct *p)
@@ -138,12 +137,8 @@ void release_task(struct task_struct * p)
138{ 137{
139 int zap_leader; 138 int zap_leader;
140 task_t *leader; 139 task_t *leader;
141 struct dentry *proc_dentry;
142
143repeat: 140repeat:
144 atomic_dec(&p->user->processes); 141 atomic_dec(&p->user->processes);
145 spin_lock(&p->proc_lock);
146 proc_dentry = proc_pid_unhash(p);
147 write_lock_irq(&tasklist_lock); 142 write_lock_irq(&tasklist_lock);
148 ptrace_unlink(p); 143 ptrace_unlink(p);
149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 144 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +167,7 @@ repeat:
172 167
173 sched_exit(p); 168 sched_exit(p);
174 write_unlock_irq(&tasklist_lock); 169 write_unlock_irq(&tasklist_lock);
175 spin_unlock(&p->proc_lock); 170 proc_flush_task(p);
176 proc_pid_flush(proc_dentry);
177 release_thread(p); 171 release_thread(p);
178 call_rcu(&p->rcu, delayed_put_task_struct); 172 call_rcu(&p->rcu, delayed_put_task_struct);
179 173
@@ -579,7 +573,7 @@ static void exit_mm(struct task_struct * tsk)
579 down_read(&mm->mmap_sem); 573 down_read(&mm->mmap_sem);
580 } 574 }
581 atomic_inc(&mm->mm_count); 575 atomic_inc(&mm->mm_count);
582 if (mm != tsk->active_mm) BUG(); 576 BUG_ON(mm != tsk->active_mm);
583 /* more a memory barrier than a real lock */ 577 /* more a memory barrier than a real lock */
584 task_lock(tsk); 578 task_lock(tsk);
585 tsk->mm = NULL; 579 tsk->mm = NULL;
@@ -881,14 +875,6 @@ fastcall NORET_TYPE void do_exit(long code)
881 875
882 tsk->flags |= PF_EXITING; 876 tsk->flags |= PF_EXITING;
883 877
884 /*
885 * Make sure we don't try to process any timer firings
886 * while we are already exiting.
887 */
888 tsk->it_virt_expires = cputime_zero;
889 tsk->it_prof_expires = cputime_zero;
890 tsk->it_sched_expires = 0;
891
892 if (unlikely(in_atomic())) 878 if (unlikely(in_atomic()))
893 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 879 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
894 current->comm, current->pid, 880 current->comm, current->pid,
@@ -903,11 +889,11 @@ fastcall NORET_TYPE void do_exit(long code)
903 if (group_dead) { 889 if (group_dead) {
904 hrtimer_cancel(&tsk->signal->real_timer); 890 hrtimer_cancel(&tsk->signal->real_timer);
905 exit_itimers(tsk->signal); 891 exit_itimers(tsk->signal);
906 acct_process(code);
907 } 892 }
893 acct_collect(code, group_dead);
908 if (unlikely(tsk->robust_list)) 894 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk); 895 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT 896#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
911 if (unlikely(tsk->compat_robust_list)) 897 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk); 898 compat_exit_robust_list(tsk);
913#endif 899#endif
@@ -915,6 +901,8 @@ fastcall NORET_TYPE void do_exit(long code)
915 audit_free(tsk); 901 audit_free(tsk);
916 exit_mm(tsk); 902 exit_mm(tsk);
917 903
904 if (group_dead)
905 acct_process();
918 exit_sem(tsk); 906 exit_sem(tsk);
919 __exit_files(tsk); 907 __exit_files(tsk);
920 __exit_fs(tsk); 908 __exit_fs(tsk);
@@ -1538,8 +1526,7 @@ check_continued:
1538 if (options & __WNOTHREAD) 1526 if (options & __WNOTHREAD)
1539 break; 1527 break;
1540 tsk = next_thread(tsk); 1528 tsk = next_thread(tsk);
1541 if (tsk->signal != current->signal) 1529 BUG_ON(tsk->signal != current->signal);
1542 BUG();
1543 } while (tsk != current); 1530 } while (tsk != current);
1544 1531
1545 read_unlock(&tasklist_lock); 1532 read_unlock(&tasklist_lock);
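Two of the exit.c hunks replace open-coded 'if (cond) BUG();' with BUG_ON(cond), which reads as an assertion and keeps the invariant on one line:

	/* before */
	if (mm != tsk->active_mm)
		BUG();

	/* after */
	BUG_ON(mm != tsk->active_mm);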
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e3088a..9b4e54ef0225 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm)
368 */ 368 */
369void mmput(struct mm_struct *mm) 369void mmput(struct mm_struct *mm)
370{ 370{
371 might_sleep();
372
371 if (atomic_dec_and_test(&mm->mm_users)) { 373 if (atomic_dec_and_test(&mm->mm_users)) {
372 exit_aio(mm); 374 exit_aio(mm);
373 exit_mmap(mm); 375 exit_mmap(mm);
@@ -623,6 +625,7 @@ out:
623/* 625/*
624 * Allocate a new files structure and copy contents from the 626 * Allocate a new files structure and copy contents from the
625 * passed in files structure. 627 * passed in files structure.
628 * errorp will be valid only when the returned files_struct is NULL.
626 */ 629 */
627static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) 630static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
628{ 631{
@@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
631 int open_files, size, i, expand; 634 int open_files, size, i, expand;
632 struct fdtable *old_fdt, *new_fdt; 635 struct fdtable *old_fdt, *new_fdt;
633 636
637 *errorp = -ENOMEM;
634 newf = alloc_files(); 638 newf = alloc_files();
635 if (!newf) 639 if (!newf)
636 goto out; 640 goto out;
@@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
744 * break this. 748 * break this.
745 */ 749 */
746 tsk->files = NULL; 750 tsk->files = NULL;
747 error = -ENOMEM;
748 newf = dup_fd(oldf, &error); 751 newf = dup_fd(oldf, &error);
749 if (!newf) 752 if (!newf)
750 goto out; 753 goto out;
@@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
871 tsk->it_prof_expires = 874 tsk->it_prof_expires =
872 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 875 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
873 } 876 }
877 acct_init_pacct(&sig->pacct);
874 878
875 return 0; 879 return 0;
876} 880}
@@ -989,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags,
989 if (put_user(p->pid, parent_tidptr)) 993 if (put_user(p->pid, parent_tidptr))
990 goto bad_fork_cleanup; 994 goto bad_fork_cleanup;
991 995
992 p->proc_dentry = NULL;
993
994 INIT_LIST_HEAD(&p->children); 996 INIT_LIST_HEAD(&p->children);
995 INIT_LIST_HEAD(&p->sibling); 997 INIT_LIST_HEAD(&p->sibling);
996 p->vfork_done = NULL; 998 p->vfork_done = NULL;
997 spin_lock_init(&p->alloc_lock); 999 spin_lock_init(&p->alloc_lock);
998 spin_lock_init(&p->proc_lock);
999 1000
1000 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1001 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1001 init_sigpending(&p->pending); 1002 init_sigpending(&p->pending);
@@ -1155,18 +1156,6 @@ static task_t *copy_process(unsigned long clone_flags,
1155 } 1156 }
1156 1157
1157 if (clone_flags & CLONE_THREAD) { 1158 if (clone_flags & CLONE_THREAD) {
1158 /*
1159 * Important: if an exit-all has been started then
1160 * do not create this new thread - the whole thread
1161 * group is supposed to exit anyway.
1162 */
1163 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1164 spin_unlock(&current->sighand->siglock);
1165 write_unlock_irq(&tasklist_lock);
1166 retval = -EAGAIN;
1167 goto bad_fork_cleanup_namespace;
1168 }
1169
1170 p->group_leader = current->group_leader; 1159 p->group_leader = current->group_leader;
1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1160 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1172 1161
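With the dup_fd() change, the callee now initializes *errorp to -ENOMEM itself, matching the new comment that errorp is only meaningful when NULL comes back. The call site in copy_files() then reduces to:

	struct files_struct *newf;
	int error;

	newf = dup_fd(oldf, &error);	/* dup_fd() fills in error on failure */
	if (!newf)
		goto out;		/* 'error' already holds the errno */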
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..e1a380c77a5a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
1056 (unsigned long)uaddr2, val2, val3); 1056 (unsigned long)uaddr2, val2, val3);
1057} 1057}
1058 1058
1059static struct super_block * 1059static int futexfs_get_sb(struct file_system_type *fs_type,
1060futexfs_get_sb(struct file_system_type *fs_type, 1060 int flags, const char *dev_name, void *data,
1061 int flags, const char *dev_name, void *data) 1061 struct vfsmount *mnt)
1062{ 1062{
1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); 1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
1064} 1064}
1065 1065
1066static struct file_system_type futex_fs_type = { 1066static struct file_system_type futex_fs_type = {
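The cpuset and futex hunks both adapt to the reworked get_sb() interface: the callback now returns an int and attaches the superblock to a caller-supplied vfsmount instead of returning a super_block pointer. A minimal sketch (the examplefs name and magic number are hypothetical):

static int examplefs_get_sb(struct file_system_type *fs_type,
			    int flags, const char *dev_name, void *data,
			    struct vfsmount *mnt)
{
	/* the helper sets up the sb, attaches it to *mnt, returns 0 or -errno */
	return get_sb_pseudo(fs_type, "examplefs:", NULL, 0x12345678, mnt);
}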
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01fa2ae98a85..55601b3ce60e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
98 98
99/** 99/**
100 * ktime_get_ts - get the monotonic clock in timespec format 100 * ktime_get_ts - get the monotonic clock in timespec format
101 *
102 * @ts: pointer to timespec variable 101 * @ts: pointer to timespec variable
103 * 102 *
104 * The function calculates the monotonic clock from the realtime 103 * The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
238# ifndef CONFIG_KTIME_SCALAR 237# ifndef CONFIG_KTIME_SCALAR
239/** 238/**
240 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable 239 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
241 *
242 * @kt: addend 240 * @kt: addend
243 * @nsec: the scalar nsec value to add 241 * @nsec: the scalar nsec value to add
244 * 242 *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
299 297
300/** 298/**
301 * hrtimer_forward - forward the timer expiry 299 * hrtimer_forward - forward the timer expiry
302 *
303 * @timer: hrtimer to forward 300 * @timer: hrtimer to forward
304 * @now: forward past this time 301 * @now: forward past this time
305 * @interval: the interval to forward 302 * @interval: the interval to forward
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
393 if (base->first == &timer->node) 390 if (base->first == &timer->node)
394 base->first = rb_next(&timer->node); 391 base->first = rb_next(&timer->node);
395 rb_erase(&timer->node, &base->active); 392 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE; 393 rb_set_parent(&timer->node, &timer->node);
397} 394}
398 395
399/* 396/*
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
411 408
412/** 409/**
413 * hrtimer_start - (re)start a relative timer on the current CPU 410 * hrtimer_start - (re)start a relative timer on the current CPU
414 *
415 * @timer: the timer to be added 411 * @timer: the timer to be added
416 * @tim: expiry time 412 * @tim: expiry time
417 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 413 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
460 456
461/** 457/**
462 * hrtimer_try_to_cancel - try to deactivate a timer 458 * hrtimer_try_to_cancel - try to deactivate a timer
463 *
464 * @timer: hrtimer to stop 459 * @timer: hrtimer to stop
465 * 460 *
466 * Returns: 461 * Returns:
467 * 0 when the timer was not active 462 * 0 when the timer was not active
468 * 1 when the timer was active 463 * 1 when the timer was active
469 * -1 when the timer is currently executing the callback function and 464 * -1 when the timer is currently executing the callback function and
470 * can not be stopped 465 * cannot be stopped
471 */ 466 */
472int hrtimer_try_to_cancel(struct hrtimer *timer) 467int hrtimer_try_to_cancel(struct hrtimer *timer)
473{ 468{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
489 484
490/** 485/**
491 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 486 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
492 *
493 * @timer: the timer to be cancelled 487 * @timer: the timer to be cancelled
494 * 488 *
495 * Returns: 489 * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
510 504
511/** 505/**
512 * hrtimer_get_remaining - get remaining time for the timer 506 * hrtimer_get_remaining - get remaining time for the timer
513 *
514 * @timer: the timer to read 507 * @timer: the timer to read
515 */ 508 */
516ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 509ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
564 557
565/** 558/**
566 * hrtimer_init - initialize a timer to the given clock 559 * hrtimer_init - initialize a timer to the given clock
567 *
568 * @timer: the timer to be initialized 560 * @timer: the timer to be initialized
569 * @clock_id: the clock to be used 561 * @clock_id: the clock to be used
570 * @mode: timer mode abs/rel 562 * @mode: timer mode abs/rel
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
576 568
577 memset(timer, 0, sizeof(struct hrtimer)); 569 memset(timer, 0, sizeof(struct hrtimer));
578 570
579 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 571 bases = __raw_get_cpu_var(hrtimer_bases);
580 572
581 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
582 clock_id = CLOCK_MONOTONIC; 574 clock_id = CLOCK_MONOTONIC;
583 575
584 timer->base = &bases[clock_id]; 576 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE; 577 rb_set_parent(&timer->node, &timer->node);
586} 578}
587EXPORT_SYMBOL_GPL(hrtimer_init); 579EXPORT_SYMBOL_GPL(hrtimer_init);
588 580
589/** 581/**
590 * hrtimer_get_res - get the timer resolution for a clock 582 * hrtimer_get_res - get the timer resolution for a clock
591 *
592 * @which_clock: which clock to query 583 * @which_clock: which clock to query
593 * @tp: pointer to timespec variable to store the resolution 584 * @tp: pointer to timespec variable to store the resolution
594 * 585 *
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
599{ 590{
600 struct hrtimer_base *bases; 591 struct hrtimer_base *bases;
601 592
602 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 593 bases = __raw_get_cpu_var(hrtimer_bases);
603 *tp = ktime_to_timespec(bases[which_clock].resolution); 594 *tp = ktime_to_timespec(bases[which_clock].resolution);
604 595
605 return 0; 596 return 0;
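Instead of the private HRTIMER_INACTIVE marker, the hrtimer hunks adopt the rbtree convention that a detached node is its own parent (rb_set_parent(&timer->node, &timer->node)). A sketch of how 'not queued' can then be tested without extra state (the helper name is hypothetical):

/* a node whose parent pointer is itself is not linked into any rbtree */
static inline int example_timer_is_queued(const struct hrtimer *timer)
{
	return rb_parent(&timer->node) != &timer->node;
}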
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/* Deprecated, do not use. Moved from module.c to here. --RR */
2
3/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
4#include <linux/module.h>
5#include <linux/kmod.h>
6#include <linux/spinlock.h>
7#include <linux/list.h>
8#include <linux/slab.h>
9
10/* inter_module functions are always available, even when the kernel is
11 * compiled without modules. Consumers of inter_module_xxx routines
12 * will always work, even when both are built into the kernel, this
13 * approach removes lots of #ifdefs in mainline code.
14 */
15
16static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
17static DEFINE_SPINLOCK(ime_lock);
18static int kmalloc_failed;
19
20struct inter_module_entry {
21 struct list_head list;
22 const char *im_name;
23 struct module *owner;
24 const void *userdata;
25};
26
27/**
28 * inter_module_register - register a new set of inter module data.
29 * @im_name: an arbitrary string to identify the data, must be unique
30 * @owner: module that is registering the data, always use THIS_MODULE
31 * @userdata: pointer to arbitrary userdata to be registered
32 *
33 * Description: Check that the im_name has not already been registered,
34 * complain if it has. For new data, add it to the inter_module_entry
35 * list.
36 */
37void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
38{
39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new;
41
42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
46 im_name);
47 kmalloc_failed = 1;
48 return;
49 }
50 ime_new->im_name = im_name;
51 ime_new->owner = owner;
52 ime_new->userdata = userdata;
53
54 spin_lock(&ime_lock);
55 list_for_each(tmp, &ime_list) {
56 ime = list_entry(tmp, struct inter_module_entry, list);
57 if (strcmp(ime->im_name, im_name) == 0) {
58 spin_unlock(&ime_lock);
59 kfree(ime_new);
60 /* Program logic error, fatal */
61 printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
62 BUG();
63 }
64 }
65 list_add(&(ime_new->list), &ime_list);
66 spin_unlock(&ime_lock);
67}
68
69/**
70 * inter_module_unregister - unregister a set of inter module data.
71 * @im_name: an arbitrary string to identify the data, must be unique
72 *
73 * Description: Check that the im_name has been registered, complain if
74 * it has not. For existing data, remove it from the
75 * inter_module_entry list.
76 */
77void inter_module_unregister(const char *im_name)
78{
79 struct list_head *tmp;
80 struct inter_module_entry *ime;
81
82 spin_lock(&ime_lock);
83 list_for_each(tmp, &ime_list) {
84 ime = list_entry(tmp, struct inter_module_entry, list);
85 if (strcmp(ime->im_name, im_name) == 0) {
86 list_del(&(ime->list));
87 spin_unlock(&ime_lock);
88 kfree(ime);
89 return;
90 }
91 }
92 spin_unlock(&ime_lock);
93 if (kmalloc_failed) {
94 printk(KERN_ERR
95 "inter_module_unregister: no entry for '%s', "
96 "probably caused by previous kmalloc failure\n",
97 im_name);
98 return;
99 }
100 else {
101 /* Program logic error, fatal */
102 printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
103 BUG();
104 }
105}
106
107/**
108 * inter_module_get - return arbitrary userdata from another module.
109 * @im_name: an arbitrary string to identify the data, must be unique
110 *
111 * Description: If the im_name has not been registered, return NULL.
112 * Try to increment the use count on the owning module, if that fails
113 * then return NULL. Otherwise return the userdata.
114 */
115static const void *inter_module_get(const char *im_name)
116{
117 struct list_head *tmp;
118 struct inter_module_entry *ime;
119 const void *result = NULL;
120
121 spin_lock(&ime_lock);
122 list_for_each(tmp, &ime_list) {
123 ime = list_entry(tmp, struct inter_module_entry, list);
124 if (strcmp(ime->im_name, im_name) == 0) {
125 if (try_module_get(ime->owner))
126 result = ime->userdata;
127 break;
128 }
129 }
130 spin_unlock(&ime_lock);
131 return(result);
132}
133
134/**
135 * inter_module_get_request - im get with automatic request_module.
136 * @im_name: an arbitrary string to identify the data, must be unique
137 * @modname: module that is expected to register im_name
138 *
139 * Description: If inter_module_get fails, do request_module then retry.
140 */
141const void *inter_module_get_request(const char *im_name, const char *modname)
142{
143 const void *result = inter_module_get(im_name);
144 if (!result) {
145 request_module("%s", modname);
146 result = inter_module_get(im_name);
147 }
148 return(result);
149}
150
151/**
152 * inter_module_put - release use of data from another module.
153 * @im_name: an arbitrary string to identify the data, must be unique
154 *
155 * Description: If the im_name has not been registered, complain,
156 * otherwise decrement the use count on the owning module.
157 */
158void inter_module_put(const char *im_name)
159{
160 struct list_head *tmp;
161 struct inter_module_entry *ime;
162
163 spin_lock(&ime_lock);
164 list_for_each(tmp, &ime_list) {
165 ime = list_entry(tmp, struct inter_module_entry, list);
166 if (strcmp(ime->im_name, im_name) == 0) {
167 if (ime->owner)
168 module_put(ime->owner);
169 spin_unlock(&ime_lock);
170 return;
171 }
172 }
173 spin_unlock(&ime_lock);
174 printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
175 BUG();
176}
177
178EXPORT_SYMBOL(inter_module_register);
179EXPORT_SYMBOL(inter_module_unregister);
180EXPORT_SYMBOL(inter_module_get_request);
181EXPORT_SYMBOL(inter_module_put);
182
183MODULE_LICENSE("GPL");
184
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..0f6530117105 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
76/* 76/*
77 * Have got an event to handle: 77 * Have got an event to handle:
78 */ 78 */
79fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, 79fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
80 struct irqaction *action) 80 struct irqaction *action)
81{ 81{
82 int ret, retval = 0, status = 0; 82 irqreturn_t ret, retval = IRQ_NONE;
83 unsigned int status = 0;
83 84
84 if (!(action->flags & SA_INTERRUPT)) 85 if (!(action->flags & SA_INTERRUPT))
85 local_irq_enable(); 86 local_irq_enable();
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 134f9f2e0e39..a12d00eb5e7c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -30,7 +30,7 @@ void move_native_irq(int irq)
30 30
31 desc->move_irq = 0; 31 desc->move_irq = 0;
32 32
33 if (likely(cpus_empty(pending_irq_cpumask[irq]))) 33 if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
34 return; 34 return;
35 35
36 if (!desc->handler->set_affinity) 36 if (!desc->handler->set_affinity)
@@ -49,7 +49,7 @@ void move_native_irq(int irq)
49 * cause some ioapics to malfunction. 49 * cause some ioapics to malfunction.
50 * Being paranoid I guess! 50 * Being paranoid I guess!
51 */ 51 */
52 if (unlikely(!cpus_empty(tmp))) { 52 if (likely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED))) 53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq); 54 desc->handler->disable(irq);
55 55
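The migration hunk flips the likely()/unlikely() hints, presumably because the original annotations were backwards for the common path through move_native_irq(). The hints only steer branch prediction and block layout; behaviour is unchanged:

	/* hint: the pending mask is rarely empty by the time we get here */
	if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
		return;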
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..afacd6f585fa 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
24#ifdef CONFIG_GENERIC_PENDING_IRQ 24#ifdef CONFIG_GENERIC_PENDING_IRQ
25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
26{ 26{
27 set_balance_irq_affinity(irq, mask_val);
28
27 /* 29 /*
28 * Save these away for later use. Re-program when the 30 * Save these away for later use. Re-program when the
29 * interrupt is pending 31 * interrupt is pending
@@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
33#else 35#else
34void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 36void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
35{ 37{
38 set_balance_irq_affinity(irq, mask_val);
36 irq_affinity[irq] = mask_val; 39 irq_affinity[irq] = mask_val;
37 irq_desc[irq].handler->set_affinity(irq, mask_val); 40 irq_desc[irq].handler->set_affinity(irq, mask_val);
38} 41}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7df9abd5ec86..b2fb3c18d06b 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,7 +11,7 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup; 14static int irqfixup __read_mostly;
15 15
16/* 16/*
17 * Recovery handler for misrouted interrupts. 17 * Recovery handler for misrouted interrupts.
@@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, 136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
137 struct pt_regs *regs) 137 struct pt_regs *regs)
138{ 138{
139 if (action_ret != IRQ_HANDLED) { 139 if (unlikely(action_ret != IRQ_HANDLED)) {
140 desc->irqs_unhandled++; 140 desc->irqs_unhandled++;
141 if (action_ret != IRQ_NONE) 141 if (unlikely(action_ret != IRQ_NONE))
142 report_bad_irq(irq, desc, action_ret); 142 report_bad_irq(irq, desc, action_ret);
143 } 143 }
144 144
@@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
152 } 152 }
153 153
154 desc->irq_count++; 154 desc->irq_count++;
155 if (desc->irq_count < 100000) 155 if (likely(desc->irq_count < 100000))
156 return; 156 return;
157 157
158 desc->irq_count = 0; 158 desc->irq_count = 0;
159 if (desc->irqs_unhandled > 99900) { 159 if (unlikely(desc->irqs_unhandled > 99900)) {
160 /* 160 /*
161 * The interrupt is stuck 161 * The interrupt is stuck
162 */ 162 */
@@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
171 desc->irqs_unhandled = 0; 171 desc->irqs_unhandled = 0;
172} 172}
173 173
174int noirqdebug; 174int noirqdebug __read_mostly;
175 175
176int __init noirqdebug_setup(char *str) 176int __init noirqdebug_setup(char *str)
177{ 177{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..58f0f382597c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
902 * kexec does not sync, or unmount filesystems so if you need 902 * kexec does not sync, or unmount filesystems so if you need
903 * that to happen you need to do that yourself. 903 * that to happen you need to do that yourself.
904 */ 904 */
905struct kimage *kexec_image = NULL; 905struct kimage *kexec_image;
906static struct kimage *kexec_crash_image = NULL; 906struct kimage *kexec_crash_image;
907/* 907/*
908 * A home grown binary mutex. 908 * A home grown binary mutex.
909 * Nothing can wait so this mutex is safe to use 909 * Nothing can wait so this mutex is safe to use
910 * in interrupt context :) 910 * in interrupt context :)
911 */ 911 */
912static int kexec_lock = 0; 912static int kexec_lock;
913 913
914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
915 struct kexec_segment __user *segments, 915 struct kexec_segment __user *segments,
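The kexec hunk drops redundant '= NULL' / '= 0' initializers (static storage is zero-initialized by the C standard, and omitting the initializer keeps the objects in .bss instead of .data) and un-statics kexec_crash_image so the new ksysfs attributes further down can report on it:

struct kimage *kexec_image;		/* implicitly NULL */
struct kimage *kexec_crash_image;	/* now visible to kernel/ksysfs.c */
static int kexec_lock;			/* implicitly 0 */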
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..64aab081153b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
 58	.priority = 0x7fffffff /* we need to be notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
391 ap->addr = p->addr; 396 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 397 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 398 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 399 if (p->post_handler)
400 ap->post_handler = aggr_post_handler;
401 if (p->break_handler)
402 ap->break_handler = aggr_break_handler;
396 403
397 INIT_LIST_HEAD(&ap->list); 404 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 405 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 471 old_p = get_kprobe(p->addr);
465 if (old_p) { 472 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 473 ret = register_aggr_kprobe(old_p, p);
474 if (!ret)
475 atomic_inc(&kprobe_count);
467 goto out; 476 goto out;
468 } 477 }
469 478
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 483 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 484 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 485
486 if (atomic_add_return(1, &kprobe_count) == \
487 (ARCH_INACTIVE_KPROBE_COUNT + 1))
488 register_page_fault_notifier(&kprobe_page_fault_nb);
489
477 arch_arm_kprobe(p); 490 arch_arm_kprobe(p);
478 491
479out: 492out:
@@ -536,14 +549,40 @@ valid_p:
536 kfree(old_p); 549 kfree(old_p);
537 } 550 }
538 arch_remove_kprobe(p); 551 arch_remove_kprobe(p);
552 } else {
553 mutex_lock(&kprobe_mutex);
554 if (p->break_handler)
555 old_p->break_handler = NULL;
556 if (p->post_handler){
557 list_for_each_entry_rcu(list_p, &old_p->list, list){
558 if (list_p->post_handler){
559 cleanup_p = 2;
560 break;
561 }
562 }
563 if (cleanup_p == 0)
564 old_p->post_handler = NULL;
565 }
566 mutex_unlock(&kprobe_mutex);
539 } 567 }
568
569 /* Call unregister_page_fault_notifier()
570 * if no probes are active
571 */
572 mutex_lock(&kprobe_mutex);
573 if (atomic_add_return(-1, &kprobe_count) == \
574 ARCH_INACTIVE_KPROBE_COUNT)
575 unregister_page_fault_notifier(&kprobe_page_fault_nb);
576 mutex_unlock(&kprobe_mutex);
577 return;
540} 578}
541 579
542static struct notifier_block kprobe_exceptions_nb = { 580static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 581 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 582 .priority = 0x7fffffff /* we need to be notified first */
545}; 583};
546 584
585
547int __kprobes register_jprobe(struct jprobe *jp) 586int __kprobes register_jprobe(struct jprobe *jp)
548{ 587{
549 /* Todo: Verify probepoint is a function entry point */ 588 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +691,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 691 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 692 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 693 }
694 atomic_set(&kprobe_count, 0);
655 695
656 err = arch_init_kprobes(); 696 err = arch_init_kprobes();
657 if (!err) 697 if (!err)
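The kprobe_count plumbing arms the page-fault notifier lazily: it is registered when the number of live probes first exceeds the architecture's baseline (ARCH_INACTIVE_KPROBE_COUNT) and unregistered when the last one goes away, so page faults pay nothing while no probes are active. Reduced to a hedged sketch with hypothetical names and a zero baseline:

static atomic_t example_users = ATOMIC_INIT(0);

static void example_first_user(void)
{
	/* transition 0 -> 1 brings the hook online */
	if (atomic_add_return(1, &example_users) == 1)
		register_page_fault_notifier(&kprobe_page_fault_nb);
}

static void example_last_user(void)
{
	/* transition 1 -> 0 takes it back down */
	if (atomic_add_return(-1, &example_users) == 0)
		unregister_page_fault_notifier(&kprobe_page_fault_nb);
}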
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..9e28478a17a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/sysfs.h> 14#include <linux/sysfs.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/kexec.h>
17 18
18#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
48KERNEL_ATTR_RW(uevent_helper); 49KERNEL_ATTR_RW(uevent_helper);
49#endif 50#endif
50 51
52#ifdef CONFIG_KEXEC
53static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
54{
55 return sprintf(page, "%d\n", !!kexec_image);
56}
57KERNEL_ATTR_RO(kexec_loaded);
58
59static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
60{
61 return sprintf(page, "%d\n", !!kexec_crash_image);
62}
63KERNEL_ATTR_RO(kexec_crash_loaded);
64#endif /* CONFIG_KEXEC */
65
51decl_subsys(kernel, NULL, NULL); 66decl_subsys(kernel, NULL, NULL);
52EXPORT_SYMBOL_GPL(kernel_subsys); 67EXPORT_SYMBOL_GPL(kernel_subsys);
53 68
@@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = {
56 &uevent_seqnum_attr.attr, 71 &uevent_seqnum_attr.attr,
57 &uevent_helper_attr.attr, 72 &uevent_helper_attr.attr,
58#endif 73#endif
74#ifdef CONFIG_KEXEC
75 &kexec_loaded_attr.attr,
76 &kexec_crash_loaded_attr.attr,
77#endif
59 NULL 78 NULL
60}; 79};
61 80
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..24be714b04c7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
45static DEFINE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
46static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
47 47
48/**
49 * kthread_should_stop - should this kthread return now?
50 *
51 * When someone calls kthread_stop on your kthread, it will be woken
52 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop().
54 */
48int kthread_should_stop(void) 55int kthread_should_stop(void)
49{ 56{
50 return (kthread_stop_info.k == current); 57 return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
122 complete(&create->done); 129 complete(&create->done);
123} 130}
124 131
132/**
133 * kthread_create - create a kthread.
134 * @threadfn: the function to run until signal_pending(current).
135 * @data: data ptr for @threadfn.
136 * @namefmt: printf-style name for the thread.
137 *
138 * Description: This helper function creates and names a kernel
139 * thread. The thread will be stopped: use wake_up_process() to start
140 * it. See also kthread_run(), kthread_create_on_cpu().
141 *
142 * When woken, the thread will run @threadfn() with @data as its
143 * argument. @threadfn can either call do_exit() directly if it is a
144 * standalone thread for which no one will call kthread_stop(), or
145 * return when 'kthread_should_stop()' is true (which means
146 * kthread_stop() has been called). The return value should be zero
147 * or a negative error number; it will be passed to kthread_stop().
148 *
149 * Returns a task_struct or ERR_PTR(-ENOMEM).
150 */
125struct task_struct *kthread_create(int (*threadfn)(void *data), 151struct task_struct *kthread_create(int (*threadfn)(void *data),
126 void *data, 152 void *data,
127 const char namefmt[], 153 const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
156} 182}
157EXPORT_SYMBOL(kthread_create); 183EXPORT_SYMBOL(kthread_create);
158 184
185/**
186 * kthread_bind - bind a just-created kthread to a cpu.
187 * @k: thread created by kthread_create().
188 * @cpu: cpu (might not be online, must be possible) for @k to run on.
189 *
190 * Description: This function is equivalent to set_cpus_allowed(),
191 * except that @cpu doesn't need to be online, and the thread must be
192 * stopped (i.e., just returned from kthread_create()).
193 */
159void kthread_bind(struct task_struct *k, unsigned int cpu) 194void kthread_bind(struct task_struct *k, unsigned int cpu)
160{ 195{
161 BUG_ON(k->state != TASK_INTERRUPTIBLE); 196 BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
166} 201}
167EXPORT_SYMBOL(kthread_bind); 202EXPORT_SYMBOL(kthread_bind);
168 203
204/**
205 * kthread_stop - stop a thread created by kthread_create().
206 * @k: thread created by kthread_create().
207 *
208 * Sets kthread_should_stop() for @k to return true, wakes it, and
209 * waits for it to exit. Your threadfn() must not call do_exit()
210 * itself if you use this function! This can also be called after
211 * kthread_create() instead of calling wake_up_process(): the thread
212 * will exit without calling threadfn().
213 *
214 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
215 * was never called.
216 */
169int kthread_stop(struct task_struct *k) 217int kthread_stop(struct task_struct *k)
170{ 218{
171 return kthread_stop_sem(k, NULL); 219 return kthread_stop_sem(k, NULL);
172} 220}
173EXPORT_SYMBOL(kthread_stop); 221EXPORT_SYMBOL(kthread_stop);
174 222
223/**
224 * kthread_stop_sem - stop a thread created by kthread_create().
225 * @k: thread created by kthread_create().
226 * @s: semaphore that @k waits on while idle.
227 *
228 * Does essentially the same thing as kthread_stop() above, but wakes
229 * @k by calling up(@s).
230 *
231 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
232 * was never called.
233 */
175int kthread_stop_sem(struct task_struct *k, struct semaphore *s) 234int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
176{ 235{
177 int ret; 236 int ret;
@@ -210,5 +269,5 @@ static __init int helper_init(void)
210 269
211 return 0; 270 return 0;
212} 271}
213core_initcall(helper_init);
214 272
273core_initcall(helper_init);
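The new kernel-doc above covers the whole kthread lifecycle; a minimal hedged usage sketch tying the pieces together (all example_* names are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* one unit of work */
	return 0;	/* handed back through kthread_stop() */
}

static int example_start(void)
{
	example_task = kthread_create(example_fn, NULL, "example");
	if (IS_ERR(example_task))
		return PTR_ERR(example_task);
	wake_up_process(example_task);	/* kthread_create() leaves it stopped */
	return 0;
}

static void example_stop(void)
{
	kthread_stop(example_task);	/* wakes the thread, waits for its exit */
}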
diff --git a/kernel/module.c b/kernel/module.c
index 690381508d09..10e5b872adf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h> 42#include <linux/mutex.h>
43#include <linux/unwind.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/semaphore.h> 45#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 46#include <asm/cacheflush.h>
@@ -1052,6 +1053,8 @@ static void free_module(struct module *mod)
1052 remove_sect_attrs(mod); 1053 remove_sect_attrs(mod);
1053 mod_kobject_remove(mod); 1054 mod_kobject_remove(mod);
1054 1055
1056 unwind_remove_table(mod->unwind_info, 0);
1057
1055 /* Arch-specific cleanup. */ 1058 /* Arch-specific cleanup. */
1056 module_arch_cleanup(mod); 1059 module_arch_cleanup(mod);
1057 1060
@@ -1317,7 +1320,7 @@ int is_exported(const char *name, const struct module *mod)
1317 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1320 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1318 return 1; 1321 return 1;
1319 else 1322 else
1320 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1323 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1321 return 1; 1324 return 1;
1322 else 1325 else
1323 return 0; 1326 return 0;
@@ -1403,7 +1406,7 @@ static struct module *load_module(void __user *umod,
1403 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1406 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1404 exportindex, modindex, obsparmindex, infoindex, gplindex, 1407 exportindex, modindex, obsparmindex, infoindex, gplindex,
1405 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1408 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
1406 gplfuturecrcindex; 1409 gplfuturecrcindex, unwindex = 0;
1407 struct module *mod; 1410 struct module *mod;
1408 long err = 0; 1411 long err = 0;
1409 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1412 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,6 +1496,9 @@ static struct module *load_module(void __user *umod,
1493 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1496 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1494 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1497 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1495 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1498 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1499#ifdef ARCH_UNWIND_SECTION_NAME
1500 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1501#endif
1496 1502
1497 /* Don't keep modinfo section */ 1503 /* Don't keep modinfo section */
1498 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1504 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1501,6 +1507,8 @@ static struct module *load_module(void __user *umod,
1501 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1507 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1502 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1508 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1503#endif 1509#endif
1510 if (unwindex)
1511 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1504 1512
1505 /* Check module struct version now, before we try to use module. */ 1513 /* Check module struct version now, before we try to use module. */
1506 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1514 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1729,6 +1737,11 @@ static struct module *load_module(void __user *umod,
1729 goto arch_cleanup; 1737 goto arch_cleanup;
1730 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1738 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1731 1739
1740 /* Size of section 0 is 0, so this works well if no unwind info. */
1741 mod->unwind_info = unwind_add_table(mod,
1742 (void *)sechdrs[unwindex].sh_addr,
1743 sechdrs[unwindex].sh_size);
1744
1732 /* Get rid of temporary copy */ 1745 /* Get rid of temporary copy */
1733 vfree(hdr); 1746 vfree(hdr);
1734 1747
@@ -1827,6 +1840,7 @@ sys_init_module(void __user *umod,
1827 mod->state = MODULE_STATE_LIVE; 1840 mod->state = MODULE_STATE_LIVE;
1828 /* Drop initial reference. */ 1841 /* Drop initial reference. */
1829 module_put(mod); 1842 module_put(mod);
1843 unwind_remove_table(mod->unwind_info, 1);
1830 module_free(mod, mod->module_init); 1844 module_free(mod, mod->module_init);
1831 mod->module_init = NULL; 1845 mod->module_init = NULL;
1832 mod->init_size = 0; 1846 mod->init_size = 0;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..036b6285b15c 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -153,13 +153,13 @@ next:
153 continue; 153 continue;
154 count++; 154 count++;
155 cursor = curr->next; 155 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags); 156 debug_spin_unlock_restore(&debug_mutex_lock, flags);
157 157
158 printk("\n#%03d: ", count); 158 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1); 159 printk_lock(lock, filter ? 0 : 1);
160 goto next; 160 goto next;
161 } 161 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags); 162 debug_spin_unlock_restore(&debug_mutex_lock, flags);
163 printk("\n"); 163 printk("\n");
164} 164}
165 165
@@ -316,7 +316,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
316 continue; 316 continue;
317 list_del_init(curr); 317 list_del_init(curr);
318 DEBUG_OFF(); 318 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags); 319 debug_spin_unlock_restore(&debug_mutex_lock, flags);
320 320
321 printk("BUG: %s/%d, lock held at task exit time!\n", 321 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid); 322 task->comm, task->pid);
@@ -325,7 +325,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
325 printk("exiting task is not even the owner??\n"); 325 printk("exiting task is not even the owner??\n");
326 return; 326 return;
327 } 327 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags); 328 debug_spin_unlock_restore(&debug_mutex_lock, flags);
329} 329}
330 330
331/* 331/*
@@ -352,7 +352,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
352 continue; 352 continue;
353 list_del_init(curr); 353 list_del_init(curr);
354 DEBUG_OFF(); 354 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags); 355 debug_spin_unlock_restore(&debug_mutex_lock, flags);
356 356
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", 357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to); 358 current->comm, current->pid, lock, from, to);
@@ -362,7 +362,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
362 printk("freeing task is not even the owner??\n"); 362 printk("freeing task is not even the owner??\n");
363 return; 363 return;
364 } 364 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags); 365 debug_spin_unlock_restore(&debug_mutex_lock, flags);
366} 366}
367 367
368/* 368/*
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..a5196c36a5fd 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
46extern void debug_mutex_unlock(struct mutex *lock); 46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 47extern void debug_mutex_init(struct mutex *lock, const char *name);
48 48
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \ 49#define debug_spin_lock_save(lock, flags) \
65 do { \ 50 do { \
66 local_irq_save(flags); \ 51 local_irq_save(flags); \
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
68 spin_lock(lock); \ 53 spin_lock(lock); \
69 } while (0) 54 } while (0)
70 55
71#define debug_spin_lock_restore(lock, flags) \ 56#define debug_spin_unlock_restore(lock, flags) \
72 do { \ 57 do { \
73 if (debug_mutex_on) \ 58 if (debug_mutex_on) \
74 spin_unlock(lock); \ 59 spin_unlock(lock); \
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
76 preempt_check_resched(); \ 61 preempt_check_resched(); \
77 } while (0) 62 } while (0)
78 63
79#define spin_lock_mutex(lock) \ 64#define spin_lock_mutex(lock, flags) \
80 do { \ 65 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 66 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \ 67 \
83 DEBUG_WARN_ON(in_interrupt()); \ 68 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \ 69 debug_spin_lock_save(&debug_mutex_lock, flags); \
85 spin_lock(lock); \ 70 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \ 71 DEBUG_WARN_ON(l->magic != l); \
87 } while (0) 72 } while (0)
88 73
89#define spin_unlock_mutex(lock) \ 74#define spin_unlock_mutex(lock, flags) \
90 do { \ 75 do { \
91 spin_unlock(lock); \ 76 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \ 77 debug_spin_unlock_restore(&debug_mutex_lock, flags); \
93 } while (0) 78 } while (0)
94 79
95#define DEBUG_OFF() \ 80#define DEBUG_OFF() \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..7043db21bbce 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
125 struct task_struct *task = current; 125 struct task_struct *task = current;
126 struct mutex_waiter waiter; 126 struct mutex_waiter waiter;
127 unsigned int old_val; 127 unsigned int old_val;
128 unsigned long flags;
128 129
129 debug_mutex_init_waiter(&waiter); 130 debug_mutex_init_waiter(&waiter);
130 131
131 spin_lock_mutex(&lock->wait_lock); 132 spin_lock_mutex(&lock->wait_lock, flags);
132 133
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 134 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134 135
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 158 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 159 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 160 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 161 spin_unlock_mutex(&lock->wait_lock, flags);
161 162
162 debug_mutex_free_waiter(&waiter); 163 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 164 return -EINTR;
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 166 __set_task_state(task, state);
166 167
167 /* didnt get the lock, go to sleep: */ 168 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 169 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 170 schedule();
170 spin_lock_mutex(&lock->wait_lock); 171 spin_lock_mutex(&lock->wait_lock, flags);
171 } 172 }
172 173
173 /* got the lock - rejoice! */ 174 /* got the lock - rejoice! */
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
178 if (likely(list_empty(&lock->wait_list))) 179 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 180 atomic_set(&lock->count, 0);
180 181
181 spin_unlock_mutex(&lock->wait_lock); 182 spin_unlock_mutex(&lock->wait_lock, flags);
182 183
183 debug_mutex_free_waiter(&waiter); 184 debug_mutex_free_waiter(&waiter);
184 185
@@ -203,10 +204,11 @@ static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 204__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{ 205{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 206 struct mutex *lock = container_of(lock_count, struct mutex, count);
207 unsigned long flags;
206 208
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 209 DEBUG_WARN_ON(lock->owner != current_thread_info());
208 210
209 spin_lock_mutex(&lock->wait_lock); 211 spin_lock_mutex(&lock->wait_lock, flags);
210 212
211 /* 213 /*
212 * some architectures leave the lock unlocked in the fastpath failure 214 * some architectures leave the lock unlocked in the fastpath failure
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 233
232 debug_mutex_clear_owner(lock); 234 debug_mutex_clear_owner(lock);
233 235
234 spin_unlock_mutex(&lock->wait_lock); 236 spin_unlock_mutex(&lock->wait_lock, flags);
235} 237}
236 238
237/* 239/*
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 278static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 279{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 280 struct mutex *lock = container_of(lock_count, struct mutex, count);
281 unsigned long flags;
279 int prev; 282 int prev;
280 283
281 spin_lock_mutex(&lock->wait_lock); 284 spin_lock_mutex(&lock->wait_lock, flags);
282 285
283 prev = atomic_xchg(&lock->count, -1); 286 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 287 if (likely(prev == 1))
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
287 if (likely(list_empty(&lock->wait_list))) 290 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 291 atomic_set(&lock->count, 0);
289 292
290 spin_unlock_mutex(&lock->wait_lock); 293 spin_unlock_mutex(&lock->wait_lock, flags);
291 294
292 return prev == 1; 295 return prev == 1;
293} 296}
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..069189947257 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,8 +9,10 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
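The mutex series threads an on-stack IRQ-flags word through the lock helpers so the debug build can pair local_irq_save()/local_irq_restore() correctly instead of unconditionally re-enabling interrupts, while the !CONFIG_DEBUG_MUTEXES variant simply casts the flags to void. The resulting call pattern:

	unsigned long flags;

	spin_lock_mutex(&lock->wait_lock, flags);
	/* ... wait-list manipulation under the lock ... */
	spin_unlock_mutex(&lock->wait_lock, flags);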
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 520f6c59948d..d38d9ec3276c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
555 struct cpu_timer_list *next; 555 struct cpu_timer_list *next;
556 unsigned long i; 556 unsigned long i;
557 557
558 if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
559 return;
560
561 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 558 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
562 p->cpu_timers : p->signal->cpu_timers); 559 p->cpu_timers : p->signal->cpu_timers);
563 head += CPUCLOCK_WHICH(timer->it_clock); 560 head += CPUCLOCK_WHICH(timer->it_clock);
@@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk,
1173 } 1170 }
1174 t = tsk; 1171 t = tsk;
1175 do { 1172 do {
1173 if (unlikely(t->flags & PF_EXITING))
1174 continue;
1175
1176 ticks = cputime_add(cputime_add(t->utime, t->stime), 1176 ticks = cputime_add(cputime_add(t->utime, t->stime),
1177 prof_left); 1177 prof_left);
1178 if (!cputime_eq(prof_expires, cputime_zero) && 1178 if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk,
1193 t->it_sched_expires > sched)) { 1193 t->it_sched_expires > sched)) {
1194 t->it_sched_expires = sched; 1194 t->it_sched_expires = sched;
1195 } 1195 }
1196 1196 } while ((t = next_thread(t)) != tsk);
1197 do {
1198 t = next_thread(t);
1199 } while (unlikely(t->flags & PF_EXITING));
1200 } while (t != tsk);
1201 } 1197 }
1202} 1198}
1203 1199
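
One C subtlety makes the rewritten loop correct: `continue` inside a do/while transfers control to the controlling expression, so `(t = next_thread(t)) != tsk` still executes and an exiting thread is skipped without stalling the traversal. A stand-alone illustration of that rule:

	#include <stdio.h>

	int main(void)
	{
		int i = 0;

		do {
			if (i == 2)
				continue;	/* jumps to the ++i test, not past it */
			printf("%d\n", i);	/* prints 0 1 3 4 */
		} while (++i < 5);

		return 0;
	}
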
@@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1289 1285
1290#undef UNEXPIRED 1286#undef UNEXPIRED
1291 1287
1292 BUG_ON(tsk->exit_state);
1293
1294 /* 1288 /*
1295 * Double-check with locks held. 1289 * Double-check with locks held.
1296 */ 1290 */
1297 read_lock(&tasklist_lock); 1291 read_lock(&tasklist_lock);
1298 spin_lock(&tsk->sighand->siglock); 1292 if (likely(tsk->signal != NULL)) {
1293 spin_lock(&tsk->sighand->siglock);
1299 1294
1300 /* 1295 /*
1301 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1296 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
1302 * all the timers that are firing, and put them on the firing list. 1297 * all the timers that are firing, and put them on the firing list.
1303 */ 1298 */
1304 check_thread_timers(tsk, &firing); 1299 check_thread_timers(tsk, &firing);
1305 check_process_timers(tsk, &firing); 1300 check_process_timers(tsk, &firing);
1306 1301
1307 /* 1302 /*
1308 * We must release these locks before taking any timer's lock. 1303 * We must release these locks before taking any timer's lock.
1309 * There is a potential race with timer deletion here, as the 1304 * There is a potential race with timer deletion here, as the
1310 * siglock now protects our private firing list. We have set 1305 * siglock now protects our private firing list. We have set
1311 * the firing flag in each timer, so that a deletion attempt 1306 * the firing flag in each timer, so that a deletion attempt
1312 * that gets the timer lock before we do will give it up and 1307 * that gets the timer lock before we do will give it up and
1313 * spin until we've taken care of that timer below. 1308 * spin until we've taken care of that timer below.
1314 */ 1309 */
1315 spin_unlock(&tsk->sighand->siglock); 1310 spin_unlock(&tsk->sighand->siglock);
1311 }
1316 read_unlock(&tasklist_lock); 1312 read_unlock(&tasklist_lock);
1317 1313
1318 /* 1314 /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..fc311a4673a2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,15 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config PM_TRACE
40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32
42 default y
43 ---help---
44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume).
47
39config SOFTWARE_SUSPEND 48config SOFTWARE_SUSPEND
40 bool "Software Suspend" 49 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 50 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..e13e74067845 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -231,7 +231,7 @@ static int software_resume(void)
231late_initcall(software_resume); 231late_initcall(software_resume);
232 232
233 233
234static char * pm_disk_modes[] = { 234static const char * const pm_disk_modes[] = {
235 [PM_DISK_FIRMWARE] = "firmware", 235 [PM_DISK_FIRMWARE] = "firmware",
236 [PM_DISK_PLATFORM] = "platform", 236 [PM_DISK_PLATFORM] = "platform",
237 [PM_DISK_SHUTDOWN] = "shutdown", 237 [PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a6d9ef46009e..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18 18#include <linux/console.h>
19 19
20#include "power.h" 20#include "power.h"
21 21
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state)
86 goto Thaw; 86 goto Thaw;
87 } 87 }
88 88
89 suspend_console();
89 if ((error = device_suspend(PMSG_SUSPEND))) { 90 if ((error = device_suspend(PMSG_SUSPEND))) {
90 printk(KERN_ERR "Some devices failed to suspend\n"); 91 printk(KERN_ERR "Some devices failed to suspend\n");
91 goto Finish; 92 goto Finish;
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state)
133static void suspend_finish(suspend_state_t state) 134static void suspend_finish(suspend_state_t state)
134{ 135{
135 device_resume(); 136 device_resume();
137 resume_console();
136 thaw_processes(); 138 thaw_processes();
137 enable_nonboot_cpus(); 139 enable_nonboot_cpus();
138 if (pm_ops && pm_ops->finish) 140 if (pm_ops && pm_ops->finish)
@@ -143,7 +145,7 @@ static void suspend_finish(suspend_state_t state)
143 145
144 146
145 147
146static char *pm_states[PM_SUSPEND_MAX] = { 148static const char * const pm_states[PM_SUSPEND_MAX] = {
147 [PM_SUSPEND_STANDBY] = "standby", 149 [PM_SUSPEND_STANDBY] = "standby",
148 [PM_SUSPEND_MEM] = "mem", 150 [PM_SUSPEND_MEM] = "mem",
149#ifdef CONFIG_SOFTWARE_SUSPEND 151#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -260,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
260static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
261{ 263{
262 suspend_state_t state = PM_SUSPEND_STANDBY; 264 suspend_state_t state = PM_SUSPEND_STANDBY;
263 char ** s; 265 const char * const *s;
264 char *p; 266 char *p;
265 int error; 267 int error;
266 int len; 268 int len;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -55,7 +55,7 @@ struct snapshot_handle {
55 unsigned int page; 55 unsigned int page;
56 unsigned int page_offset; 56 unsigned int page_offset;
57 unsigned int prev; 57 unsigned int prev;
58 struct pbe *pbe; 58 struct pbe *pbe, *last_pbe;
59 void *buffer; 59 void *buffer;
60 unsigned int buf_offset; 60 unsigned int buf_offset;
61}; 61};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..24c96f354231 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -150,6 +150,10 @@ int restore_highmem(void)
150 } 150 }
151 return 0; 151 return 0;
152} 152}
153#else
154static inline unsigned int count_highmem_pages(void) {return 0;}
155static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;}
153#endif 157#endif
154 158
155static int pfn_is_nosave(unsigned long pfn) 159static int pfn_is_nosave(unsigned long pfn)
@@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
293 } 297 }
294} 298}
295 299
296/** 300static unsigned int unsafe_pages;
297 * On resume it is necessary to trace and eventually free the unsafe
298 * pages that have been allocated, because they are needed for I/O
299 * (on x86-64 we likely will "eat" these pages once again while
300 * creating the temporary page translation tables)
301 */
302
303struct eaten_page {
304 struct eaten_page *next;
305 char padding[PAGE_SIZE - sizeof(void *)];
306};
307
308static struct eaten_page *eaten_pages = NULL;
309
310static void release_eaten_pages(void)
311{
312 struct eaten_page *p, *q;
313
314 p = eaten_pages;
315 while (p) {
316 q = p->next;
317 /* We don't want swsusp_free() to free this page again */
318 ClearPageNosave(virt_to_page(p));
319 free_page((unsigned long)p);
320 p = q;
321 }
322 eaten_pages = NULL;
323}
324 301
325/** 302/**
326 * @safe_needed - on resume, for storing the PBE list and the image, 303 * @safe_needed - on resume, for storing the PBE list and the image,
327 * we can only use memory pages that do not conflict with the pages 304 * we can only use memory pages that do not conflict with the pages
328 * which had been used before suspend. 305 * used before suspend.
329 * 306 *
330 * The unsafe pages are marked with the PG_nosave_free flag 307 * The unsafe pages are marked with the PG_nosave_free flag
331 * 308 * and we count them using unsafe_pages
332 * Allocated but unusable (ie eaten) memory pages should be marked
333 * so that swsusp_free() can release them
334 */ 309 */
335 310
336static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 311static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
337{ 312{
338 void *res; 313 void *res;
339 314
315 res = (void *)get_zeroed_page(gfp_mask);
340 if (safe_needed) 316 if (safe_needed)
341 do { 317 while (res && PageNosaveFree(virt_to_page(res))) {
318 /* The page is unsafe, mark it for swsusp_free() */
319 SetPageNosave(virt_to_page(res));
320 unsafe_pages++;
342 res = (void *)get_zeroed_page(gfp_mask); 321 res = (void *)get_zeroed_page(gfp_mask);
343 if (res && PageNosaveFree(virt_to_page(res))) { 322 }
344 /* This is for swsusp_free() */
345 SetPageNosave(virt_to_page(res));
346 ((struct eaten_page *)res)->next = eaten_pages;
347 eaten_pages = res;
348 }
349 } while (res && PageNosaveFree(virt_to_page(res)));
350 else
351 res = (void *)get_zeroed_page(gfp_mask);
352 if (res) { 323 if (res) {
353 SetPageNosave(virt_to_page(res)); 324 SetPageNosave(virt_to_page(res));
354 SetPageNosaveFree(virt_to_page(res)); 325 SetPageNosaveFree(virt_to_page(res));
@@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
374 * On each page we set up a list of struct_pbe elements. 345 * On each page we set up a list of struct_pbe elements.
375 */ 346 */
376 347
377struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) 348static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
349 int safe_needed)
378{ 350{
379 unsigned int num; 351 unsigned int num;
380 struct pbe *pblist, *pbe; 352 struct pbe *pblist, *pbe;
@@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist)
642 return -EFAULT; 614 return -EFAULT;
643 } 615 }
644 616
617 unsafe_pages = 0;
618
645 return 0; 619 return 0;
646} 620}
647 621
@@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
719} 693}
720 694
721/** 695/**
722 * create_image - use metadata contained in the PBE list 696 * prepare_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will 697 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system 698 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for 699 * memory state from the image ("unsafe" pages) and allocate
726 * the image avoiding these pages 700 * memory for the image
701 *
702 * The idea is to allocate the PBE list first and then
 703 * allocate as many pages as are needed for the image data,
704 * but not to assign these pages to the PBEs initially.
705 * Instead, we just mark them as allocated and create a list
 706 * of "safe" pages which will be used later
727 */ 707 */
728 708
729static int create_image(struct snapshot_handle *handle) 709struct safe_page {
710 struct safe_page *next;
711 char padding[PAGE_SIZE - sizeof(void *)];
712};
713
714static struct safe_page *safe_pages;
715
716static int prepare_image(struct snapshot_handle *handle)
730{ 717{
731 int error = 0; 718 int error = 0;
732 struct pbe *p, *pblist; 719 unsigned int nr_pages = nr_copy_pages;
720 struct pbe *p, *pblist = NULL;
733 721
734 p = pagedir_nosave; 722 p = pagedir_nosave;
735 error = mark_unsafe_pages(p); 723 error = mark_unsafe_pages(p);
736 if (!error) { 724 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 725 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
738 if (pblist) 726 if (pblist)
739 copy_page_backup_list(pblist, p); 727 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0); 728 free_pagedir(p, 0);
741 if (!pblist) 729 if (!pblist)
742 error = -ENOMEM; 730 error = -ENOMEM;
743 } 731 }
744 if (!error) 732 safe_pages = NULL;
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1); 733 if (!error && nr_pages > unsafe_pages) {
734 nr_pages -= unsafe_pages;
735 while (nr_pages--) {
736 struct safe_page *ptr;
737
738 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
739 if (!ptr) {
740 error = -ENOMEM;
741 break;
742 }
743 if (!PageNosaveFree(virt_to_page(ptr))) {
744 /* The page is "safe", add it to the list */
745 ptr->next = safe_pages;
746 safe_pages = ptr;
747 }
748 /* Mark the page as allocated */
749 SetPageNosave(virt_to_page(ptr));
750 SetPageNosaveFree(virt_to_page(ptr));
751 }
752 }
746 if (!error) { 753 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist; 754 pagedir_nosave = pblist;
749 } else { 755 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL; 756 handle->pbe = NULL;
752 nr_copy_pages = 0; 757 swsusp_free();
753 nr_meta_pages = 0;
754 } 758 }
755 return error; 759 return error;
756} 760}
757 761
762static void *get_buffer(struct snapshot_handle *handle)
763{
764 struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
765 struct page *page = virt_to_page(pbe->orig_address);
766
767 if (PageNosave(page) && PageNosaveFree(page)) {
768 /*
769 * We have allocated the "original" page frame and we can
770 * use it directly to store the read page
771 */
772 pbe->address = 0;
773 if (last && last->next)
774 last->next = NULL;
775 return (void *)pbe->orig_address;
776 }
777 /*
778 * The "original" page frame has not been allocated and we have to
779 * use a "safe" page frame to store the read page
780 */
781 pbe->address = (unsigned long)safe_pages;
782 safe_pages = safe_pages->next;
783 if (last)
784 last->next = pbe;
785 handle->last_pbe = pbe;
786 return (void *)pbe->address;
787}
788
758/** 789/**
759 * snapshot_write_next - used for writing the system memory snapshot. 790 * snapshot_write_next - used for writing the system memory snapshot.
760 * 791 *
@@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
799 } else if (handle->prev <= nr_meta_pages) { 830 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 831 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) { 832 if (!handle->pbe) {
802 error = create_image(handle); 833 error = prepare_image(handle);
803 if (error) 834 if (error)
804 return error; 835 return error;
805 handle->pbe = pagedir_nosave; 836 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address; 837 handle->last_pbe = NULL;
838 handle->buffer = get_buffer(handle);
807 } 839 }
808 } else { 840 } else {
809 handle->pbe = handle->pbe->next; 841 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address; 842 handle->buffer = get_buffer(handle);
811 } 843 }
812 handle->prev = handle->page; 844 handle->prev = handle->page;
813 } 845 }
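
The struct safe_page trick above stores the free-list link inside the tracked page itself: the padding member pads each node out to exactly one page, so the list costs no extra bookkeeping memory, and get_buffer() either reuses the original page frame directly (pbe->address = 0) or pops one node off safe_pages. A hedged sketch of a compile-time guard for that layout invariant (not part of the patch):

	/* States the layout assumption explicitly: each safe_page node
	 * must occupy exactly one page, since the link lives in the
	 * page being tracked and the padding fills the remainder. */
	static inline void safe_page_layout_check(void)
	{
		BUILD_BUG_ON(sizeof(struct safe_page) != PAGE_SIZE);
	}
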
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void);
67int save_highmem(void); 67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static int save_highmem(void) { return 0; } 70static inline int save_highmem(void) { return 0; }
71static int restore_highmem(void) { return 0; } 71static inline int restore_highmem(void) { return 0; }
72static unsigned int count_highmem_pages(void) { return 0; } 72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 73#endif
74 74
75/** 75/**
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
175 */ 175 */
176 176
177#define SHRINK_BITE 10000 177#define SHRINK_BITE 10000
178static inline unsigned long __shrink_memory(long tmp)
179{
180 if (tmp > SHRINK_BITE)
181 tmp = SHRINK_BITE;
182 return shrink_all_memory(tmp);
183}
178 184
179int swsusp_shrink_memory(void) 185int swsusp_shrink_memory(void)
180{ 186{
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void)
192 PAGES_FOR_IO; 198 PAGES_FOR_IO;
193 tmp = size; 199 tmp = size;
194 for_each_zone (zone) 200 for_each_zone (zone)
195 if (!is_highmem(zone)) 201 if (!is_highmem(zone) && populated_zone(zone)) {
196 tmp -= zone->free_pages; 202 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL];
204 }
197 if (tmp > 0) { 205 if (tmp > 0) {
198 tmp = shrink_all_memory(SHRINK_BITE); 206 tmp = __shrink_memory(tmp);
199 if (!tmp) 207 if (!tmp)
200 return -ENOMEM; 208 return -ENOMEM;
201 pages += tmp; 209 pages += tmp;
202 } else if (size > image_size / PAGE_SIZE) { 210 } else if (size > image_size / PAGE_SIZE) {
203 tmp = shrink_all_memory(SHRINK_BITE); 211 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
204 pages += tmp; 212 pages += tmp;
205 } 213 }
206 printk("\b%c", p[i++%4]); 214 printk("\b%c", p[i++%4]);
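
__shrink_memory() keeps the SHRINK_BITE cap but now requests the actual deficit, so a 25000-page shortfall is reclaimed as 10000 + 10000 + 5000 rather than blind fixed-size bites. A user-space toy run of the same clamping, where reclaim() stands in for shrink_all_memory() and pretends every request succeeds:

	#include <stdio.h>

	#define SHRINK_BITE 10000

	static long reclaim(long nr)
	{
		return nr > SHRINK_BITE ? SHRINK_BITE : nr;
	}

	int main(void)
	{
		long deficit = 25000;

		while (deficit > 0) {
			long got = reclaim(deficit);

			printf("reclaimed %ld pages\n", got);	/* 10000, 10000, 5000 */
			deficit -= got;
		}
		return 0;
	}
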
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..95b7fe17f124 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,6 +24,7 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 28#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h> 29#include <linux/config.h>
29#include <linux/delay.h> 30#include <linux/delay.h>
@@ -67,6 +68,7 @@ EXPORT_SYMBOL(oops_in_progress);
67 * driver system. 68 * driver system.
68 */ 69 */
69static DECLARE_MUTEX(console_sem); 70static DECLARE_MUTEX(console_sem);
71static DECLARE_MUTEX(secondary_console_sem);
70struct console *console_drivers; 72struct console *console_drivers;
71/* 73/*
72 * This is used for debugging the mess that is the VT code by 74 * This is used for debugging the mess that is the VT code by
@@ -76,7 +78,7 @@ struct console *console_drivers;
76 * path in the console code where we end up in places I want 78 * path in the console code where we end up in places I want
 77 * locked without the console semaphore held 79 * locked without the console semaphore held
78 */ 80 */
79static int console_locked; 81static int console_locked, console_suspended;
80 82
81/* 83/*
82 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 84 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
@@ -326,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
326 struct console *con; 328 struct console *con;
327 329
328 for (con = console_drivers; con; con = con->next) { 330 for (con = console_drivers; con; con = con->next) {
329 if ((con->flags & CON_ENABLED) && con->write) 331 if ((con->flags & CON_ENABLED) && con->write &&
332 (cpu_online(smp_processor_id()) ||
333 (con->flags & CON_ANYTIME)))
330 con->write(con, &LOG_BUF(start), end - start); 334 con->write(con, &LOG_BUF(start), end - start);
331 } 335 }
332} 336}
@@ -436,6 +440,7 @@ static int printk_time = 1;
436#else 440#else
437static int printk_time = 0; 441static int printk_time = 0;
438#endif 442#endif
443module_param(printk_time, int, S_IRUGO | S_IWUSR);
439 444
440static int __init printk_time_setup(char *str) 445static int __init printk_time_setup(char *str)
441{ 446{
@@ -452,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
452 return sched_clock(); 457 return sched_clock();
453} 458}
454 459
460/* Check if we have any console registered that can be called early in boot. */
461static int have_callable_console(void)
462{
463 struct console *con;
464
465 for (con = console_drivers; con; con = con->next)
466 if (con->flags & CON_ANYTIME)
467 return 1;
468
469 return 0;
470}
471
455/** 472/**
456 * printk - print a kernel message 473 * printk - print a kernel message
457 * @fmt: format string 474 * @fmt: format string
@@ -565,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args)
565 log_level_unknown = 1; 582 log_level_unknown = 1;
566 } 583 }
567 584
568 if (!cpu_online(smp_processor_id())) { 585 if (!down_trylock(&console_sem)) {
569 /* 586 /*
570 * Some console drivers may assume that per-cpu resources have 587 * We own the drivers. We can drop the spinlock and
571 * been allocated. So don't allow them to be called by this 588 * let release_console_sem() print the text, maybe ...
572 * CPU until it is officially up. We shouldn't be calling into
573 * random console drivers on a CPU which doesn't exist yet..
574 */ 589 */
590 console_locked = 1;
575 printk_cpu = UINT_MAX; 591 printk_cpu = UINT_MAX;
576 spin_unlock_irqrestore(&logbuf_lock, flags); 592 spin_unlock_irqrestore(&logbuf_lock, flags);
577 goto out; 593
578 }
579 if (!down_trylock(&console_sem)) {
580 console_locked = 1;
581 /* 594 /*
582 * We own the drivers. We can drop the spinlock and let 595 * Console drivers may assume that per-cpu resources have
583 * release_console_sem() print the text 596 * been allocated. So unless they're explicitly marked as
597 * being able to cope (CON_ANYTIME) don't call them until
598 * this CPU is officially up.
584 */ 599 */
585 printk_cpu = UINT_MAX; 600 if (cpu_online(smp_processor_id()) || have_callable_console()) {
586 spin_unlock_irqrestore(&logbuf_lock, flags); 601 console_may_schedule = 0;
587 console_may_schedule = 0; 602 release_console_sem();
588 release_console_sem(); 603 } else {
604 /* Release by hand to avoid flushing the buffer. */
605 console_locked = 0;
606 up(&console_sem);
607 }
589 } else { 608 } else {
590 /* 609 /*
591 * Someone else owns the drivers. We drop the spinlock, which 610 * Someone else owns the drivers. We drop the spinlock, which
@@ -595,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 printk_cpu = UINT_MAX; 614 printk_cpu = UINT_MAX;
596 spin_unlock_irqrestore(&logbuf_lock, flags); 615 spin_unlock_irqrestore(&logbuf_lock, flags);
597 } 616 }
598out: 617
599 preempt_enable(); 618 preempt_enable();
600 return printed_len; 619 return printed_len;
601} 620}
@@ -698,6 +717,23 @@ int __init add_preferred_console(char *name, int idx, char *options)
698} 717}
699 718
700/** 719/**
720 * suspend_console - suspend the console subsystem
721 *
722 * This disables printk() while we go into suspend states
723 */
724void suspend_console(void)
725{
726 acquire_console_sem();
727 console_suspended = 1;
728}
729
730void resume_console(void)
731{
732 console_suspended = 0;
733 release_console_sem();
734}
735
736/**
701 * acquire_console_sem - lock the console system for exclusive use. 737 * acquire_console_sem - lock the console system for exclusive use.
702 * 738 *
703 * Acquires a semaphore which guarantees that the caller has 739 * Acquires a semaphore which guarantees that the caller has
@@ -708,6 +744,10 @@ int __init add_preferred_console(char *name, int idx, char *options)
708void acquire_console_sem(void) 744void acquire_console_sem(void)
709{ 745{
710 BUG_ON(in_interrupt()); 746 BUG_ON(in_interrupt());
747 if (console_suspended) {
748 down(&secondary_console_sem);
749 return;
750 }
711 down(&console_sem); 751 down(&console_sem);
712 console_locked = 1; 752 console_locked = 1;
713 console_may_schedule = 1; 753 console_may_schedule = 1;
@@ -750,6 +790,10 @@ void release_console_sem(void)
750 unsigned long _con_start, _log_end; 790 unsigned long _con_start, _log_end;
751 unsigned long wake_klogd = 0; 791 unsigned long wake_klogd = 0;
752 792
793 if (console_suspended) {
794 up(&secondary_console_sem);
795 return;
796 }
753 for ( ; ; ) { 797 for ( ; ; ) {
754 spin_lock_irqsave(&logbuf_lock, flags); 798 spin_lock_irqsave(&logbuf_lock, flags);
755 wake_klogd |= log_start - log_end; 799 wake_klogd |= log_start - log_end;
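
The new pair is consumed in kernel/power/main.c above: suspend_console() takes console_sem and sets console_suspended, so any later printk() flusher calling acquire_console_sem() parks on secondary_console_sem instead of touching half-suspended console hardware, and resume_console() releases console_sem, which also flushes whatever accumulated in the log buffer. A condensed, hedged sketch of the caller-side bracket (enter_low_power_state() is hypothetical and error unwinding is simplified):

	static int example_suspend(void)
	{
		int error;

		suspend_console();	/* printk() output now only buffers */
		error = device_suspend(PMSG_SUSPEND);
		if (!error)
			error = enter_low_power_state();	/* hypothetical */
		device_resume();
		resume_console();	/* flushes the buffered messages */
		return error;
	}
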
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..335c5b932e14 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
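
may_attach() now serves both ptrace_attach() and the /proc permission checks, which is why the !task->mm test moved out of it: /proc-style callers are no longer rejected outright for mm-less tasks. External callers reach the check through a small wrapper; a sketch of its likely shape (treat the exact name and locking as assumptions here):

	/* Nonzero if current may inspect `task`; task_lock() stabilizes
	 * task->mm across the dumpable test inside may_attach(). */
	int ptrace_may_attach(struct task_struct *task)
	{
		int err;

		task_lock(task);
		err = may_attach(task);
		task_unlock(task);
		return !err;
	}
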
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..20e9710fc21c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -612,14 +612,6 @@ void synchronize_rcu(void)
612 wait_for_completion(&rcu.completion); 612 wait_for_completion(&rcu.completion);
613} 613}
614 614
615/*
616 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
617 */
618void synchronize_kernel(void)
619{
620 synchronize_rcu();
621}
622
623module_param(blimit, int, 0); 615module_param(blimit, int, 0);
624module_param(qhimark, int, 0); 616module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0); 617module_param(qlowmark, int, 0);
@@ -627,7 +619,6 @@ module_param(qlowmark, int, 0);
627module_param(rsinterval, int, 0); 619module_param(rsinterval, int, 0);
628#endif 620#endif
629EXPORT_SYMBOL_GPL(rcu_batches_completed); 621EXPORT_SYMBOL_GPL(rcu_batches_completed);
630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ 622EXPORT_SYMBOL_GPL(call_rcu);
631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 623EXPORT_SYMBOL_GPL(call_rcu_bh);
632EXPORT_SYMBOL_GPL(synchronize_rcu); 624EXPORT_SYMBOL_GPL(synchronize_rcu);
633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..a856040c200a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -818,6 +818,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 818 * the target CPU.
819 */ 819 */
820#ifdef CONFIG_SMP 820#ifdef CONFIG_SMP
821
822#ifndef tsk_is_polling
823#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
824#endif
825
821static void resched_task(task_t *p) 826static void resched_task(task_t *p)
822{ 827{
823 int cpu; 828 int cpu;
@@ -833,9 +838,9 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 838 if (cpu == smp_processor_id())
834 return; 839 return;
835 840
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 841 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 842 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 843 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 844 smp_send_reschedule(cpu);
840} 845}
841#else 846#else
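
The #ifndef default preserves the old TIF_POLLING_NRFLAG test while letting an architecture substitute its own notion of "the idle loop is polling NEED_RESCHED". A hypothetical override in an arch header, for a CPU whose idle loop never polls and therefore always needs the IPI:

	/* Hypothetical <asm/thread_info.h> override: never polling, so
	 * resched_task() always falls through to smp_send_reschedule(). */
	#define tsk_is_polling(t)	0
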
@@ -3886,6 +3891,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3886 !capable(CAP_SYS_NICE)) 3891 !capable(CAP_SYS_NICE))
3887 goto out_unlock; 3892 goto out_unlock;
3888 3893
3894 retval = security_task_setscheduler(p, 0, NULL);
3895 if (retval)
3896 goto out_unlock;
3897
3889 cpus_allowed = cpuset_cpus_allowed(p); 3898 cpus_allowed = cpuset_cpus_allowed(p);
3890 cpus_and(new_mask, new_mask, cpus_allowed); 3899 cpus_and(new_mask, new_mask, cpus_allowed);
3891 retval = set_cpus_allowed(p, new_mask); 3900 retval = set_cpus_allowed(p, new_mask);
@@ -3954,7 +3963,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
3954 if (!p) 3963 if (!p)
3955 goto out_unlock; 3964 goto out_unlock;
3956 3965
3957 retval = 0; 3966 retval = security_task_getscheduler(p);
3967 if (retval)
3968 goto out_unlock;
3969
3958 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 3970 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3959 3971
3960out_unlock: 3972out_unlock:
@@ -4046,6 +4058,9 @@ asmlinkage long sys_sched_yield(void)
4046 4058
4047static inline void __cond_resched(void) 4059static inline void __cond_resched(void)
4048{ 4060{
4061#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4062 __might_sleep(__FILE__, __LINE__);
4063#endif
4049 /* 4064 /*
4050 * The BKS might be reacquired before we have dropped 4065 * The BKS might be reacquired before we have dropped
4051 * PREEMPT_ACTIVE, which could trigger a second 4066 * PREEMPT_ACTIVE, which could trigger a second
@@ -4142,7 +4157,7 @@ EXPORT_SYMBOL(yield);
4142 */ 4157 */
4143void __sched io_schedule(void) 4158void __sched io_schedule(void)
4144{ 4159{
4145 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4160 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4146 4161
4147 atomic_inc(&rq->nr_iowait); 4162 atomic_inc(&rq->nr_iowait);
4148 schedule(); 4163 schedule();
@@ -4153,7 +4168,7 @@ EXPORT_SYMBOL(io_schedule);
4153 4168
4154long __sched io_schedule_timeout(long timeout) 4169long __sched io_schedule_timeout(long timeout)
4155{ 4170{
4156 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4171 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4157 long ret; 4172 long ret;
4158 4173
4159 atomic_inc(&rq->nr_iowait); 4174 atomic_inc(&rq->nr_iowait);
@@ -4237,7 +4252,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4237 if (retval) 4252 if (retval)
4238 goto out_unlock; 4253 goto out_unlock;
4239 4254
4240 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4255 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4241 0 : task_timeslice(p), &t); 4256 0 : task_timeslice(p), &t);
4242 read_unlock(&tasklist_lock); 4257 read_unlock(&tasklist_lock);
4243 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4258 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4746,6 +4761,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4746 break; 4761 break;
4747#ifdef CONFIG_HOTPLUG_CPU 4762#ifdef CONFIG_HOTPLUG_CPU
4748 case CPU_UP_CANCELED: 4763 case CPU_UP_CANCELED:
4764 if (!cpu_rq(cpu)->migration_thread)
4765 break;
4749 /* Unbind it from offline cpu so it can run. Fall thru. */ 4766 /* Unbind it from offline cpu so it can run. Fall thru. */
4750 kthread_bind(cpu_rq(cpu)->migration_thread, 4767 kthread_bind(cpu_rq(cpu)->migration_thread,
4751 any_online_cpu(cpu_online_map)); 4768 any_online_cpu(cpu_online_map));

diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..52adf53929f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,12 +23,12 @@
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/signal.h> 25#include <linux/signal.h>
26#include <linux/audit.h>
27#include <linux/capability.h> 26#include <linux/capability.h>
28#include <asm/param.h> 27#include <asm/param.h>
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30#include <asm/unistd.h> 29#include <asm/unistd.h>
31#include <asm/siginfo.h> 30#include <asm/siginfo.h>
31#include "audit.h" /* audit_signal_info() */
32 32
33/* 33/*
34 * SLAB caches for signal bits. 34 * SLAB caches for signal bits.
@@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1531 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1532}
1533 1533
1534static inline int may_ptrace_stop(void)
1535{
1536 if (!likely(current->ptrace & PT_PTRACED))
1537 return 0;
1538
1539 if (unlikely(current->parent == current->real_parent &&
1540 (current->ptrace & PT_ATTACHED)))
1541 return 0;
1542
1543 if (unlikely(current->signal == current->parent->signal) &&
1544 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1545 return 0;
1546
1547 /*
1548 * Are we in the middle of do_coredump?
1549 * If so, and our tracer is also part of the coredump, stopping
1550 * is a deadlock situation, and pointless because our tracer
1551 * is dead, so don't allow us to stop.
1552 * If SIGKILL was already sent before the caller unlocked
1553 * ->siglock we must see ->core_waiters != 0. Otherwise it
1554 * is safe to enter schedule().
1555 */
1556 if (unlikely(current->mm->core_waiters) &&
1557 unlikely(current->mm == current->parent->mm))
1558 return 0;
1559
1560 return 1;
1561}
1562
1534/* 1563/*
1535 * This must be called with current->sighand->siglock held. 1564 * This must be called with current->sighand->siglock held.
1536 * 1565 *
@@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1588 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1589 try_to_freeze();
1561 read_lock(&tasklist_lock); 1590 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1591 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1592 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1593 read_unlock(&tasklist_lock);
1569 schedule(); 1594 schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..9e2f1c6e73d7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb,
470 break; 470 break;
471#ifdef CONFIG_HOTPLUG_CPU 471#ifdef CONFIG_HOTPLUG_CPU
472 case CPU_UP_CANCELED: 472 case CPU_UP_CANCELED:
473 if (!per_cpu(ksoftirqd, hotcpu))
474 break;
473 /* Unbind so it can run. Fall thru. */ 475 /* Unbind so it can run. Fall thru. */
474 kthread_bind(per_cpu(ksoftirqd, hotcpu), 476 kthread_bind(per_cpu(ksoftirqd, hotcpu),
475 any_online_cpu(cpu_online_map)); 477 any_online_cpu(cpu_online_map));
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..b5c3b94e01ce 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; 39 __raw_get_cpu_var(touch_timestamp) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
127 break; 127 break;
128#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
129 case CPU_UP_CANCELED: 129 case CPU_UP_CANCELED:
130 if (!per_cpu(watchdog_task, hotcpu))
131 break;
130 /* Unbind so it can run. Fall thru. */ 132 /* Unbind so it can run. Fall thru. */
131 kthread_bind(per_cpu(watchdog_task, hotcpu), 133 kthread_bind(per_cpu(watchdog_task, hotcpu),
132 any_online_cpu(cpu_online_map)); 134 any_online_cpu(cpu_online_map));
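
The same two-line guard lands in sched.c, softirq.c and softlockup.c above, because CPU_UP_CANCELED can be delivered even when the earlier CPU_UP_PREPARE callback failed before creating the per-cpu thread. The pattern in isolation, with a hypothetical per-cpu task pointer example_task:

	static DEFINE_PER_CPU(struct task_struct *, example_task);

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		int hotcpu = (unsigned long)hcpu;

		switch (action) {
		case CPU_UP_CANCELED:
			if (!per_cpu(example_task, hotcpu))
				break;		/* UP_PREPARE never created it */
			/* Unbind from the dead CPU so it can run; fall thru. */
			kthread_bind(per_cpu(example_task, hotcpu),
				     any_online_cpu(cpu_online_map));
		case CPU_DEAD:
			kthread_stop(per_cpu(example_task, hotcpu));
			per_cpu(example_task, hotcpu) = NULL;
			break;
		}
		return NOTIFY_OK;
	}
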
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..2c0aacc37c55 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/syscalls.h> 6#include <linux/syscalls.h>
7#include <linux/kthread.h>
7#include <asm/atomic.h> 8#include <asm/atomic.h>
8#include <asm/semaphore.h> 9#include <asm/semaphore.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
@@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads;
25static atomic_t stopmachine_thread_ack; 26static atomic_t stopmachine_thread_ack;
26static DECLARE_MUTEX(stopmachine_mutex); 27static DECLARE_MUTEX(stopmachine_mutex);
27 28
28static int stopmachine(void *cpu) 29static int stopmachine(void *unused)
29{ 30{
30 int irqs_disabled = 0; 31 int irqs_disabled = 0;
31 int prepared = 0; 32 int prepared = 0;
32 33
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34
35 /* Ack: we are alive */ 34 /* Ack: we are alive */
36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 35 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack); 36 atomic_inc(&stopmachine_thread_ack);
@@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state)
85 84
86static int stop_machine(void) 85static int stop_machine(void)
87{ 86{
88 int i, ret = 0; 87 int ret = 0;
88 unsigned int i;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 90
91 /* One high-prio thread per cpu. We'll do this one. */ 91 /* One high-prio thread per cpu. We'll do this one. */
@@ -96,11 +96,16 @@ static int stop_machine(void)
96 stopmachine_state = STOPMACHINE_WAIT; 96 stopmachine_state = STOPMACHINE_WAIT;
97 97
98 for_each_online_cpu(i) { 98 for_each_online_cpu(i) {
99 struct task_struct *tsk;
99 if (i == raw_smp_processor_id()) 100 if (i == raw_smp_processor_id())
100 continue; 101 continue;
101 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 102 tsk = kthread_create(stopmachine, NULL, "stopmachine");
102 if (ret < 0) 103 if (IS_ERR(tsk)) {
104 ret = PTR_ERR(tsk);
103 break; 105 break;
106 }
107 kthread_bind(tsk, i);
108 wake_up_process(tsk);
104 stopmachine_num_threads++; 109 stopmachine_num_threads++;
105 } 110 }
106 111
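
The conversion follows the canonical kthread bring-up idiom: create the thread dormant, pin it with kthread_bind() before it has ever run, then wake it; this removes the racy set_cpus_allowed() the thread previously performed on its own behalf. The idiom in isolation (worker_fn is a hypothetical `int worker_fn(void *)` thread function):

	static int start_pinned_worker(unsigned int cpu)
	{
		struct task_struct *tsk;

		tsk = kthread_create(worker_fn, NULL, "worker/%u", cpu);
		if (IS_ERR(tsk))
			return PTR_ERR(tsk);
		kthread_bind(tsk, cpu);		/* legal only while dormant */
		wake_up_process(tsk);		/* worker_fn() now runs on cpu */
		return 0;
	}
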
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..2d5179c67cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -13,7 +13,6 @@
13#include <linux/notifier.h> 13#include <linux/notifier.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/prctl.h> 15#include <linux/prctl.h>
16#include <linux/init.h>
17#include <linux/highuid.h> 16#include <linux/highuid.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -57,6 +56,12 @@
57#ifndef GET_FPEXC_CTL 56#ifndef GET_FPEXC_CTL
58# define GET_FPEXC_CTL(a,b) (-EINVAL) 57# define GET_FPEXC_CTL(a,b) (-EINVAL)
59#endif 58#endif
59#ifndef GET_ENDIAN
60# define GET_ENDIAN(a,b) (-EINVAL)
61#endif
62#ifndef SET_ENDIAN
63# define SET_ENDIAN(a,b) (-EINVAL)
64#endif
60 65
61/* 66/*
62 * this is where the system-wide overflow UID and GID are defined, for 67 * this is where the system-wide overflow UID and GID are defined, for
@@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v) 137 unsigned long val, void *v)
133{ 138{
134 int ret = NOTIFY_DONE; 139 int ret = NOTIFY_DONE;
135 struct notifier_block *nb; 140 struct notifier_block *nb, *next_nb;
136 141
137 nb = rcu_dereference(*nl); 142 nb = rcu_dereference(*nl);
138 while (nb) { 143 while (nb) {
144 next_nb = rcu_dereference(nb->next);
139 ret = nb->notifier_call(nb, val, v); 145 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 146 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break; 147 break;
142 nb = rcu_dereference(nb->next); 148 nb = next_nb;
143 } 149 }
144 return ret; 150 return ret;
145} 151}
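
Fetching next_nb before invoking the callback matters because a notifier may unregister itself from within its own handler; once notifier_call() returns, nb may already be unlinked from the chain, so nb->next can no longer be trusted. The two orderings side by side:

	/* Unsafe: nb may have unregistered itself during the call. */
	ret = nb->notifier_call(nb, val, v);
	nb = rcu_dereference(nb->next);		/* use-after-unlink hazard */

	/* Safe: capture the successor first, then invoke the callback. */
	next_nb = rcu_dereference(nb->next);
	ret = nb->notifier_call(nb, val, v);
	nb = next_nb;
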
@@ -583,7 +589,7 @@ void emergency_restart(void)
583} 589}
584EXPORT_SYMBOL_GPL(emergency_restart); 590EXPORT_SYMBOL_GPL(emergency_restart);
585 591
586void kernel_restart_prepare(char *cmd) 592static void kernel_restart_prepare(char *cmd)
587{ 593{
588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 594 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
589 system_state = SYSTEM_RESTART; 595 system_state = SYSTEM_RESTART;
@@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
617 * Move into place and start executing a preloaded standalone 623 * Move into place and start executing a preloaded standalone
618 * executable. If nothing was preloaded return an error. 624 * executable. If nothing was preloaded return an error.
619 */ 625 */
620void kernel_kexec(void) 626static void kernel_kexec(void)
621{ 627{
622#ifdef CONFIG_KEXEC 628#ifdef CONFIG_KEXEC
623 struct kimage *image; 629 struct kimage *image;
@@ -631,7 +637,6 @@ void kernel_kexec(void)
631 machine_kexec(image); 637 machine_kexec(image);
632#endif 638#endif
633} 639}
634EXPORT_SYMBOL_GPL(kernel_kexec);
635 640
636void kernel_shutdown_prepare(enum system_states state) 641void kernel_shutdown_prepare(enum system_states state)
637{ 642{
@@ -1860,23 +1865,20 @@ out:
1860 * fields when reaping, so a sample either gets all the additions of a 1865 * fields when reaping, so a sample either gets all the additions of a
1861 * given child after it's reaped, or none so this sample is before reaping. 1866 * given child after it's reaped, or none so this sample is before reaping.
1862 * 1867 *
1863 * tasklist_lock locking optimisation: 1868 * Locking:
1864 * If we are current and single threaded, we do not need to take the tasklist 1869 * We need to take the siglock for CHILDREN, SELF and BOTH
1865 * lock or the siglock. No one else can take our signal_struct away, 1870 * for the cases current multithreaded, non-current single threaded,
1866 * no one else can reap the children to update signal->c* counters, and 1871 * non-current multithreaded. Thread traversal is now safe with
1867 * no one else can race with the signal-> fields. 1872 * the siglock held.
1868 * If we do not take the tasklist_lock, the signal-> fields could be read 1873 * Strictly speaking, we do not need to take the siglock if we are current and
1869 * out of order while another thread was just exiting. So we place a 1874 * single threaded, as no one else can take our signal_struct away, no one
1870 * read memory barrier when we avoid the lock. On the writer side, 1875 * else can reap the children to update signal->c* counters, and no one else
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases 1876 * can race with the signal-> fields. If we do not take any lock, the
1872 * the siglock spinlock after updating the signal-> fields. 1877 * signal-> fields could be read out of order while another thread was just
1873 * 1878 * exiting. So we should place a read memory barrier when we avoid the lock.
1874 * We don't really need the siglock when we access the non c* fields 1879 * On the writer side, write memory barrier is implied in __exit_signal
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded 1880 * as __exit_signal releases the siglock spinlock after updating the signal->
1876 * case, since we take the tasklist lock for read and the non c* signal-> 1881 * fields. But we don't do this yet to keep things simple.
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 * 1882 *
1881 */ 1883 */
1882 1884
@@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1885 struct task_struct *t; 1887 struct task_struct *t;
1886 unsigned long flags; 1888 unsigned long flags;
1887 cputime_t utime, stime; 1889 cputime_t utime, stime;
1888 int need_lock = 0;
1889 1890
1890 memset((char *) r, 0, sizeof *r); 1891 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero; 1892 utime = stime = cputime_zero;
1892 1893
1893 if (p != current || !thread_group_empty(p)) 1894 rcu_read_lock();
1894 need_lock = 1; 1895 if (!lock_task_sighand(p, &flags)) {
1895 1896 rcu_read_unlock();
1896 if (need_lock) { 1897 return;
1897 read_lock(&tasklist_lock); 1898 }
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1905 1899
1906 switch (who) { 1900 switch (who) {
1907 case RUSAGE_BOTH: 1901 case RUSAGE_BOTH:
1908 case RUSAGE_CHILDREN: 1902 case RUSAGE_CHILDREN:
1909 spin_lock_irqsave(&p->sighand->siglock, flags);
1910 utime = p->signal->cutime; 1903 utime = p->signal->cutime;
1911 stime = p->signal->cstime; 1904 stime = p->signal->cstime;
1912 r->ru_nvcsw = p->signal->cnvcsw; 1905 r->ru_nvcsw = p->signal->cnvcsw;
1913 r->ru_nivcsw = p->signal->cnivcsw; 1906 r->ru_nivcsw = p->signal->cnivcsw;
1914 r->ru_minflt = p->signal->cmin_flt; 1907 r->ru_minflt = p->signal->cmin_flt;
1915 r->ru_majflt = p->signal->cmaj_flt; 1908 r->ru_majflt = p->signal->cmaj_flt;
1916 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1917 1909
1918 if (who == RUSAGE_CHILDREN) 1910 if (who == RUSAGE_CHILDREN)
1919 break; 1911 break;
@@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1941 BUG(); 1933 BUG();
1942 } 1934 }
1943 1935
1944 if (need_lock) 1936 unlock_task_sighand(p, &flags);
1945 read_unlock(&tasklist_lock); 1937 rcu_read_unlock();
1938
1946 cputime_to_timeval(utime, &r->ru_utime); 1939 cputime_to_timeval(utime, &r->ru_utime);
1947 cputime_to_timeval(stime, &r->ru_stime); 1940 cputime_to_timeval(stime, &r->ru_stime);
1948} 1941}
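
lock_task_sighand() (paired with unlock_task_sighand()) takes ->siglock only if the task still has a valid sighand, failing cleanly for an exited task, which is what lets the tasklist_lock/need_lock dance above disappear; the surrounding rcu_read_lock() keeps the sighand_struct itself alive across the test. The resulting skeleton, reduced from k_getrusage():

	unsigned long flags;

	rcu_read_lock();
	if (lock_task_sighand(p, &flags)) {	/* NULL if p already exited */
		/* ... read signal->c* counters, walk the threads ... */
		unlock_task_sighand(p, &flags);
	}
	rcu_read_unlock();
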
@@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2057 return -EFAULT; 2050 return -EFAULT;
2058 return 0; 2051 return 0;
2059 } 2052 }
2053 case PR_GET_ENDIAN:
2054 error = GET_ENDIAN(current, arg2);
2055 break;
2056 case PR_SET_ENDIAN:
2057 error = SET_ENDIAN(current, arg2);
2058 break;
2059
2060 default: 2060 default:
2061 error = -EINVAL; 2061 error = -EINVAL;
2062 break; 2062 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
87cond_syscall(sys_inotify_add_watch); 87cond_syscall(sys_inotify_add_watch);
88cond_syscall(sys_inotify_rm_watch); 88cond_syscall(sys_inotify_rm_watch);
89cond_syscall(sys_migrate_pages); 89cond_syscall(sys_migrate_pages);
90cond_syscall(sys_move_pages);
90cond_syscall(sys_chown16); 91cond_syscall(sys_chown16);
91cond_syscall(sys_fchown16); 92cond_syscall(sys_fchown16);
92cond_syscall(sys_getegid16); 93cond_syscall(sys_getegid16);
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore);
132cond_syscall(sys_madvise); 133cond_syscall(sys_madvise);
133cond_syscall(sys_mremap); 134cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages); 135cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..f1a4eb1a655e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
59extern int C_A_D; 59extern int C_A_D;
60extern int sysctl_overcommit_memory; 60extern int sysctl_overcommit_memory;
61extern int sysctl_overcommit_ratio; 61extern int sysctl_overcommit_ratio;
62extern int sysctl_panic_on_oom;
62extern int max_threads; 63extern int max_threads;
63extern int sysrq_enabled; 64extern int sysrq_enabled;
64extern int core_uses_pid; 65extern int core_uses_pid;
@@ -72,6 +73,7 @@ extern int printk_ratelimit_burst;
72extern int pid_max_min, pid_max_max; 73extern int pid_max_min, pid_max_max;
73extern int sysctl_drop_caches; 74extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 75extern int percpu_pagelist_fraction;
76extern int compat_log;
75 77
76#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 78#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
77int unknown_nmi_panic; 79int unknown_nmi_panic;
@@ -142,7 +144,6 @@ static struct ctl_table_header root_table_header =
142 144
143static ctl_table kern_table[]; 145static ctl_table kern_table[];
144static ctl_table vm_table[]; 146static ctl_table vm_table[];
145static ctl_table proc_table[];
146static ctl_table fs_table[]; 147static ctl_table fs_table[];
147static ctl_table debug_table[]; 148static ctl_table debug_table[];
148static ctl_table dev_table[]; 149static ctl_table dev_table[];
@@ -150,7 +151,7 @@ extern ctl_table random_table[];
150#ifdef CONFIG_UNIX98_PTYS 151#ifdef CONFIG_UNIX98_PTYS
151extern ctl_table pty_table[]; 152extern ctl_table pty_table[];
152#endif 153#endif
153#ifdef CONFIG_INOTIFY 154#ifdef CONFIG_INOTIFY_USER
154extern ctl_table inotify_table[]; 155extern ctl_table inotify_table[];
155#endif 156#endif
156 157
@@ -202,12 +203,6 @@ static ctl_table root_table[] = {
202 }, 203 },
203#endif 204#endif
204 { 205 {
205 .ctl_name = CTL_PROC,
206 .procname = "proc",
207 .mode = 0555,
208 .child = proc_table,
209 },
210 {
211 .ctl_name = CTL_FS, 206 .ctl_name = CTL_FS,
212 .procname = "fs", 207 .procname = "fs",
213 .mode = 0555, 208 .mode = 0555,
@@ -398,7 +393,7 @@ static ctl_table kern_table[] = {
398 .strategy = &sysctl_string, 393 .strategy = &sysctl_string,
399 }, 394 },
400#endif 395#endif
401#ifdef CONFIG_HOTPLUG 396#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
402 { 397 {
403 .ctl_name = KERN_HOTPLUG, 398 .ctl_name = KERN_HOTPLUG,
404 .procname = "hotplug", 399 .procname = "hotplug",
@@ -683,6 +678,16 @@ static ctl_table kern_table[] = {
683 .proc_handler = &proc_dointvec, 678 .proc_handler = &proc_dointvec,
684 }, 679 },
685#endif 680#endif
681#ifdef CONFIG_COMPAT
682 {
683 .ctl_name = KERN_COMPAT_LOG,
684 .procname = "compat-log",
685 .data = &compat_log,
686 .maxlen = sizeof (int),
687 .mode = 0644,
688 .proc_handler = &proc_dointvec,
689 },
690#endif
686 { .ctl_name = 0 } 691 { .ctl_name = 0 }
687}; 692};
688 693
@@ -702,6 +707,14 @@ static ctl_table vm_table[] = {
702 .proc_handler = &proc_dointvec, 707 .proc_handler = &proc_dointvec,
703 }, 708 },
704 { 709 {
710 .ctl_name = VM_PANIC_ON_OOM,
711 .procname = "panic_on_oom",
712 .data = &sysctl_panic_on_oom,
713 .maxlen = sizeof(sysctl_panic_on_oom),
714 .mode = 0644,
715 .proc_handler = &proc_dointvec,
716 },
717 {
705 .ctl_name = VM_OVERCOMMIT_RATIO, 718 .ctl_name = VM_OVERCOMMIT_RATIO,
706 .procname = "overcommit_ratio", 719 .procname = "overcommit_ratio",
707 .data = &sysctl_overcommit_ratio, 720 .data = &sysctl_overcommit_ratio,
@@ -918,10 +931,6 @@ static ctl_table vm_table[] = {
918 { .ctl_name = 0 } 931 { .ctl_name = 0 }
919}; 932};
920 933
921static ctl_table proc_table[] = {
922 { .ctl_name = 0 }
923};
924
925static ctl_table fs_table[] = { 934static ctl_table fs_table[] = {
926 { 935 {
927 .ctl_name = FS_NRINODE, 936 .ctl_name = FS_NRINODE,
@@ -1028,7 +1037,7 @@ static ctl_table fs_table[] = {
1028 .mode = 0644, 1037 .mode = 0644,
1029 .proc_handler = &proc_doulongvec_minmax, 1038 .proc_handler = &proc_doulongvec_minmax,
1030 }, 1039 },
1031#ifdef CONFIG_INOTIFY 1040#ifdef CONFIG_INOTIFY_USER
1032 { 1041 {
1033 .ctl_name = FS_INOTIFY, 1042 .ctl_name = FS_INOTIFY,
1034 .procname = "inotify", 1043 .procname = "inotify",
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
 146 * @c: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
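As a usage illustration (a hedged sketch, not part of this patch), a hypothetical timer driver fills in a struct clocksource and hands it to clocksource_register(). The field layout mirrors the jiffies clocksource added later in this series; the device name, rating, counter frequency and read callback are invented, and clocksource_hz2mult() is assumed to be the helper from clocksource.h:

static cycle_t mytimer_read(void)
{
	return (cycle_t)mytimer_read_counter();	/* hypothetical hardware read */
}

static struct clocksource clocksource_mytimer = {
	.name		= "mytimer",	/* listed in sysfs, matched by clocksource= */
	.rating		= 200,		/* above jiffies (0), below a good TSC */
	.read		= mytimer_read,
	.mask		= 0xffffffff,	/* 32-bit free-running counter */
	.shift		= 20,
	.is_continuous	= 1,
};

static int __init mytimer_clocksource_init(void)
{
	/* cycles -> ns multiplier for a hypothetical 10 MHz counter */
	clocksource_mytimer.mult = clocksource_hz2mult(10000000, 20);
	return clocksource_register(&clocksource_mytimer);
}
module_init(mytimer_clocksource_init);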
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with clocksource list
193 *
194 * Provides sysfs interface for listing current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
 217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
 227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
 248 * sysfs_show_available_clocksources - sysfs interface for listing clocksources
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
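Given the sysdev class "clocksource" and device id 0 registered above, the two attributes should surface under /sys/devices/system/clocksource/clocksource0/ (the path is inferred from the code, not stated in the patch). A small user-space sketch exercising them:

#include <stdio.h>

#define SYSFS_DIR "/sys/devices/system/clocksource/clocksource0/"

int main(void)
{
	char buf[64];
	FILE *f = fopen(SYSFS_DIR "available_clocksource", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("available: %s", buf);
	if (f)
		fclose(f);

	/* request an override; applied via select_clocksource() */
	f = fopen(SYSFS_DIR "current_clocksource", "w");
	if (f) {
		fputs("jiffies\n", f);	/* any registered name */
		fclose(f);
	}
	return 0;
}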
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
 32 * interrupts and the inability of the timer
 33 * interrupt hardware to accurately tick at the
 34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
 43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
 48 * HZ shrinks, so values greater than 8 overflow 32 bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
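A standalone arithmetic check of the overflow bound claimed above, approximating ACTHZ as HZ in 8-bit fixed point: at HZ=100 one jiffy is 10,000,000 ns, so shift 8 yields a mult of 2,560,000,000 (fits in 32 bits) while shift 9 would need 5,120,000,000 and overflow:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t nsec_per_sec = 1000000000ULL;
	uint64_t acthz = 100ULL << 8;		/* HZ=100 in 8-bit fixed point */
	uint32_t nsec_per_jiffy = (nsec_per_sec << 8) / acthz;	/* 10,000,000 */

	printf("shift 8: %llu\n", (unsigned long long)nsec_per_jiffy << 8); /* 2560000000, fits u32 */
	printf("shift 9: %llu\n", (unsigned long long)nsec_per_jiffy << 9); /* 5120000000, > 2^32-1 */
	return 0;
}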
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
 60 .rating = 0, /* lowest rating */
 61 .read = jiffies_read,
 62 .mask = 0xffffffff, /* 32 bits */
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468c..5bb6b7976eec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
146void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
147{ 147{
148 timer->entry.next = NULL; 148 timer->entry.next = NULL;
149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); 149 timer->base = __raw_get_cpu_var(tvec_bases);
150} 150}
151EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
152 152
@@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync);
383static int cascade(tvec_base_t *base, tvec_t *tv, int index) 383static int cascade(tvec_base_t *base, tvec_t *tv, int index)
384{ 384{
385 /* cascade all the timers from tv up one level */ 385 /* cascade all the timers from tv up one level */
386 struct list_head *head, *curr; 386 struct timer_list *timer, *tmp;
387 struct list_head tv_list;
388
389 list_replace_init(tv->vec + index, &tv_list);
387 390
388 head = tv->vec + index;
389 curr = head->next;
390 /* 391 /*
391 * We are removing _all_ timers from the list, so we don't have to 392 * We are removing _all_ timers from the list, so we
392 * detach them individually, just clear the list afterwards. 393 * don't have to detach them individually.
393 */ 394 */
394 while (curr != head) { 395 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
395 struct timer_list *tmp; 396 BUG_ON(timer->base != base);
396 397 internal_add_timer(base, timer);
397 tmp = list_entry(curr, struct timer_list, entry);
398 BUG_ON(tmp->base != base);
399 curr = curr->next;
400 internal_add_timer(base, tmp);
401 } 398 }
402 INIT_LIST_HEAD(head);
403 399
404 return index; 400 return index;
405} 401}
@@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base)
419 415
420 spin_lock_irq(&base->lock); 416 spin_lock_irq(&base->lock);
421 while (time_after_eq(jiffies, base->timer_jiffies)) { 417 while (time_after_eq(jiffies, base->timer_jiffies)) {
422 struct list_head work_list = LIST_HEAD_INIT(work_list); 418 struct list_head work_list;
423 struct list_head *head = &work_list; 419 struct list_head *head = &work_list;
424 int index = base->timer_jiffies & TVR_MASK; 420 int index = base->timer_jiffies & TVR_MASK;
425 421
426 /* 422 /*
427 * Cascade timers: 423 * Cascade timers:
428 */ 424 */
@@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base)
431 (!cascade(base, &base->tv3, INDEX(1))) && 427 (!cascade(base, &base->tv3, INDEX(1))) &&
432 !cascade(base, &base->tv4, INDEX(2))) 428 !cascade(base, &base->tv4, INDEX(2)))
433 cascade(base, &base->tv5, INDEX(3)); 429 cascade(base, &base->tv5, INDEX(3));
434 ++base->timer_jiffies; 430 ++base->timer_jiffies;
435 list_splice_init(base->tv1.vec + index, &work_list); 431 list_replace_init(base->tv1.vec + index, &work_list);
436 while (!list_empty(head)) { 432 while (!list_empty(head)) {
437 void (*fn)(unsigned long); 433 void (*fn)(unsigned long);
438 unsigned long data; 434 unsigned long data;
@@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
601long time_precision = 1; /* clock precision (us) */ 597long time_precision = 1; /* clock precision (us) */
602long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
603long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
604static long time_phase; /* phase offset (scaled us) */
605long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 600long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
606 /* frequency offset (scaled ppm)*/ 601 /* frequency offset (scaled ppm)*/
607static long time_adj; /* tick adjust (scaled 1 / HZ) */ 602static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -751,27 +746,14 @@ static long adjtime_adjustment(void)
751} 746}
752 747
753/* in the NTP reference this is called "hardclock()" */ 748/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void) 749static void update_ntp_one_tick(void)
755{ 750{
756 long time_adjust_step, delta_nsec; 751 long time_adjust_step;
757 752
758 time_adjust_step = adjtime_adjustment(); 753 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step) 754 if (time_adjust_step)
760 /* Reduce by this step the amount of time left */ 755 /* Reduce by this step the amount of time left */
761 time_adjust -= time_adjust_step; 756 time_adjust -= time_adjust_step;
762 delta_nsec = tick_nsec + time_adjust_step * 1000;
763 /*
764 * Advance the phase, once it gets to one microsecond, then
765 * advance the tick more.
766 */
767 time_phase += time_adj;
768 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
769 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
770 time_phase -= ltemp << (SHIFT_SCALE - 10);
771 delta_nsec += ltemp;
772 }
773 xtime.tv_nsec += delta_nsec;
774 time_interpolator_update(delta_nsec);
775 757
776 /* Changes by adjtime() do not take effect till next tick. */ 758 /* Changes by adjtime() do not take effect till next tick. */
777 if (time_next_adjust != 0) { 759 if (time_next_adjust != 0) {
@@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void)
784 * Return how long ticks are at the moment, that is, how much time 766 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it 767 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime). 768 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 769 * The return value is in fixed-point nanoseconds shifted by the
788 * bits to the right of the binary point. 770 * specified number of bits to the right of the binary point.
789 * This function has no side-effects. 771 * This function has no side-effects.
790 */ 772 */
791u64 current_tick_length(void) 773u64 current_tick_length(void)
792{ 774{
793 long delta_nsec; 775 long delta_nsec;
776 u64 ret;
794 777
 778 /* calculate the finest interval NTP will allow,
 779 * i.e. a nanosecond value shifted by (SHIFT_SCALE - 10)
780 */
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 781 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 782 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
783 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
784
785 return ret;
797} 786}
798 787
799/* 788/* XXX - all of this timekeeping code should be later moved to time.c */
800 * Using a loop looks inefficient, but "ticks" is 789#include <linux/clocksource.h>
801 * usually just one (we shouldn't be losing ticks, 790static struct clocksource *clock; /* pointer to current clocksource */
802 * we're doing this this way mainly for interrupt 791
803 * latency reasons, not because we think we'll 792#ifdef CONFIG_GENERIC_TIME
804 * have lots of lost timer ticks 793/**
 794 * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time
 795 *
 796 * Private function, must hold xtime_lock when being
 797 * called. Returns the number of nanoseconds since the
 798 * last call to update_wall_time() (adjusted by NTP scaling)
799 */
800static inline s64 __get_nsec_offset(void)
801{
802 cycle_t cycle_now, cycle_delta;
803 s64 ns_offset;
804
805 /* read clocksource: */
806 cycle_now = clocksource_read(clock);
807
808 /* calculate the delta since the last update_wall_time: */
809 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
810
811 /* convert to nanoseconds: */
812 ns_offset = cyc2ns(clock, cycle_delta);
813
814 return ns_offset;
815}
816
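cyc2ns() is provided by clocksource.h; a minimal sketch of the conversion it is assumed to perform, the usual (cycles * mult) >> shift fixed-point form:

/* sketch of the cycles->nanoseconds conversion used above, assuming
 * the (cycles * mult) >> shift fixed-point form from clocksource.h */
static inline s64 sketch_cyc2ns(u32 mult, u32 shift, cycle_t cycle_delta)
{
	return ((s64)cycle_delta * mult) >> shift;
}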
817/**
818 * __get_realtime_clock_ts - Returns the time of day in a timespec
819 * @ts: pointer to the timespec to be set
820 *
821 * Returns the time of day in a timespec. Used by
 822 * do_gettimeofday() and getnstimeofday().
823 */
824static inline void __get_realtime_clock_ts(struct timespec *ts)
825{
826 unsigned long seq;
827 s64 nsecs;
828
829 do {
830 seq = read_seqbegin(&xtime_lock);
831
832 *ts = xtime;
833 nsecs = __get_nsec_offset();
834
835 } while (read_seqretry(&xtime_lock, seq));
836
837 timespec_add_ns(ts, nsecs);
838}
839
840/**
841 * getnstimeofday - Returns the time of day in a timespec
842 * @ts: pointer to the timespec to be set
843 *
844 * Returns the time of day in a timespec.
845 */
846void getnstimeofday(struct timespec *ts)
847{
848 __get_realtime_clock_ts(ts);
849}
850
851EXPORT_SYMBOL(getnstimeofday);
852
853/**
854 * do_gettimeofday - Returns the time of day in a timeval
855 * @tv: pointer to the timeval to be set
856 *
 857 * NOTE: Users should be converted to using getnstimeofday()
858 */
859void do_gettimeofday(struct timeval *tv)
860{
861 struct timespec now;
862
863 __get_realtime_clock_ts(&now);
864 tv->tv_sec = now.tv_sec;
865 tv->tv_usec = now.tv_nsec/1000;
866}
867
868EXPORT_SYMBOL(do_gettimeofday);
869/**
870 * do_settimeofday - Sets the time of day
871 * @tv: pointer to the timespec variable containing the new time
872 *
 873 * Sets the time of day to the new time, updates NTP and notifies hrtimers
874 */
875int do_settimeofday(struct timespec *tv)
876{
877 unsigned long flags;
878 time_t wtm_sec, sec = tv->tv_sec;
879 long wtm_nsec, nsec = tv->tv_nsec;
880
881 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
882 return -EINVAL;
883
884 write_seqlock_irqsave(&xtime_lock, flags);
885
886 nsec -= __get_nsec_offset();
887
888 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
889 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
890
891 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893
894 ntp_clear();
895
896 write_sequnlock_irqrestore(&xtime_lock, flags);
897
898 /* signal hrtimers about time change */
899 clock_was_set();
900
901 return 0;
902}
903
904EXPORT_SYMBOL(do_settimeofday);
905
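The opposite-signed updates to wall_to_monotonic above keep the monotonic clock from jumping when the wall clock is set: the sum xtime + wall_to_monotonic is held constant. A sketch of the invariant (not part of the patch):

/* invariant preserved by do_settimeofday(), a sketch: setting the wall
 * clock forward by D seconds moves wall_to_monotonic back by D, so
 * monotonic = xtime + wall_to_monotonic is unchanged */
static void sketch_monotonic(struct timespec *mono)
{
	set_normalized_timespec(mono,
			xtime.tv_sec + wall_to_monotonic.tv_sec,
			xtime.tv_nsec + wall_to_monotonic.tv_nsec);
}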
906/**
907 * change_clocksource - Swaps clocksources if a new one is available
908 *
909 * Accumulates current time interval and initializes new clocksource
910 */
911static int change_clocksource(void)
912{
913 struct clocksource *new;
914 cycle_t now;
915 u64 nsec;
916 new = clocksource_get_next();
917 if (clock != new) {
918 now = clocksource_read(new);
919 nsec = __get_nsec_offset();
920 timespec_add_ns(&xtime, nsec);
921
922 clock = new;
923 clock->cycle_last = now;
924 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
925 clock->name);
926 return 1;
927 } else if (clock->update_callback) {
928 return clock->update_callback();
929 }
930 return 0;
931}
932#else
933#define change_clocksource() (0)
934#endif
935
936/**
 937 * timekeeping_is_continuous - check to see if timekeeping is free running
805 */ 938 */
806static void update_wall_time(unsigned long ticks) 939int timekeeping_is_continuous(void)
807{ 940{
941 unsigned long seq;
942 int ret;
943
808 do { 944 do {
809 ticks--; 945 seq = read_seqbegin(&xtime_lock);
810 update_wall_time_one_tick(); 946
811 if (xtime.tv_nsec >= 1000000000) { 947 ret = clock->is_continuous;
812 xtime.tv_nsec -= 1000000000; 948
949 } while (read_seqretry(&xtime_lock, seq));
950
951 return ret;
952}
953
954/*
955 * timekeeping_init - Initializes the clocksource and common timekeeping values
956 */
957void __init timekeeping_init(void)
958{
959 unsigned long flags;
960
961 write_seqlock_irqsave(&xtime_lock, flags);
962 clock = clocksource_get_next();
963 clocksource_calculate_interval(clock, tick_nsec);
964 clock->cycle_last = clocksource_read(clock);
965 ntp_clear();
966 write_sequnlock_irqrestore(&xtime_lock, flags);
967}
968
969
970/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused
973 *
974 * This is for the generic clocksource timekeeping.
975 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
976 * still managed by arch specific suspend/resume code.
977 */
978static int timekeeping_resume(struct sys_device *dev)
979{
980 unsigned long flags;
981
982 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock);
985 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0;
987}
988
989/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume,
992 set_kset_name("timekeeping"),
993};
994
995static struct sys_device device_timer = {
996 .id = 0,
997 .cls = &timekeeping_sysclass,
998};
999
1000static int __init timekeeping_init_device(void)
1001{
1002 int error = sysdev_class_register(&timekeeping_sysclass);
1003 if (!error)
1004 error = sysdev_register(&device_timer);
1005 return error;
1006}
1007
1008device_initcall(timekeeping_init_device);
1009
1010/*
 1011 * If the error is already larger than one adjustment step, we look
 1012 * ahead another tick, to compensate for late or lost adjustments.
1013 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
1015{
1016 int adj;
1017
1018 /*
1019 * As soon as the machine is synchronized to the external time
1020 * source this should be the common case.
1021 */
1022 error >>= 2;
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval))
1024 return sign;
1025
1026 /*
1027 * An extra look ahead dampens the effect of the current error,
 1028 * which can grow quite large with continuously late updates, as
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1;
1034
1035 adj = 0;
1036 while (1) {
1037 error >>= 1;
1038 if (sign > 0 ? error <= *interval : error >= *interval)
1039 break;
1040 adj++;
1041 }
1042
1043 /*
1044 * Add the current adjustments to the error and take the offset
 1045 * into account; the latter can leave the error barely
 1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * which otherwise had to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053
1054 *interval <<= adj;
1055 *offset <<= adj;
1056 return sign << adj;
1057}
1058
1059/*
1060 * Adjust the multiplier to reduce the error value,
1061 * this is optimized for the most common adjustments of -1,0,1,
1062 * for other values we can do a bit more work.
1063 */
1064static void clocksource_adjust(struct clocksource *clock, s64 offset)
1065{
1066 s64 error, interval = clock->cycle_interval;
1067 int adj;
1068
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset);
1072 } else if (error < -interval) {
1073 interval = -interval;
1074 offset = -offset;
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset);
1076 } else
1077 return;
1078
1079 clock->mult += adj;
1080 clock->xtime_interval += interval;
1081 clock->xtime_nsec -= offset;
1082 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1083}
1084
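Why a unit change to mult is paired with a cycle_interval-sized change to xtime_interval: xtime_interval is cycle_interval scaled by mult (in shifted nanoseconds), so bumping mult by adj adds exactly adj * cycle_interval per accumulated interval. A sketch of the relation, assuming the clocksource_calculate_interval() setup:

/* relation behind clocksource_adjust(), a sketch: per accumulated
 * interval, xtime_interval == cycle_interval * mult (in ns << shift),
 * so mult += adj implies xtime_interval += adj * cycle_interval */
static u64 sketch_xtime_interval(u64 cycle_interval, u32 mult)
{
	return cycle_interval * mult;
}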
1085/*
1086 * update_wall_time - Uses the current clocksource to increment the wall time
1087 *
1088 * Called from the timer interrupt, must hold a write on xtime_lock.
1089 */
1090static void update_wall_time(void)
1091{
1092 cycle_t offset;
1093
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1095
1096#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else
1099 offset = clock->cycle_interval;
1100#endif
1101
1102 /* normally this loop will run just once, however in the
1103 * case of lost or late ticks, it will accumulate correctly.
1104 */
1105 while (offset >= clock->cycle_interval) {
1106 /* accumulate one interval */
1107 clock->xtime_nsec += clock->xtime_interval;
1108 clock->cycle_last += clock->cycle_interval;
1109 offset -= clock->cycle_interval;
1110
1111 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1112 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
813 xtime.tv_sec++; 1113 xtime.tv_sec++;
814 second_overflow(); 1114 second_overflow();
815 } 1115 }
816 } while (ticks); 1116
1117 /* interpolator bits */
1118 time_interpolator_update(clock->xtime_interval
1119 >> clock->shift);
1120 /* increment the NTP state machine */
1121 update_ntp_one_tick();
1122
1123 /* accumulate error between NTP and clock interval */
1124 clock->error += current_tick_length();
1125 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1126 }
1127
1128 /* correct the clock when NTP error is too big */
1129 clocksource_adjust(clock, offset);
1130
1131 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134
1135 /* check to see if there is a new clocksource to use */
1136 if (change_clocksource()) {
1137 clock->error = 0;
1138 clock->xtime_nsec = 0;
1139 clocksource_calculate_interval(clock, tick_nsec);
1140 }
817} 1141}
818 1142
819/* 1143/*
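To see the accumulation loop absorb a late tick, a standalone sketch with invented round numbers: with cycle_interval = 1000 cycles and offset = 2000 (two elapsed intervals), the loop runs twice, advancing cycle_last and xtime_nsec by two intervals and leaving offset < cycle_interval for clocksource_adjust():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* invented values: 1000 cycles per tick, 10 ms per tick at shift 8 */
	uint64_t cycle_interval = 1000, xtime_interval = 10000000ULL << 8;
	uint64_t offset = 2000, cycle_last = 0, xtime_nsec = 0;

	while (offset >= cycle_interval) {	/* runs twice: one on-time, one late tick */
		xtime_nsec += xtime_interval;
		cycle_last += cycle_interval;
		offset -= cycle_interval;
	}
	printf("cycle_last=%llu offset=%llu xtime_nsec=%llu\n",
	       (unsigned long long)cycle_last, (unsigned long long)offset,
	       (unsigned long long)xtime_nsec);	/* 2000, 0, two intervals */
	return 0;
}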
@@ -919,10 +1243,8 @@ static inline void update_times(void)
919 unsigned long ticks; 1243 unsigned long ticks;
920 1244
921 ticks = jiffies - wall_jiffies; 1245 ticks = jiffies - wall_jiffies;
922 if (ticks) { 1246 wall_jiffies += ticks;
923 wall_jiffies += ticks; 1247 update_wall_time();
924 update_wall_time(ticks);
925 }
926 calc_load(ticks); 1248 calc_load(ticks);
927} 1249}
928 1250
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..f69c804c8e62
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,918 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
96static struct unwind_table {
97 struct {
98 unsigned long pc;
99 unsigned long range;
100 } core, init;
101 const void *address;
102 unsigned long size;
103 struct unwind_table *link;
104 const char *name;
105} root_table, *last_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module,
179 const void *table_start,
180 unsigned long table_size)
181{
182 struct unwind_table *table;
183
184 if (table_size <= 0)
185 return NULL;
186
187 table = kmalloc(sizeof(*table), GFP_KERNEL);
188 if (!table)
189 return NULL;
190
191 init_unwind_table(table, module->name,
192 module->module_core, module->core_size,
193 module->module_init, module->init_size,
194 table_start, table_size);
195
196 if (last_table)
197 last_table->link = table;
198 else
199 root_table.link = table;
200 last_table = table;
201
202 return table;
203}
204
205struct unlink_table_info
206{
207 struct unwind_table *table;
208 int init_only;
209};
210
211static int unlink_table(void *arg)
212{
213 struct unlink_table_info *info = arg;
214 struct unwind_table *table = info->table, *prev;
215
216 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
217 ;
218
219 if (prev->link) {
220 if (info->init_only) {
221 table->init.pc = 0;
222 table->init.range = 0;
223 info->table = NULL;
224 } else {
225 prev->link = table->link;
226 if (!prev->link)
227 last_table = prev;
228 }
229 } else
230 info->table = NULL;
231
232 return 0;
233}
234
235/* Must be called with module_mutex held. */
236void unwind_remove_table(void *handle, int init_only)
237{
238 struct unwind_table *table = handle;
239 struct unlink_table_info info;
240
241 if (!table || table == &root_table)
242 return;
243
244 if (init_only && table == last_table) {
245 table->init.pc = 0;
246 table->init.range = 0;
247 return;
248 }
249
250 info.table = table;
251 info.init_only = init_only;
252 stop_machine_run(unlink_table, &info, NR_CPUS);
253
254 if (info.table)
255 kfree(table);
256}
257
258#endif /* CONFIG_MODULES */
259
260static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
261{
262 const u8 *cur = *pcur;
263 uleb128_t value;
264 unsigned shift;
265
266 for (shift = 0, value = 0; cur < end; shift += 7) {
267 if (shift + 7 > 8 * sizeof(value)
268 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
269 cur = end + 1;
270 break;
271 }
272 value |= (uleb128_t)(*cur & 0x7f) << shift;
273 if (!(*cur++ & 0x80))
274 break;
275 }
276 *pcur = cur;
277
278 return value;
279}
280
281static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
282{
283 const u8 *cur = *pcur;
284 sleb128_t value;
285 unsigned shift;
286
287 for (shift = 0, value = 0; cur < end; shift += 7) {
288 if (shift + 7 > 8 * sizeof(value)
289 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
290 cur = end + 1;
291 break;
292 }
293 value |= (sleb128_t)(*cur & 0x7f) << shift;
294 if (!(*cur & 0x80)) {
295 value |= -(*cur++ & 0x40) << shift;
296 break;
297 }
298 }
299 *pcur = cur;
300
301 return value;
302}
303
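DWARF's canonical ULEB128 example encodes 624485 as the bytes e5 8e 26; a standalone check mirroring the decode loop above:

#include <stdio.h>

int main(void)
{
	const unsigned char buf[] = { 0xe5, 0x8e, 0x26 };	/* DWARF spec example */
	unsigned long value = 0;
	unsigned i = 0, shift;

	for (shift = 0; i < sizeof(buf); shift += 7) {
		value |= (unsigned long)(buf[i] & 0x7f) << shift;
		if (!(buf[i++] & 0x80))
			break;
	}
	printf("%lu\n", value);	/* prints 624485 */
	return 0;
}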
304static unsigned long read_pointer(const u8 **pLoc,
305 const void *end,
306 signed ptrType)
307{
308 unsigned long value = 0;
309 union {
310 const u8 *p8;
311 const u16 *p16u;
312 const s16 *p16s;
313 const u32 *p32u;
314 const s32 *p32s;
315 const unsigned long *pul;
316 } ptr;
317
318 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
319 return 0;
320 ptr.p8 = *pLoc;
321 switch(ptrType & DW_EH_PE_FORM) {
322 case DW_EH_PE_data2:
323 if (end < (const void *)(ptr.p16u + 1))
324 return 0;
325 if(ptrType & DW_EH_PE_signed)
326 value = get_unaligned(ptr.p16s++);
327 else
328 value = get_unaligned(ptr.p16u++);
329 break;
330 case DW_EH_PE_data4:
331#ifdef CONFIG_64BIT
332 if (end < (const void *)(ptr.p32u + 1))
333 return 0;
334 if(ptrType & DW_EH_PE_signed)
335 value = get_unaligned(ptr.p32s++);
336 else
337 value = get_unaligned(ptr.p32u++);
338 break;
339 case DW_EH_PE_data8:
340 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
341#else
342 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
343#endif
344 case DW_EH_PE_native:
345 if (end < (const void *)(ptr.pul + 1))
346 return 0;
347 value = get_unaligned(ptr.pul++);
348 break;
349 case DW_EH_PE_leb128:
350 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
351 value = ptrType & DW_EH_PE_signed
352 ? get_sleb128(&ptr.p8, end)
353 : get_uleb128(&ptr.p8, end);
354 if ((const void *)ptr.p8 > end)
355 return 0;
356 break;
357 default:
358 return 0;
359 }
360 switch(ptrType & DW_EH_PE_ADJUST) {
361 case DW_EH_PE_abs:
362 break;
363 case DW_EH_PE_pcrel:
364 value += (unsigned long)*pLoc;
365 break;
366 default:
367 return 0;
368 }
369 if ((ptrType & DW_EH_PE_indirect)
370 && __get_user(value, (unsigned long *)value))
371 return 0;
372 *pLoc = ptr.p8;
373
374 return value;
375}
376
377static signed fde_pointer_type(const u32 *cie)
378{
379 const u8 *ptr = (const u8 *)(cie + 2);
380 unsigned version = *ptr;
381
382 if (version != 1)
383 return -1; /* unsupported */
384 if (*++ptr) {
385 const char *aug;
386 const u8 *end = (const u8 *)(cie + 1) + *cie;
387 uleb128_t len;
388
389 /* check if augmentation size is first (and thus present) */
390 if (*ptr != 'z')
391 return -1;
392 /* check if augmentation string is nul-terminated */
393 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
394 return -1;
395 ++ptr; /* skip terminator */
396 get_uleb128(&ptr, end); /* skip code alignment */
397 get_sleb128(&ptr, end); /* skip data alignment */
398 /* skip return address column */
399 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
400 len = get_uleb128(&ptr, end); /* augmentation length */
401 if (ptr + len < ptr || ptr + len > end)
402 return -1;
403 end = ptr + len;
404 while (*++aug) {
405 if (ptr >= end)
406 return -1;
407 switch(*aug) {
408 case 'L':
409 ++ptr;
410 break;
411 case 'P': {
412 signed ptrType = *ptr++;
413
414 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
415 return -1;
416 }
417 break;
418 case 'R':
419 return *ptr;
420 default:
421 return -1;
422 }
423 }
424 }
425 return DW_EH_PE_native|DW_EH_PE_abs;
426}
427
428static int advance_loc(unsigned long delta, struct unwind_state *state)
429{
430 state->loc += delta * state->codeAlign;
431
432 return delta > 0;
433}
434
435static void set_rule(uleb128_t reg,
436 enum item_location where,
437 uleb128_t value,
438 struct unwind_state *state)
439{
440 if (reg < ARRAY_SIZE(state->regs)) {
441 state->regs[reg].where = where;
442 state->regs[reg].value = value;
443 }
444}
445
446static int processCFI(const u8 *start,
447 const u8 *end,
448 unsigned long targetLoc,
449 signed ptrType,
450 struct unwind_state *state)
451{
452 union {
453 const u8 *p8;
454 const u16 *p16;
455 const u32 *p32;
456 } ptr;
457 int result = 1;
458
459 if (start != state->cieStart) {
460 state->loc = state->org;
461 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
462 if (targetLoc == 0 && state->label == NULL)
463 return result;
464 }
465 for (ptr.p8 = start; result && ptr.p8 < end; ) {
466 switch(*ptr.p8 >> 6) {
467 uleb128_t value;
468
469 case 0:
470 switch(*ptr.p8++) {
471 case DW_CFA_nop:
472 break;
473 case DW_CFA_set_loc:
474 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
475 result = 0;
476 break;
477 case DW_CFA_advance_loc1:
478 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
479 break;
480 case DW_CFA_advance_loc2:
481 result = ptr.p8 <= end + 2
482 && advance_loc(*ptr.p16++, state);
483 break;
484 case DW_CFA_advance_loc4:
485 result = ptr.p8 <= end + 4
486 && advance_loc(*ptr.p32++, state);
487 break;
488 case DW_CFA_offset_extended:
489 value = get_uleb128(&ptr.p8, end);
490 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
491 break;
492 case DW_CFA_val_offset:
493 value = get_uleb128(&ptr.p8, end);
494 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
495 break;
496 case DW_CFA_offset_extended_sf:
497 value = get_uleb128(&ptr.p8, end);
498 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
499 break;
500 case DW_CFA_val_offset_sf:
501 value = get_uleb128(&ptr.p8, end);
502 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
503 break;
504 case DW_CFA_restore_extended:
505 case DW_CFA_undefined:
506 case DW_CFA_same_value:
507 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
508 break;
509 case DW_CFA_register:
510 value = get_uleb128(&ptr.p8, end);
511 set_rule(value,
512 Register,
513 get_uleb128(&ptr.p8, end), state);
514 break;
515 case DW_CFA_remember_state:
516 if (ptr.p8 == state->label) {
517 state->label = NULL;
518 return 1;
519 }
520 if (state->stackDepth >= MAX_STACK_DEPTH)
521 return 0;
522 state->stack[state->stackDepth++] = ptr.p8;
523 break;
524 case DW_CFA_restore_state:
525 if (state->stackDepth) {
526 const uleb128_t loc = state->loc;
527 const u8 *label = state->label;
528
529 state->label = state->stack[state->stackDepth - 1];
530 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
531 memset(state->regs, 0, sizeof(state->regs));
532 state->stackDepth = 0;
533 result = processCFI(start, end, 0, ptrType, state);
534 state->loc = loc;
535 state->label = label;
536 } else
537 return 0;
538 break;
539 case DW_CFA_def_cfa:
540 state->cfa.reg = get_uleb128(&ptr.p8, end);
541 /*nobreak*/
542 case DW_CFA_def_cfa_offset:
543 state->cfa.offs = get_uleb128(&ptr.p8, end);
544 break;
545 case DW_CFA_def_cfa_sf:
546 state->cfa.reg = get_uleb128(&ptr.p8, end);
547 /*nobreak*/
548 case DW_CFA_def_cfa_offset_sf:
549 state->cfa.offs = get_sleb128(&ptr.p8, end)
550 * state->dataAlign;
551 break;
552 case DW_CFA_def_cfa_register:
553 state->cfa.reg = get_uleb128(&ptr.p8, end);
554 break;
555 /*todo case DW_CFA_def_cfa_expression: */
556 /*todo case DW_CFA_expression: */
557 /*todo case DW_CFA_val_expression: */
558 case DW_CFA_GNU_args_size:
559 get_uleb128(&ptr.p8, end);
560 break;
561 case DW_CFA_GNU_negative_offset_extended:
562 value = get_uleb128(&ptr.p8, end);
563 set_rule(value,
564 Memory,
565 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
566 break;
567 case DW_CFA_GNU_window_save:
568 default:
569 result = 0;
570 break;
571 }
572 break;
573 case 1:
574 result = advance_loc(*ptr.p8++ & 0x3f, state);
575 break;
576 case 2:
577 value = *ptr.p8++ & 0x3f;
578 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
579 break;
580 case 3:
581 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
582 break;
583 }
584 if (ptr.p8 > end)
585 result = 0;
586 if (result && targetLoc != 0 && targetLoc < state->loc)
587 return 1;
588 }
589
590 return result
591 && ptr.p8 == end
592 && (targetLoc == 0
593 || (/*todo While in theory this should apply, gcc in practice omits
594 everything past the function prolog, and hence the location
595 never reaches the end of the function.
596 targetLoc < state->loc &&*/ state->label == NULL));
597}
598
 599/* Unwind to previous frame. Returns 0 if successful, negative
600 * number in case of an error. */
601int unwind(struct unwind_frame_info *frame)
602{
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL;
606 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i;
608 signed ptrType = -1;
609 uleb128_t retAddrReg = 0;
610 struct unwind_table *table;
611 struct unwind_state state;
612
613 if (UNW_PC(frame) == 0)
614 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size;
618
619 for (fde = table->address;
620 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
621 tableSize -= sizeof(*fde) + *fde,
622 fde += 1 + *fde / sizeof(*fde)) {
623 if (!*fde || (*fde & (sizeof(*fde) - 1)))
624 break;
625 if (!fde[1])
626 continue; /* this is a CIE */
627 if ((fde[1] & (sizeof(*fde) - 1))
628 || fde[1] > (unsigned long)(fde + 1)
629 - (unsigned long)table->address)
630 continue; /* this is not a valid FDE */
631 cie = fde + 1 - fde[1] / sizeof(*fde);
632 if (*cie <= sizeof(*cie) + 4
633 || *cie >= fde[1] - sizeof(*fde)
634 || (*cie & (sizeof(*cie) - 1))
635 || cie[1]
636 || (ptrType = fde_pointer_type(cie)) < 0) {
637 cie = NULL; /* this is not a (valid) CIE */
638 continue;
639 }
640 ptr = (const u8 *)(fde + 2);
641 startLoc = read_pointer(&ptr,
642 (const u8 *)(fde + 1) + *fde,
643 ptrType);
644 endLoc = startLoc
645 + read_pointer(&ptr,
646 (const u8 *)(fde + 1) + *fde,
647 ptrType & DW_EH_PE_indirect
648 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
651 break;
652 cie = NULL;
653 }
654 }
655 if (cie != NULL) {
656 memset(&state, 0, sizeof(state));
657 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie;
660 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */
662 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') {
665 /* check for ignorable (or already handled)
666 * nul-terminated augmentation string */
667 while (++ptr < end && *ptr)
668 if (strchr("LPR", *ptr) == NULL)
669 break;
670 }
671 if (ptr >= end || *ptr)
672 cie = NULL;
673 }
674 ++ptr;
675 }
676 if (cie != NULL) {
 677 /* get code alignment factor */
678 state.codeAlign = get_uleb128(&ptr, end);
 679 /* get data alignment factor */
680 state.dataAlign = get_sleb128(&ptr, end);
681 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
682 cie = NULL;
683 else {
684 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
685 /* skip augmentation */
686 if (((const char *)(cie + 2))[1] == 'z')
687 ptr += get_uleb128(&ptr, end);
688 if (ptr > end
689 || retAddrReg >= ARRAY_SIZE(reg_info)
690 || REG_INVALID(retAddrReg)
691 || reg_info[retAddrReg].width != sizeof(unsigned long))
692 cie = NULL;
693 }
694 }
695 if (cie != NULL) {
696 state.cieStart = ptr;
697 ptr = state.cieEnd;
698 state.cieEnd = end;
699 end = (const u8 *)(fde + 1) + *fde;
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') {
702 uleb128_t augSize = get_uleb128(&ptr, end);
703
704 if ((ptr += augSize) > end)
705 fde = NULL;
706 }
707 }
708 if (cie == NULL || fde == NULL) {
709#ifdef CONFIG_FRAME_POINTER
710 unsigned long top, bottom;
711#endif
712
713#ifdef CONFIG_FRAME_POINTER
714 top = STACK_TOP(frame->task);
715 bottom = STACK_BOTTOM(frame->task);
716# if FRAME_RETADDR_OFFSET < 0
717 if (UNW_SP(frame) < top
718 && UNW_FP(frame) <= UNW_SP(frame)
719 && bottom < UNW_FP(frame)
720# else
721 if (UNW_SP(frame) > top
722 && UNW_FP(frame) >= UNW_SP(frame)
723 && bottom > UNW_FP(frame)
724# endif
725 && !((UNW_SP(frame) | UNW_FP(frame))
726 & (sizeof(unsigned long) - 1))) {
727 unsigned long link;
728
729 if (!__get_user(link,
730 (unsigned long *)(UNW_FP(frame)
731 + FRAME_LINK_OFFSET))
732# if FRAME_RETADDR_OFFSET < 0
733 && link > bottom && link < UNW_FP(frame)
734# else
735 && link > UNW_FP(frame) && link < bottom
736# endif
737 && !(link & (sizeof(link) - 1))
738 && !__get_user(UNW_PC(frame),
739 (unsigned long *)(UNW_FP(frame)
740 + FRAME_RETADDR_OFFSET))) {
741 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
742# if FRAME_RETADDR_OFFSET < 0
743 -
744# else
745 +
746# endif
747 sizeof(UNW_PC(frame));
748 UNW_FP(frame) = link;
749 return 0;
750 }
751 }
752#endif
753 return -ENXIO;
754 }
755 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
759 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info)
762 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
763 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO;
765 /* update frame */
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa);
769 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
770 startLoc = min(STACK_LIMIT(cfa), cfa);
771 endLoc = max(STACK_LIMIT(cfa), cfa);
772 }
773#ifndef CONFIG_64BIT
774# define CASES CASE(8); CASE(16); CASE(32)
775#else
776# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
777#endif
778 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
779 if (REG_INVALID(i)) {
780 if (state.regs[i].where == Nowhere)
781 continue;
782 return -EIO;
783 }
784 switch(state.regs[i].where) {
785 default:
786 break;
787 case Register:
788 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
789 || REG_INVALID(state.regs[i].value)
790 || reg_info[i].width > reg_info[state.regs[i].value].width)
791 return -EIO;
792 switch(reg_info[state.regs[i].value].width) {
793#define CASE(n) \
794 case sizeof(u##n): \
795 state.regs[i].value = FRAME_REG(state.regs[i].value, \
796 const u##n); \
797 break
798 CASES;
799#undef CASE
800 default:
801 return -EIO;
802 }
803 break;
804 }
805 }
806 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
807 if (REG_INVALID(i))
808 continue;
809 switch(state.regs[i].where) {
810 case Nowhere:
811 if (reg_info[i].width != sizeof(UNW_SP(frame))
812 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
813 != &UNW_SP(frame))
814 continue;
815 UNW_SP(frame) = cfa;
816 break;
817 case Register:
818 switch(reg_info[i].width) {
819#define CASE(n) case sizeof(u##n): \
820 FRAME_REG(i, u##n) = state.regs[i].value; \
821 break
822 CASES;
823#undef CASE
824 default:
825 return -EIO;
826 }
827 break;
828 case Value:
829 if (reg_info[i].width != sizeof(unsigned long))
830 return -EIO;
831 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
832 * state.dataAlign;
833 break;
834 case Memory: {
835 unsigned long addr = cfa + state.regs[i].value
836 * state.dataAlign;
837
838 if ((state.regs[i].value * state.dataAlign)
839 % sizeof(unsigned long)
840 || addr < startLoc
841 || addr + sizeof(unsigned long) < addr
842 || addr + sizeof(unsigned long) > endLoc)
843 return -EIO;
844 switch(reg_info[i].width) {
845#define CASE(n) case sizeof(u##n): \
846 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
847 break
848 CASES;
849#undef CASE
850 default:
851 return -EIO;
852 }
853 }
854 break;
855 }
856 }
857
858 return 0;
859#undef CASES
860#undef FRAME_REG
861}
862EXPORT_SYMBOL(unwind);
863
864int unwind_init_frame_info(struct unwind_frame_info *info,
865 struct task_struct *tsk,
866 /*const*/ struct pt_regs *regs)
867{
868 info->task = tsk;
869 arch_unw_init_frame_info(info, regs);
870
871 return 0;
872}
873EXPORT_SYMBOL(unwind_init_frame_info);
874
875/*
876 * Prepare to unwind a blocked task.
877 */
878int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk)
880{
881 info->task = tsk;
882 arch_unw_init_blocked(info);
883
884 return 0;
885}
886EXPORT_SYMBOL(unwind_init_blocked);
887
888/*
889 * Prepare to unwind the currently running thread.
890 */
891int unwind_init_running(struct unwind_frame_info *info,
892 asmlinkage int (*callback)(struct unwind_frame_info *,
893 void *arg),
894 void *arg)
895{
896 info->task = current;
897
898 return arch_unwind_init_running(info, callback, arg);
899}
900EXPORT_SYMBOL(unwind_init_running);
901
902/*
903 * Unwind until the return pointer is in user-land (or until an error
904 * occurs). Returns 0 if successful, negative number in case of
905 * error.
906 */
907int unwind_to_user(struct unwind_frame_info *info)
908{
909 while (!arch_unw_user_mode(info)) {
910 int err = unwind(info);
911
912 if (err < 0)
913 return err;
914 }
915
916 return 0;
917}
918EXPORT_SYMBOL(unwind_to_user);
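A hypothetical caller, for example an arch backtrace path, would combine these entry points as below; the loop and printk format are invented for the sketch, with UNW_PC() used as it is throughout this file:

/* hypothetical backtrace sketch using the API above */
static void sketch_show_trace(struct task_struct *tsk)
{
	struct unwind_frame_info info;

	if (unwind_init_blocked(&info, tsk))
		return;
	while (unwind(&info) == 0 && UNW_PC(&info))
		printk(" [<%016lx>]\n", UNW_PC(&info));
}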
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
140 atomic_set(&new->processes, 0); 140 atomic_set(&new->processes, 0);
141 atomic_set(&new->files, 0); 141 atomic_set(&new->files, 0);
142 atomic_set(&new->sigpending, 0); 142 atomic_set(&new->sigpending, 0);
143#ifdef CONFIG_INOTIFY 143#ifdef CONFIG_INOTIFY_USER
144 atomic_set(&new->inotify_watches, 0); 144 atomic_set(&new->inotify_watches, 0);
145 atomic_set(&new->inotify_devs, 0); 145 atomic_set(&new->inotify_devs, 0);
146#endif 146#endif
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
148 new->mq_bytes = 0; 148 new->mq_bytes = 0;
149 new->locked_shm = 0; 149 new->locked_shm = 0;
150 150
151 if (alloc_uid_keyring(new) < 0) { 151 if (alloc_uid_keyring(new, current) < 0) {
152 kmem_cache_free(uid_cachep, new); 152 kmem_cache_free(uid_cachep, new);
153 return NULL; 153 return NULL;
154 } 154 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f6..565cf7a1febd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu,
428 return ret; 428 return ret;
429} 429}
430 430
431int schedule_on_each_cpu(void (*func) (void *info), void *info) 431/**
432 * schedule_on_each_cpu - call a function on each online CPU from keventd
433 * @func: the function to call
434 * @info: a pointer to pass to func()
435 *
436 * Returns zero on success.
437 * Returns -ve errno on failure.
438 *
439 * Appears to be racy against CPU hotplug.
440 *
441 * schedule_on_each_cpu() is very slow.
442 */
443int schedule_on_each_cpu(void (*func)(void *info), void *info)
432{ 444{
433 int cpu; 445 int cpu;
434 struct work_struct *work; 446 struct work_struct *works;
435 447
436 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); 448 works = alloc_percpu(struct work_struct);
437 449 if (!works)
438 if (!work)
439 return -ENOMEM; 450 return -ENOMEM;
451
440 for_each_online_cpu(cpu) { 452 for_each_online_cpu(cpu) {
441 INIT_WORK(work + cpu, func, info); 453 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
442 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 454 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
443 work + cpu); 455 per_cpu_ptr(works, cpu));
444 } 456 }
445 flush_workqueue(keventd_wq); 457 flush_workqueue(keventd_wq);
446 kfree(work); 458 free_percpu(works);
447 return 0; 459 return 0;
448} 460}
449 461
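Matching the signature above, a hypothetical caller that bumps a counter once on each online CPU (all names invented for the example):

/* hypothetical usage sketch for schedule_on_each_cpu() */
static atomic_t hits = ATOMIC_INIT(0);

static void bump(void *info)
{
	atomic_inc((atomic_t *)info);
}

static int run_everywhere(void)
{
	int ret = schedule_on_each_cpu(bump, &hits);

	if (!ret)
		printk("ran on %d cpus\n", atomic_read(&hits));
	return ret;
}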
@@ -531,11 +543,11 @@ int current_is_keventd(void)
531static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 543static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
532{ 544{
533 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 545 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
534 LIST_HEAD(list); 546 struct list_head list;
535 struct work_struct *work; 547 struct work_struct *work;
536 548
537 spin_lock_irq(&cwq->lock); 549 spin_lock_irq(&cwq->lock);
538 list_splice_init(&cwq->worklist, &list); 550 list_replace_init(&cwq->worklist, &list);
539 551
540 while (!list_empty(&list)) { 552 while (!list_empty(&list)) {
541 printk("Taking work for %s\n", wq->name); 553 printk("Taking work for %s\n", wq->name);
@@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
578 590
579 case CPU_UP_CANCELED: 591 case CPU_UP_CANCELED:
580 list_for_each_entry(wq, &workqueues, list) { 592 list_for_each_entry(wq, &workqueues, list) {
593 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
594 continue;
581 /* Unbind so it can run. */ 595 /* Unbind so it can run. */
582 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 596 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
583 any_online_cpu(cpu_online_map)); 597 any_online_cpu(cpu_online_map));