Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/acct.c | 26 |
| -rw-r--r-- | kernel/auditsc.c | 6 |
| -rw-r--r-- | kernel/cpuset.c | 104 |
| -rw-r--r-- | kernel/exit.c | 75 |
| -rw-r--r-- | kernel/fork.c | 83 |
| -rw-r--r-- | kernel/futex.c | 10 |
| -rw-r--r-- | kernel/irq/proc.c | 3 |
| -rw-r--r-- | kernel/kallsyms.c | 16 |
| -rw-r--r-- | kernel/kmod.c | 2 |
| -rw-r--r-- | kernel/lockdep.c | 203 |
| -rw-r--r-- | kernel/module.c | 25 |
| -rw-r--r-- | kernel/mutex.c | 9 |
| -rw-r--r-- | kernel/nsproxy.c | 38 |
| -rw-r--r-- | kernel/pid.c | 75 |
| -rw-r--r-- | kernel/power/Kconfig | 9 |
| -rw-r--r-- | kernel/power/disk.c | 8 |
| -rw-r--r-- | kernel/power/main.c | 2 |
| -rw-r--r-- | kernel/power/process.c | 21 |
| -rw-r--r-- | kernel/relay.c | 8 |
| -rw-r--r-- | kernel/sched.c | 515 |
| -rw-r--r-- | kernel/signal.c | 17 |
| -rw-r--r-- | kernel/sys.c | 23 |
| -rw-r--r-- | kernel/sysctl.c | 390 |
| -rw-r--r-- | kernel/time/clocksource.c | 8 |
| -rw-r--r-- | kernel/timer.c | 162 |
| -rw-r--r-- | kernel/tsacct.c | 9 |
| -rw-r--r-- | kernel/workqueue.c | 21 |
27 files changed, 1132 insertions, 736 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index dc12db8600e7..70d0d88e5554 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
| @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) | |||
| 118 | spin_unlock(&acct_globals.lock); | 118 | spin_unlock(&acct_globals.lock); |
| 119 | 119 | ||
| 120 | /* May block */ | 120 | /* May block */ |
| 121 | if (vfs_statfs(file->f_dentry, &sbuf)) | 121 | if (vfs_statfs(file->f_path.dentry, &sbuf)) |
| 122 | return res; | 122 | return res; |
| 123 | suspend = sbuf.f_blocks * SUSPEND; | 123 | suspend = sbuf.f_blocks * SUSPEND; |
| 124 | resume = sbuf.f_blocks * RESUME; | 124 | resume = sbuf.f_blocks * RESUME; |
| @@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file) | |||
| 194 | add_timer(&acct_globals.timer); | 194 | add_timer(&acct_globals.timer); |
| 195 | } | 195 | } |
| 196 | if (old_acct) { | 196 | if (old_acct) { |
| 197 | mnt_unpin(old_acct->f_vfsmnt); | 197 | mnt_unpin(old_acct->f_path.mnt); |
| 198 | spin_unlock(&acct_globals.lock); | 198 | spin_unlock(&acct_globals.lock); |
| 199 | do_acct_process(old_acct); | 199 | do_acct_process(old_acct); |
| 200 | filp_close(old_acct, NULL); | 200 | filp_close(old_acct, NULL); |
| @@ -212,7 +212,7 @@ static int acct_on(char *name) | |||
| 212 | if (IS_ERR(file)) | 212 | if (IS_ERR(file)) |
| 213 | return PTR_ERR(file); | 213 | return PTR_ERR(file); |
| 214 | 214 | ||
| 215 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | 215 | if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { |
| 216 | filp_close(file, NULL); | 216 | filp_close(file, NULL); |
| 217 | return -EACCES; | 217 | return -EACCES; |
| 218 | } | 218 | } |
| @@ -229,11 +229,11 @@ static int acct_on(char *name) | |||
| 229 | } | 229 | } |
| 230 | 230 | ||
| 231 | spin_lock(&acct_globals.lock); | 231 | spin_lock(&acct_globals.lock); |
| 232 | mnt_pin(file->f_vfsmnt); | 232 | mnt_pin(file->f_path.mnt); |
| 233 | acct_file_reopen(file); | 233 | acct_file_reopen(file); |
| 234 | spin_unlock(&acct_globals.lock); | 234 | spin_unlock(&acct_globals.lock); |
| 235 | 235 | ||
| 236 | mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ | 236 | mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ |
| 237 | 237 | ||
| 238 | return 0; | 238 | return 0; |
| 239 | } | 239 | } |
| @@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) | |||
| 283 | void acct_auto_close_mnt(struct vfsmount *m) | 283 | void acct_auto_close_mnt(struct vfsmount *m) |
| 284 | { | 284 | { |
| 285 | spin_lock(&acct_globals.lock); | 285 | spin_lock(&acct_globals.lock); |
| 286 | if (acct_globals.file && acct_globals.file->f_vfsmnt == m) | 286 | if (acct_globals.file && acct_globals.file->f_path.mnt == m) |
| 287 | acct_file_reopen(NULL); | 287 | acct_file_reopen(NULL); |
| 288 | spin_unlock(&acct_globals.lock); | 288 | spin_unlock(&acct_globals.lock); |
| 289 | } | 289 | } |
| @@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb) | |||
| 299 | { | 299 | { |
| 300 | spin_lock(&acct_globals.lock); | 300 | spin_lock(&acct_globals.lock); |
| 301 | if (acct_globals.file && | 301 | if (acct_globals.file && |
| 302 | acct_globals.file->f_vfsmnt->mnt_sb == sb) { | 302 | acct_globals.file->f_path.mnt->mnt_sb == sb) { |
| 303 | acct_file_reopen(NULL); | 303 | acct_file_reopen(NULL); |
| 304 | } | 304 | } |
| 305 | spin_unlock(&acct_globals.lock); | 305 | spin_unlock(&acct_globals.lock); |
| @@ -428,6 +428,7 @@ static void do_acct_process(struct file *file) | |||
| 428 | u64 elapsed; | 428 | u64 elapsed; |
| 429 | u64 run_time; | 429 | u64 run_time; |
| 430 | struct timespec uptime; | 430 | struct timespec uptime; |
| 431 | struct tty_struct *tty; | ||
| 431 | 432 | ||
| 432 | /* | 433 | /* |
| 433 | * First check to see if there is enough free_space to continue | 434 | * First check to see if there is enough free_space to continue |
| @@ -484,16 +485,9 @@ static void do_acct_process(struct file *file) | |||
| 484 | ac.ac_ppid = current->parent->tgid; | 485 | ac.ac_ppid = current->parent->tgid; |
| 485 | #endif | 486 | #endif |
| 486 | 487 | ||
| 487 | mutex_lock(&tty_mutex); | ||
| 488 | /* FIXME: Whoever is responsible for current->signal locking needs | ||
| 489 | to use the same locking all over the kernel and document it */ | ||
| 490 | read_lock(&tasklist_lock); | ||
| 491 | ac.ac_tty = current->signal->tty ? | ||
| 492 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; | ||
| 493 | read_unlock(&tasklist_lock); | ||
| 494 | mutex_unlock(&tty_mutex); | ||
| 495 | |||
| 496 | spin_lock_irq(¤t->sighand->siglock); | 488 | spin_lock_irq(¤t->sighand->siglock); |
| 489 | tty = current->signal->tty; | ||
| 490 | ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | ||
| 497 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | 491 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); |
| 498 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | 492 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); |
| 499 | ac.ac_flag = pacct->ac_flag; | 493 | ac.ac_flag = pacct->ac_flag; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 40722e26de98..298897559ca4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
| 781 | if ((vma->vm_flags & VM_EXECUTABLE) && | 781 | if ((vma->vm_flags & VM_EXECUTABLE) && |
| 782 | vma->vm_file) { | 782 | vma->vm_file) { |
| 783 | audit_log_d_path(ab, "exe=", | 783 | audit_log_d_path(ab, "exe=", |
| 784 | vma->vm_file->f_dentry, | 784 | vma->vm_file->f_path.dentry, |
| 785 | vma->vm_file->f_vfsmnt); | 785 | vma->vm_file->f_path.mnt); |
| 786 | break; | 786 | break; |
| 787 | } | 787 | } |
| 788 | vma = vma->vm_next; | 788 | vma = vma->vm_next; |
| @@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 826 | context->return_code); | 826 | context->return_code); |
| 827 | 827 | ||
| 828 | mutex_lock(&tty_mutex); | 828 | mutex_lock(&tty_mutex); |
| 829 | read_lock(&tasklist_lock); | ||
| 829 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | 830 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) |
| 830 | tty = tsk->signal->tty->name; | 831 | tty = tsk->signal->tty->name; |
| 831 | else | 832 | else |
| 832 | tty = "(none)"; | 833 | tty = "(none)"; |
| 834 | read_unlock(&tasklist_lock); | ||
| 833 | audit_log_format(ab, | 835 | audit_log_format(ab, |
| 834 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 836 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
| 835 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" | 837 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0a6b4d89f9a0..232aed2b10f9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { | |||
| 413 | * | 413 | * |
| 414 | * | 414 | * |
| 415 | * When reading/writing to a file: | 415 | * When reading/writing to a file: |
| 416 | * - the cpuset to use in file->f_dentry->d_parent->d_fsdata | 416 | * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata |
| 417 | * - the 'cftype' of the file is file->f_dentry->d_fsdata | 417 | * - the 'cftype' of the file is file->f_path.dentry->d_fsdata |
| 418 | */ | 418 | */ |
| 419 | 419 | ||
| 420 | struct cftype { | 420 | struct cftype { |
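The comment above states the convention the cpuset filesystem relies on: the parent directory dentry's d_fsdata carries the cpuset, and the file dentry's d_fsdata carries the cftype. A minimal userspace model of the __d_cs()/__d_cft() helpers used in the hunks below — an assumption about their shape, not a copy of the kernel definitions:

```c
#include <stdio.h>

/* Userspace model of the d_fsdata convention described above (assumed
 * helper shape, not the kernel code): the directory dentry carries the
 * cpuset, the file's own dentry carries the cftype. */
struct dentry { void *d_fsdata; struct dentry *d_parent; };
struct cpuset { const char *name; };
struct cftype { const char *name; };

static struct cpuset *__d_cs(struct dentry *dir)   { return dir->d_fsdata; }
static struct cftype *__d_cft(struct dentry *file) { return file->d_fsdata; }

int main(void)
{
	struct cpuset cs  = { "top_cpuset" };
	struct cftype cft = { "tasks" };
	struct dentry dir  = { &cs, NULL };
	struct dentry file = { &cft, &dir };

	/* Mirrors the file->f_path.dentry->d_parent / f_path.dentry use below. */
	printf("cpuset=%s cftype=%s\n",
	       __d_cs(file.d_parent)->name, __d_cft(&file)->name);
	return 0;
}
```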
| @@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file, | |||
| 1284 | const char __user *userbuf, | 1284 | const char __user *userbuf, |
| 1285 | size_t nbytes, loff_t *unused_ppos) | 1285 | size_t nbytes, loff_t *unused_ppos) |
| 1286 | { | 1286 | { |
| 1287 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1287 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
| 1288 | struct cftype *cft = __d_cft(file->f_dentry); | 1288 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1289 | cpuset_filetype_t type = cft->private; | 1289 | cpuset_filetype_t type = cft->private; |
| 1290 | char *buffer; | 1290 | char *buffer; |
| 1291 | char *pathbuf = NULL; | 1291 | char *pathbuf = NULL; |
| @@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, | |||
| 1367 | size_t nbytes, loff_t *ppos) | 1367 | size_t nbytes, loff_t *ppos) |
| 1368 | { | 1368 | { |
| 1369 | ssize_t retval = 0; | 1369 | ssize_t retval = 0; |
| 1370 | struct cftype *cft = __d_cft(file->f_dentry); | 1370 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1371 | if (!cft) | 1371 | if (!cft) |
| 1372 | return -ENODEV; | 1372 | return -ENODEV; |
| 1373 | 1373 | ||
| @@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 1417 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | 1417 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, |
| 1418 | size_t nbytes, loff_t *ppos) | 1418 | size_t nbytes, loff_t *ppos) |
| 1419 | { | 1419 | { |
| 1420 | struct cftype *cft = __d_cft(file->f_dentry); | 1420 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1421 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1421 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
| 1422 | cpuset_filetype_t type = cft->private; | 1422 | cpuset_filetype_t type = cft->private; |
| 1423 | char *page; | 1423 | char *page; |
| 1424 | ssize_t retval = 0; | 1424 | ssize_t retval = 0; |
| @@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt | |||
| 1476 | loff_t *ppos) | 1476 | loff_t *ppos) |
| 1477 | { | 1477 | { |
| 1478 | ssize_t retval = 0; | 1478 | ssize_t retval = 0; |
| 1479 | struct cftype *cft = __d_cft(file->f_dentry); | 1479 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1480 | if (!cft) | 1480 | if (!cft) |
| 1481 | return -ENODEV; | 1481 | return -ENODEV; |
| 1482 | 1482 | ||
| @@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) | |||
| 1498 | if (err) | 1498 | if (err) |
| 1499 | return err; | 1499 | return err; |
| 1500 | 1500 | ||
| 1501 | cft = __d_cft(file->f_dentry); | 1501 | cft = __d_cft(file->f_path.dentry); |
| 1502 | if (!cft) | 1502 | if (!cft) |
| 1503 | return -ENODEV; | 1503 | return -ENODEV; |
| 1504 | if (cft->open) | 1504 | if (cft->open) |
| @@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) | |||
| 1511 | 1511 | ||
| 1512 | static int cpuset_file_release(struct inode *inode, struct file *file) | 1512 | static int cpuset_file_release(struct inode *inode, struct file *file) |
| 1513 | { | 1513 | { |
| 1514 | struct cftype *cft = __d_cft(file->f_dentry); | 1514 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1515 | if (cft->release) | 1515 | if (cft->release) |
| 1516 | return cft->release(inode, file); | 1516 | return cft->release(inode, file); |
| 1517 | return 0; | 1517 | return 0; |
| @@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
| 1700 | */ | 1700 | */ |
| 1701 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1701 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
| 1702 | { | 1702 | { |
| 1703 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1703 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
| 1704 | struct ctr_struct *ctr; | 1704 | struct ctr_struct *ctr; |
| 1705 | pid_t *pidarray; | 1705 | pid_t *pidarray; |
| 1706 | int npids; | 1706 | int npids; |
| @@ -2342,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 2342 | } | 2342 | } |
| 2343 | 2343 | ||
| 2344 | /** | 2344 | /** |
| 2345 | * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? | 2345 | * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? |
| 2346 | * @z: is this zone on an allowed node? | 2346 | * @z: is this zone on an allowed node? |
| 2347 | * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) | 2347 | * @gfp_mask: memory allocation flags |
| 2348 | * | 2348 | * |
| 2349 | * If we're in interrupt, yes, we can always allocate. If zone | 2349 | * If we're in interrupt, yes, we can always allocate. If |
| 2350 | * __GFP_THISNODE is set, yes, we can always allocate. If zone | ||
| 2350 | * z's node is in our tasks mems_allowed, yes. If it's not a | 2351 | * z's node is in our tasks mems_allowed, yes. If it's not a |
| 2351 | * __GFP_HARDWALL request and this zone's nodes is in the nearest | 2352 | * __GFP_HARDWALL request and this zone's nodes is in the nearest |
| 2352 | * mem_exclusive cpuset ancestor to this tasks cpuset, yes. | 2353 | * mem_exclusive cpuset ancestor to this tasks cpuset, yes. |
| 2353 | * Otherwise, no. | 2354 | * Otherwise, no. |
| 2354 | * | 2355 | * |
| 2356 | * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() | ||
| 2357 | * reduces to cpuset_zone_allowed_hardwall(). Otherwise, | ||
| 2358 | * cpuset_zone_allowed_softwall() might sleep, and might allow a zone | ||
| 2359 | * from an enclosing cpuset. | ||
| 2360 | * | ||
| 2361 | * cpuset_zone_allowed_hardwall() only handles the simpler case of | ||
| 2362 | * hardwall cpusets, and never sleeps. | ||
| 2363 | * | ||
| 2364 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2365 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2366 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2367 | * any node on the zonelist except the first. By the time any such | ||
| 2368 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2369 | * | ||
| 2355 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2370 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, |
| 2356 | * and do not allow allocations outside the current tasks cpuset. | 2371 | * and do not allow allocations outside the current tasks cpuset. |
| 2357 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2372 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 2358 | * nearest mem_exclusive ancestor cpuset. | 2373 | * nearest enclosing mem_exclusive ancestor cpuset. |
| 2359 | * | 2374 | * |
| 2360 | * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() | 2375 | * Scanning up parent cpusets requires callback_mutex. The |
| 2361 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 2376 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit |
| 2362 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 2377 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the |
| 2363 | * mems_allowed came up empty on the first pass over the zonelist. | 2378 | * current tasks mems_allowed came up empty on the first pass over |
| 2364 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 2379 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the |
| 2365 | * short of memory, might require taking the callback_mutex mutex. | 2380 | * cpuset are short of memory, might require taking the callback_mutex |
| 2381 | * mutex. | ||
| 2366 | * | 2382 | * |
| 2367 | * The first call here from mm/page_alloc:get_page_from_freelist() | 2383 | * The first call here from mm/page_alloc:get_page_from_freelist() |
| 2368 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so | 2384 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, |
| 2369 | * no allocation on a node outside the cpuset is allowed (unless in | 2385 | * so no allocation on a node outside the cpuset is allowed (unless |
| 2370 | * interrupt, of course). | 2386 | * in interrupt, of course). |
| 2371 | * | 2387 | * |
| 2372 | * The second pass through get_page_from_freelist() doesn't even call | 2388 | * The second pass through get_page_from_freelist() doesn't even call |
| 2373 | * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() | 2389 | * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() |
| @@ -2380,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 2380 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2396 | * GFP_USER - only nodes in current tasks mems allowed ok. |
| 2381 | * | 2397 | * |
| 2382 | * Rule: | 2398 | * Rule: |
| 2383 | * Don't call cpuset_zone_allowed() if you can't sleep, unless you | 2399 | * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you |
| 2384 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables | 2400 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables |
| 2385 | * the code that might scan up ancestor cpusets and sleep. | 2401 | * the code that might scan up ancestor cpusets and sleep. |
| 2386 | **/ | 2402 | */ |
| 2387 | 2403 | ||
| 2388 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | 2404 | int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) |
| 2389 | { | 2405 | { |
| 2390 | int node; /* node that zone z is on */ | 2406 | int node; /* node that zone z is on */ |
| 2391 | const struct cpuset *cs; /* current cpuset ancestors */ | 2407 | const struct cpuset *cs; /* current cpuset ancestors */ |
| @@ -2415,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
| 2415 | return allowed; | 2431 | return allowed; |
| 2416 | } | 2432 | } |
| 2417 | 2433 | ||
| 2434 | /* | ||
| 2435 | * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? | ||
| 2436 | * @z: is this zone on an allowed node? | ||
| 2437 | * @gfp_mask: memory allocation flags | ||
| 2438 | * | ||
| 2439 | * If we're in interrupt, yes, we can always allocate. | ||
| 2440 | * If __GFP_THISNODE is set, yes, we can always allocate. If zone | ||
| 2441 | * z's node is in our tasks mems_allowed, yes. Otherwise, no. | ||
| 2442 | * | ||
| 2443 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2444 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2445 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2446 | * any node on the zonelist except the first. By the time any such | ||
| 2447 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2448 | * | ||
| 2449 | * Unlike the cpuset_zone_allowed_softwall() variant, above, | ||
| 2450 | * this variant requires that the zone be in the current tasks | ||
| 2451 | * mems_allowed or that we're in interrupt. It does not scan up the | ||
| 2452 | * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. | ||
| 2453 | * It never sleeps. | ||
| 2454 | */ | ||
| 2455 | |||
| 2456 | int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) | ||
| 2457 | { | ||
| 2458 | int node; /* node that zone z is on */ | ||
| 2459 | |||
| 2460 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | ||
| 2461 | return 1; | ||
| 2462 | node = zone_to_nid(z); | ||
| 2463 | if (node_isset(node, current->mems_allowed)) | ||
| 2464 | return 1; | ||
| 2465 | return 0; | ||
| 2466 | } | ||
| 2467 | |||
| 2418 | /** | 2468 | /** |
| 2419 | * cpuset_lock - lock out any changes to cpuset structures | 2469 | * cpuset_lock - lock out any changes to cpuset structures |
| 2420 | * | 2470 | * |
diff --git a/kernel/exit.c b/kernel/exit.c index 4e3f919edc48..122fadb972fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -13,7 +13,7 @@ | |||
| 13 | #include <linux/completion.h> | 13 | #include <linux/completion.h> |
| 14 | #include <linux/personality.h> | 14 | #include <linux/personality.h> |
| 15 | #include <linux/tty.h> | 15 | #include <linux/tty.h> |
| 16 | #include <linux/namespace.h> | 16 | #include <linux/mnt_namespace.h> |
| 17 | #include <linux/key.h> | 17 | #include <linux/key.h> |
| 18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
| 19 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/file.h> | 22 | #include <linux/file.h> |
| 23 | #include <linux/binfmts.h> | 23 | #include <linux/binfmts.h> |
| 24 | #include <linux/nsproxy.h> | 24 | #include <linux/nsproxy.h> |
| 25 | #include <linux/pid_namespace.h> | ||
| 25 | #include <linux/ptrace.h> | 26 | #include <linux/ptrace.h> |
| 26 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
| 27 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
| @@ -48,7 +49,6 @@ | |||
| 48 | #include <asm/mmu_context.h> | 49 | #include <asm/mmu_context.h> |
| 49 | 50 | ||
| 50 | extern void sem_exit (void); | 51 | extern void sem_exit (void); |
| 51 | extern struct task_struct *child_reaper; | ||
| 52 | 52 | ||
| 53 | static void exit_mm(struct task_struct * tsk); | 53 | static void exit_mm(struct task_struct * tsk); |
| 54 | 54 | ||
| @@ -189,21 +189,18 @@ repeat: | |||
| 189 | int session_of_pgrp(int pgrp) | 189 | int session_of_pgrp(int pgrp) |
| 190 | { | 190 | { |
| 191 | struct task_struct *p; | 191 | struct task_struct *p; |
| 192 | int sid = -1; | 192 | int sid = 0; |
| 193 | 193 | ||
| 194 | read_lock(&tasklist_lock); | 194 | read_lock(&tasklist_lock); |
| 195 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | 195 | |
| 196 | if (p->signal->session > 0) { | 196 | p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); |
| 197 | sid = p->signal->session; | 197 | if (p == NULL) |
| 198 | goto out; | 198 | p = find_task_by_pid(pgrp); |
| 199 | } | 199 | if (p != NULL) |
| 200 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | 200 | sid = process_session(p); |
| 201 | p = find_task_by_pid(pgrp); | 201 | |
| 202 | if (p) | ||
| 203 | sid = p->signal->session; | ||
| 204 | out: | ||
| 205 | read_unlock(&tasklist_lock); | 202 | read_unlock(&tasklist_lock); |
| 206 | 203 | ||
| 207 | return sid; | 204 | return sid; |
| 208 | } | 205 | } |
| 209 | 206 | ||
| @@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) | |||
| 225 | || p->exit_state | 222 | || p->exit_state |
| 226 | || is_init(p->real_parent)) | 223 | || is_init(p->real_parent)) |
| 227 | continue; | 224 | continue; |
| 228 | if (process_group(p->real_parent) != pgrp | 225 | if (process_group(p->real_parent) != pgrp && |
| 229 | && p->real_parent->signal->session == p->signal->session) { | 226 | process_session(p->real_parent) == process_session(p)) { |
| 230 | ret = 0; | 227 | ret = 0; |
| 231 | break; | 228 | break; |
| 232 | } | 229 | } |
| @@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp) | |||
| 260 | } | 257 | } |
| 261 | 258 | ||
| 262 | /** | 259 | /** |
| 263 | * reparent_to_init - Reparent the calling kernel thread to the init task. | 260 | * reparent_to_init - Reparent the calling kernel thread to the init task |
| 261 | * of the pid space that the thread belongs to. | ||
| 264 | * | 262 | * |
| 265 | * If a kernel thread is launched as a result of a system call, or if | 263 | * If a kernel thread is launched as a result of a system call, or if |
| 266 | * it ever exits, it should generally reparent itself to init so that | 264 | * it ever exits, it should generally reparent itself to init so that |
| @@ -278,8 +276,8 @@ static void reparent_to_init(void) | |||
| 278 | ptrace_unlink(current); | 276 | ptrace_unlink(current); |
| 279 | /* Reparent to init */ | 277 | /* Reparent to init */ |
| 280 | remove_parent(current); | 278 | remove_parent(current); |
| 281 | current->parent = child_reaper; | 279 | current->parent = child_reaper(current); |
| 282 | current->real_parent = child_reaper; | 280 | current->real_parent = child_reaper(current); |
| 283 | add_parent(current); | 281 | add_parent(current); |
| 284 | 282 | ||
| 285 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 283 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
| @@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) | |||
| 302 | { | 300 | { |
| 303 | struct task_struct *curr = current->group_leader; | 301 | struct task_struct *curr = current->group_leader; |
| 304 | 302 | ||
| 305 | if (curr->signal->session != session) { | 303 | if (process_session(curr) != session) { |
| 306 | detach_pid(curr, PIDTYPE_SID); | 304 | detach_pid(curr, PIDTYPE_SID); |
| 307 | curr->signal->session = session; | 305 | set_signal_session(curr->signal, session); |
| 308 | attach_pid(curr, PIDTYPE_SID, session); | 306 | attach_pid(curr, PIDTYPE_SID, session); |
| 309 | } | 307 | } |
| 310 | if (process_group(curr) != pgrp) { | 308 | if (process_group(curr) != pgrp) { |
| @@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) | |||
| 314 | } | 312 | } |
| 315 | } | 313 | } |
| 316 | 314 | ||
| 317 | void set_special_pids(pid_t session, pid_t pgrp) | 315 | static void set_special_pids(pid_t session, pid_t pgrp) |
| 318 | { | 316 | { |
| 319 | write_lock_irq(&tasklist_lock); | 317 | write_lock_irq(&tasklist_lock); |
| 320 | __set_special_pids(session, pgrp); | 318 | __set_special_pids(session, pgrp); |
| @@ -384,9 +382,7 @@ void daemonize(const char *name, ...) | |||
| 384 | exit_mm(current); | 382 | exit_mm(current); |
| 385 | 383 | ||
| 386 | set_special_pids(1, 1); | 384 | set_special_pids(1, 1); |
| 387 | mutex_lock(&tty_mutex); | 385 | proc_clear_tty(current); |
| 388 | current->signal->tty = NULL; | ||
| 389 | mutex_unlock(&tty_mutex); | ||
| 390 | 386 | ||
| 391 | /* Block and flush all signals */ | 387 | /* Block and flush all signals */ |
| 392 | sigfillset(&blocked); | 388 | sigfillset(&blocked); |
| @@ -429,7 +425,7 @@ static void close_files(struct files_struct * files) | |||
| 429 | for (;;) { | 425 | for (;;) { |
| 430 | unsigned long set; | 426 | unsigned long set; |
| 431 | i = j * __NFDBITS; | 427 | i = j * __NFDBITS; |
| 432 | if (i >= fdt->max_fdset || i >= fdt->max_fds) | 428 | if (i >= fdt->max_fds) |
| 433 | break; | 429 | break; |
| 434 | set = fdt->open_fds->fds_bits[j++]; | 430 | set = fdt->open_fds->fds_bits[j++]; |
| 435 | while (set) { | 431 | while (set) { |
| @@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files) | |||
| 470 | * you can free files immediately. | 466 | * you can free files immediately. |
| 471 | */ | 467 | */ |
| 472 | fdt = files_fdtable(files); | 468 | fdt = files_fdtable(files); |
| 473 | if (fdt == &files->fdtab) | 469 | if (fdt != &files->fdtab) |
| 474 | fdt->free_files = files; | ||
| 475 | else | ||
| 476 | kmem_cache_free(files_cachep, files); | 470 | kmem_cache_free(files_cachep, files); |
| 477 | free_fdtable(fdt); | 471 | call_rcu(&fdt->rcu, free_fdtable_rcu); |
| 478 | } | 472 | } |
| 479 | } | 473 | } |
| 480 | 474 | ||
| @@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 649 | * outside, so the child pgrp is now orphaned. | 643 | * outside, so the child pgrp is now orphaned. |
| 650 | */ | 644 | */ |
| 651 | if ((process_group(p) != process_group(father)) && | 645 | if ((process_group(p) != process_group(father)) && |
| 652 | (p->signal->session == father->signal->session)) { | 646 | (process_session(p) == process_session(father))) { |
| 653 | int pgrp = process_group(p); | 647 | int pgrp = process_group(p); |
| 654 | 648 | ||
| 655 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 649 | if (will_become_orphaned_pgrp(pgrp, NULL) && |
| 650 | has_stopped_jobs(pgrp)) { | ||
| 656 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); | 651 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
| 657 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); | 652 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
| 658 | } | 653 | } |
| @@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 663 | * When we die, we re-parent all our children. | 658 | * When we die, we re-parent all our children. |
| 664 | * Try to give them to another thread in our thread | 659 | * Try to give them to another thread in our thread |
| 665 | * group, and if no such member exists, give it to | 660 | * group, and if no such member exists, give it to |
| 666 | * the global child reaper process (ie "init") | 661 | * the child reaper process (ie "init") in our pid |
| 662 | * space. | ||
| 667 | */ | 663 | */ |
| 668 | static void | 664 | static void |
| 669 | forget_original_parent(struct task_struct *father, struct list_head *to_release) | 665 | forget_original_parent(struct task_struct *father, struct list_head *to_release) |
| @@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) | |||
| 674 | do { | 670 | do { |
| 675 | reaper = next_thread(reaper); | 671 | reaper = next_thread(reaper); |
| 676 | if (reaper == father) { | 672 | if (reaper == father) { |
| 677 | reaper = child_reaper; | 673 | reaper = child_reaper(father); |
| 678 | break; | 674 | break; |
| 679 | } | 675 | } |
| 680 | } while (reaper->exit_state); | 676 | } while (reaper->exit_state); |
| @@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk) | |||
| 786 | t = tsk->real_parent; | 782 | t = tsk->real_parent; |
| 787 | 783 | ||
| 788 | if ((process_group(t) != process_group(tsk)) && | 784 | if ((process_group(t) != process_group(tsk)) && |
| 789 | (t->signal->session == tsk->signal->session) && | 785 | (process_session(t) == process_session(tsk)) && |
| 790 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 786 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
| 791 | has_stopped_jobs(process_group(tsk))) { | 787 | has_stopped_jobs(process_group(tsk))) { |
| 792 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); | 788 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
| @@ -860,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 860 | panic("Aiee, killing interrupt handler!"); | 856 | panic("Aiee, killing interrupt handler!"); |
| 861 | if (unlikely(!tsk->pid)) | 857 | if (unlikely(!tsk->pid)) |
| 862 | panic("Attempted to kill the idle task!"); | 858 | panic("Attempted to kill the idle task!"); |
| 863 | if (unlikely(tsk == child_reaper)) | 859 | if (unlikely(tsk == child_reaper(tsk))) { |
| 864 | panic("Attempted to kill init!"); | 860 | if (tsk->nsproxy->pid_ns != &init_pid_ns) |
| 861 | tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
| 862 | else | ||
| 863 | panic("Attempted to kill init!"); | ||
| 864 | } | ||
| 865 | |||
| 865 | 866 | ||
| 866 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { | 867 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { |
| 867 | current->ptrace_message = code; | 868 | current->ptrace_message = code; |
diff --git a/kernel/fork.c b/kernel/fork.c index 7f2e31ba33af..fc723e595cd5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 19 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
| 20 | #include <linux/completion.h> | 20 | #include <linux/completion.h> |
| 21 | #include <linux/namespace.h> | 21 | #include <linux/mnt_namespace.h> |
| 22 | #include <linux/personality.h> | 22 | #include <linux/personality.h> |
| 23 | #include <linux/mempolicy.h> | 23 | #include <linux/mempolicy.h> |
| 24 | #include <linux/sem.h> | 24 | #include <linux/sem.h> |
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
| 37 | #include <linux/jiffies.h> | 37 | #include <linux/jiffies.h> |
| 38 | #include <linux/futex.h> | 38 | #include <linux/futex.h> |
| 39 | #include <linux/task_io_accounting_ops.h> | ||
| 39 | #include <linux/rcupdate.h> | 40 | #include <linux/rcupdate.h> |
| 40 | #include <linux/ptrace.h> | 41 | #include <linux/ptrace.h> |
| 41 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
| @@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 202 | struct mempolicy *pol; | 203 | struct mempolicy *pol; |
| 203 | 204 | ||
| 204 | down_write(&oldmm->mmap_sem); | 205 | down_write(&oldmm->mmap_sem); |
| 205 | flush_cache_mm(oldmm); | 206 | flush_cache_dup_mm(oldmm); |
| 206 | /* | 207 | /* |
| 207 | * Not linked in yet - no deadlock potential: | 208 | * Not linked in yet - no deadlock potential: |
| 208 | */ | 209 | */ |
| @@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 252 | anon_vma_link(tmp); | 253 | anon_vma_link(tmp); |
| 253 | file = tmp->vm_file; | 254 | file = tmp->vm_file; |
| 254 | if (file) { | 255 | if (file) { |
| 255 | struct inode *inode = file->f_dentry->d_inode; | 256 | struct inode *inode = file->f_path.dentry->d_inode; |
| 256 | get_file(file); | 257 | get_file(file); |
| 257 | if (tmp->vm_flags & VM_DENYWRITE) | 258 | if (tmp->vm_flags & VM_DENYWRITE) |
| 258 | atomic_dec(&inode->i_writecount); | 259 | atomic_dec(&inode->i_writecount); |
| @@ -613,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |||
| 613 | 614 | ||
| 614 | static int count_open_files(struct fdtable *fdt) | 615 | static int count_open_files(struct fdtable *fdt) |
| 615 | { | 616 | { |
| 616 | int size = fdt->max_fdset; | 617 | int size = fdt->max_fds; |
| 617 | int i; | 618 | int i; |
| 618 | 619 | ||
| 619 | /* Find the last open fd */ | 620 | /* Find the last open fd */ |
| @@ -640,12 +641,10 @@ static struct files_struct *alloc_files(void) | |||
| 640 | newf->next_fd = 0; | 641 | newf->next_fd = 0; |
| 641 | fdt = &newf->fdtab; | 642 | fdt = &newf->fdtab; |
| 642 | fdt->max_fds = NR_OPEN_DEFAULT; | 643 | fdt->max_fds = NR_OPEN_DEFAULT; |
| 643 | fdt->max_fdset = EMBEDDED_FD_SET_SIZE; | ||
| 644 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; | 644 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; |
| 645 | fdt->open_fds = (fd_set *)&newf->open_fds_init; | 645 | fdt->open_fds = (fd_set *)&newf->open_fds_init; |
| 646 | fdt->fd = &newf->fd_array[0]; | 646 | fdt->fd = &newf->fd_array[0]; |
| 647 | INIT_RCU_HEAD(&fdt->rcu); | 647 | INIT_RCU_HEAD(&fdt->rcu); |
| 648 | fdt->free_files = NULL; | ||
| 649 | fdt->next = NULL; | 648 | fdt->next = NULL; |
| 650 | rcu_assign_pointer(newf->fdt, fdt); | 649 | rcu_assign_pointer(newf->fdt, fdt); |
| 651 | out: | 650 | out: |
| @@ -661,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 661 | { | 660 | { |
| 662 | struct files_struct *newf; | 661 | struct files_struct *newf; |
| 663 | struct file **old_fds, **new_fds; | 662 | struct file **old_fds, **new_fds; |
| 664 | int open_files, size, i, expand; | 663 | int open_files, size, i; |
| 665 | struct fdtable *old_fdt, *new_fdt; | 664 | struct fdtable *old_fdt, *new_fdt; |
| 666 | 665 | ||
| 667 | *errorp = -ENOMEM; | 666 | *errorp = -ENOMEM; |
| @@ -672,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 672 | spin_lock(&oldf->file_lock); | 671 | spin_lock(&oldf->file_lock); |
| 673 | old_fdt = files_fdtable(oldf); | 672 | old_fdt = files_fdtable(oldf); |
| 674 | new_fdt = files_fdtable(newf); | 673 | new_fdt = files_fdtable(newf); |
| 675 | size = old_fdt->max_fdset; | ||
| 676 | open_files = count_open_files(old_fdt); | 674 | open_files = count_open_files(old_fdt); |
| 677 | expand = 0; | ||
| 678 | 675 | ||
| 679 | /* | 676 | /* |
| 680 | * Check whether we need to allocate a larger fd array or fd set. | 677 | * Check whether we need to allocate a larger fd array and fd set. |
| 681 | * Note: we're not a clone task, so the open count won't change. | 678 | * Note: we're not a clone task, so the open count won't change. |
| 682 | */ | 679 | */ |
| 683 | if (open_files > new_fdt->max_fdset) { | ||
| 684 | new_fdt->max_fdset = 0; | ||
| 685 | expand = 1; | ||
| 686 | } | ||
| 687 | if (open_files > new_fdt->max_fds) { | 680 | if (open_files > new_fdt->max_fds) { |
| 688 | new_fdt->max_fds = 0; | 681 | new_fdt->max_fds = 0; |
| 689 | expand = 1; | ||
| 690 | } | ||
| 691 | |||
| 692 | /* if the old fdset gets grown now, we'll only copy up to "size" fds */ | ||
| 693 | if (expand) { | ||
| 694 | spin_unlock(&oldf->file_lock); | 682 | spin_unlock(&oldf->file_lock); |
| 695 | spin_lock(&newf->file_lock); | 683 | spin_lock(&newf->file_lock); |
| 696 | *errorp = expand_files(newf, open_files-1); | 684 | *errorp = expand_files(newf, open_files-1); |
| @@ -710,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 710 | old_fds = old_fdt->fd; | 698 | old_fds = old_fdt->fd; |
| 711 | new_fds = new_fdt->fd; | 699 | new_fds = new_fdt->fd; |
| 712 | 700 | ||
| 713 | memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); | 701 | memcpy(new_fdt->open_fds->fds_bits, |
| 714 | memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); | 702 | old_fdt->open_fds->fds_bits, open_files/8); |
| 703 | memcpy(new_fdt->close_on_exec->fds_bits, | ||
| 704 | old_fdt->close_on_exec->fds_bits, open_files/8); | ||
| 715 | 705 | ||
| 716 | for (i = open_files; i != 0; i--) { | 706 | for (i = open_files; i != 0; i--) { |
| 717 | struct file *f = *old_fds++; | 707 | struct file *f = *old_fds++; |
| @@ -736,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 736 | /* This is long word aligned thus could use a optimized version */ | 726 | /* This is long word aligned thus could use a optimized version */ |
| 737 | memset(new_fds, 0, size); | 727 | memset(new_fds, 0, size); |
| 738 | 728 | ||
| 739 | if (new_fdt->max_fdset > open_files) { | 729 | if (new_fdt->max_fds > open_files) { |
| 740 | int left = (new_fdt->max_fdset-open_files)/8; | 730 | int left = (new_fdt->max_fds-open_files)/8; |
| 741 | int start = open_files / (8 * sizeof(unsigned long)); | 731 | int start = open_files / (8 * sizeof(unsigned long)); |
| 742 | 732 | ||
| 743 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); | 733 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); |
| 744 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); | 734 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); |
| 745 | } | 735 | } |
| 746 | 736 | ||
| 747 | out: | ||
| 748 | return newf; | 737 | return newf; |
| 749 | 738 | ||
| 750 | out_release: | 739 | out_release: |
| 751 | free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); | ||
| 752 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); | ||
| 753 | free_fd_array(new_fdt->fd, new_fdt->max_fds); | ||
| 754 | kmem_cache_free(files_cachep, newf); | 740 | kmem_cache_free(files_cachep, newf); |
| 741 | out: | ||
| 755 | return NULL; | 742 | return NULL; |
| 756 | } | 743 | } |
| 757 | 744 | ||
| @@ -1055,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1055 | p->wchar = 0; /* I/O counter: bytes written */ | 1042 | p->wchar = 0; /* I/O counter: bytes written */ |
| 1056 | p->syscr = 0; /* I/O counter: read syscalls */ | 1043 | p->syscr = 0; /* I/O counter: read syscalls */ |
| 1057 | p->syscw = 0; /* I/O counter: write syscalls */ | 1044 | p->syscw = 0; /* I/O counter: write syscalls */ |
| 1045 | task_io_accounting_init(p); | ||
| 1058 | acct_clear_integrals(p); | 1046 | acct_clear_integrals(p); |
| 1059 | 1047 | ||
| 1060 | p->it_virt_expires = cputime_zero; | 1048 | p->it_virt_expires = cputime_zero; |
| @@ -1259,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1259 | if (thread_group_leader(p)) { | 1247 | if (thread_group_leader(p)) { |
| 1260 | p->signal->tty = current->signal->tty; | 1248 | p->signal->tty = current->signal->tty; |
| 1261 | p->signal->pgrp = process_group(current); | 1249 | p->signal->pgrp = process_group(current); |
| 1262 | p->signal->session = current->signal->session; | 1250 | set_signal_session(p->signal, process_session(current)); |
| 1263 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1251 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
| 1264 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1252 | attach_pid(p, PIDTYPE_SID, process_session(p)); |
| 1265 | 1253 | ||
| 1266 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1254 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
| 1267 | __get_cpu_var(process_counts)++; | 1255 | __get_cpu_var(process_counts)++; |
| @@ -1525,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
| 1525 | } | 1513 | } |
| 1526 | 1514 | ||
| 1527 | /* | 1515 | /* |
| 1528 | * Unshare the namespace structure if it is being shared | 1516 | * Unshare the mnt_namespace structure if it is being shared |
| 1529 | */ | 1517 | */ |
| 1530 | static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) | 1518 | static int unshare_mnt_namespace(unsigned long unshare_flags, |
| 1519 | struct mnt_namespace **new_nsp, struct fs_struct *new_fs) | ||
| 1531 | { | 1520 | { |
| 1532 | struct namespace *ns = current->nsproxy->namespace; | 1521 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; |
| 1533 | 1522 | ||
| 1534 | if ((unshare_flags & CLONE_NEWNS) && ns) { | 1523 | if ((unshare_flags & CLONE_NEWNS) && ns) { |
| 1535 | if (!capable(CAP_SYS_ADMIN)) | 1524 | if (!capable(CAP_SYS_ADMIN)) |
| 1536 | return -EPERM; | 1525 | return -EPERM; |
| 1537 | 1526 | ||
| 1538 | *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); | 1527 | *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); |
| 1539 | if (!*new_nsp) | 1528 | if (!*new_nsp) |
| 1540 | return -ENOMEM; | 1529 | return -ENOMEM; |
| 1541 | } | 1530 | } |
| @@ -1544,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new | |||
| 1544 | } | 1533 | } |
| 1545 | 1534 | ||
| 1546 | /* | 1535 | /* |
| 1547 | * Unsharing of sighand for tasks created with CLONE_SIGHAND is not | 1536 | * Unsharing of sighand is not supported yet |
| 1548 | * supported yet | ||
| 1549 | */ | 1537 | */ |
| 1550 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | 1538 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) |
| 1551 | { | 1539 | { |
| 1552 | struct sighand_struct *sigh = current->sighand; | 1540 | struct sighand_struct *sigh = current->sighand; |
| 1553 | 1541 | ||
| 1554 | if ((unshare_flags & CLONE_SIGHAND) && | 1542 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) |
| 1555 | (sigh && atomic_read(&sigh->count) > 1)) | ||
| 1556 | return -EINVAL; | 1543 | return -EINVAL; |
| 1557 | else | 1544 | else |
| 1558 | return 0; | 1545 | return 0; |
| @@ -1625,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1625 | { | 1612 | { |
| 1626 | int err = 0; | 1613 | int err = 0; |
| 1627 | struct fs_struct *fs, *new_fs = NULL; | 1614 | struct fs_struct *fs, *new_fs = NULL; |
| 1628 | struct namespace *ns, *new_ns = NULL; | 1615 | struct mnt_namespace *ns, *new_ns = NULL; |
| 1629 | struct sighand_struct *sigh, *new_sigh = NULL; | 1616 | struct sighand_struct *new_sigh = NULL; |
| 1630 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | 1617 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; |
| 1631 | struct files_struct *fd, *new_fd = NULL; | 1618 | struct files_struct *fd, *new_fd = NULL; |
| 1632 | struct sem_undo_list *new_ulist = NULL; | 1619 | struct sem_undo_list *new_ulist = NULL; |
| @@ -1647,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1647 | goto bad_unshare_out; | 1634 | goto bad_unshare_out; |
| 1648 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1635 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
| 1649 | goto bad_unshare_cleanup_thread; | 1636 | goto bad_unshare_cleanup_thread; |
| 1650 | if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) | 1637 | if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) |
| 1651 | goto bad_unshare_cleanup_fs; | 1638 | goto bad_unshare_cleanup_fs; |
| 1652 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | 1639 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) |
| 1653 | goto bad_unshare_cleanup_ns; | 1640 | goto bad_unshare_cleanup_ns; |
| @@ -1671,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1671 | } | 1658 | } |
| 1672 | } | 1659 | } |
| 1673 | 1660 | ||
| 1674 | if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || | 1661 | if (new_fs || new_ns || new_mm || new_fd || new_ulist || |
| 1675 | new_uts || new_ipc) { | 1662 | new_uts || new_ipc) { |
| 1676 | 1663 | ||
| 1677 | task_lock(current); | 1664 | task_lock(current); |
| @@ -1688,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1688 | } | 1675 | } |
| 1689 | 1676 | ||
| 1690 | if (new_ns) { | 1677 | if (new_ns) { |
| 1691 | ns = current->nsproxy->namespace; | 1678 | ns = current->nsproxy->mnt_ns; |
| 1692 | current->nsproxy->namespace = new_ns; | 1679 | current->nsproxy->mnt_ns = new_ns; |
| 1693 | new_ns = ns; | 1680 | new_ns = ns; |
| 1694 | } | 1681 | } |
| 1695 | 1682 | ||
| 1696 | if (new_sigh) { | ||
| 1697 | sigh = current->sighand; | ||
| 1698 | rcu_assign_pointer(current->sighand, new_sigh); | ||
| 1699 | new_sigh = sigh; | ||
| 1700 | } | ||
| 1701 | |||
| 1702 | if (new_mm) { | 1683 | if (new_mm) { |
| 1703 | mm = current->mm; | 1684 | mm = current->mm; |
| 1704 | active_mm = current->active_mm; | 1685 | active_mm = current->active_mm; |
| @@ -1756,7 +1737,7 @@ bad_unshare_cleanup_sigh: | |||
| 1756 | 1737 | ||
| 1757 | bad_unshare_cleanup_ns: | 1738 | bad_unshare_cleanup_ns: |
| 1758 | if (new_ns) | 1739 | if (new_ns) |
| 1759 | put_namespace(new_ns); | 1740 | put_mnt_ns(new_ns); |
| 1760 | 1741 | ||
| 1761 | bad_unshare_cleanup_fs: | 1742 | bad_unshare_cleanup_fs: |
| 1762 | if (new_fs) | 1743 | if (new_fs) |
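With max_fdset gone, a single max_fds bound covers both the fd array and the fd bitmaps, so count_open_files() only needs to find the highest non-zero word of open_fds to know how many slots dup_fd() must copy. A userspace sketch of that scan; the array here is a stand-in for fdt->open_fds->fds_bits, not the kernel type:

```c
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define MODEL_MAX_FDS	256

/* Stand-in for the open-fd bitmap (assumption about the model, not fdtable). */
static unsigned long open_fds[MODEL_MAX_FDS / (8 * sizeof(unsigned long))];

/* Find the last open fd, rounded up to a whole bitmap word -- this is
 * the number of fd slots dup_fd() ends up copying. */
static int count_open_files(int max_fds)
{
	int i;

	for (i = max_fds / BITS_PER_LONG; i > 0; ) {
		if (open_fds[--i])
			break;
	}
	return (i + 1) * BITS_PER_LONG;
}

int main(void)
{
	memset(open_fds, 0, sizeof(open_fds));
	open_fds[0] = 0x7;	/* fds 0, 1, 2 open */
	open_fds[1] = 0x1;	/* one fd open in the second bitmap word */
	printf("slots to copy: %d\n", count_open_files(MODEL_MAX_FDS));
	return 0;
}
```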
diff --git a/kernel/futex.c b/kernel/futex.c index 95989a3b4168..5a737de857d3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
| 166 | /* | 166 | /* |
| 167 | * Get parameters which are the keys for a futex. | 167 | * Get parameters which are the keys for a futex. |
| 168 | * | 168 | * |
| 169 | * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, | 169 | * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, |
| 170 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 170 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
| 171 | * We can usually work out the index without swapping in the page. | 171 | * We can usually work out the index without swapping in the page. |
| 172 | * | 172 | * |
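The comment above gives the two key forms: (inode, page index, offset-in-page) for shared mappings and (mm, uaddr) for private ones, compared through the overlapping "both" view. A userspace model of that union, with field names taken from how the hunks use it (key->shared.inode, key->shared.pgoff, key->both.offset); the exact member types are an assumption:

```c
#include <stdio.h>

/* Model of union futex_key as the comment above describes it; member
 * types are placeholders, only the three overlapping views matter. */
union futex_key {
	struct { unsigned long pgoff;   void *inode; int offset; } shared;
	struct { unsigned long address; void *mm;    int offset; } private;
	struct { unsigned long word;    void *ptr;   int offset; } both;
};

/* Two futexes are the same when their keys match through the 'both' view. */
static int match_futex(const union futex_key *k1, const union futex_key *k2)
{
	return k1->both.word   == k2->both.word
	    && k1->both.ptr    == k2->both.ptr
	    && k1->both.offset == k2->both.offset;
}

int main(void)
{
	union futex_key a = { .shared = { 7, (void *)0x1000, 1 } };
	union futex_key b = { .shared = { 7, (void *)0x1000, 1 } };

	printf("match=%d\n", match_futex(&a, &b));
	return 0;
}
```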
| @@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) | |||
| 223 | /* | 223 | /* |
| 224 | * Linear file mappings are also simple. | 224 | * Linear file mappings are also simple. |
| 225 | */ | 225 | */ |
| 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_path.dentry->d_inode; |
| 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
| 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
| 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
| @@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal) | |||
| 1528 | goto out; | 1528 | goto out; |
| 1529 | } | 1529 | } |
| 1530 | filp->f_op = &futex_fops; | 1530 | filp->f_op = &futex_fops; |
| 1531 | filp->f_vfsmnt = mntget(futex_mnt); | 1531 | filp->f_path.mnt = mntget(futex_mnt); |
| 1532 | filp->f_dentry = dget(futex_mnt->mnt_root); | 1532 | filp->f_path.dentry = dget(futex_mnt->mnt_root); |
| 1533 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; | 1533 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; |
| 1534 | 1534 | ||
| 1535 | if (signal) { | 1535 | if (signal) { |
| 1536 | err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); | 1536 | err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9a352667007c..61f5c717a8f5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
| 54 | unsigned int irq = (int)(long)data, full_count = count, err; | 54 | unsigned int irq = (int)(long)data, full_count = count, err; |
| 55 | cpumask_t new_value, tmp; | 55 | cpumask_t new_value, tmp; |
| 56 | 56 | ||
| 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) | 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || |
| 58 | CHECK_IRQ_PER_CPU(irq_desc[irq].status)) | ||
| 58 | return -EIO; | 59 | return -EIO; |
| 59 | 60 | ||
| 60 | err = cpumask_parse_user(buffer, count, new_value); | 61 | err = cpumask_parse_user(buffer, count, new_value); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab63cfc42992..6f294ff4f9ee 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -31,14 +31,14 @@ | |||
| 31 | #endif | 31 | #endif |
| 32 | 32 | ||
| 33 | /* These will be re-linked against their real values during the second link stage */ | 33 | /* These will be re-linked against their real values during the second link stage */ |
| 34 | extern unsigned long kallsyms_addresses[] __attribute__((weak)); | 34 | extern const unsigned long kallsyms_addresses[] __attribute__((weak)); |
| 35 | extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); | 35 | extern const unsigned long kallsyms_num_syms __attribute__((weak)); |
| 36 | extern u8 kallsyms_names[] __attribute__((weak)); | 36 | extern const u8 kallsyms_names[] __attribute__((weak)); |
| 37 | 37 | ||
| 38 | extern u8 kallsyms_token_table[] __attribute__((weak)); | 38 | extern const u8 kallsyms_token_table[] __attribute__((weak)); |
| 39 | extern u16 kallsyms_token_index[] __attribute__((weak)); | 39 | extern const u16 kallsyms_token_index[] __attribute__((weak)); |
| 40 | 40 | ||
| 41 | extern unsigned long kallsyms_markers[] __attribute__((weak)); | 41 | extern const unsigned long kallsyms_markers[] __attribute__((weak)); |
| 42 | 42 | ||
| 43 | static inline int is_kernel_inittext(unsigned long addr) | 43 | static inline int is_kernel_inittext(unsigned long addr) |
| 44 | { | 44 | { |
| @@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr) | |||
| 84 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) | 84 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) |
| 85 | { | 85 | { |
| 86 | int len, skipped_first = 0; | 86 | int len, skipped_first = 0; |
| 87 | u8 *tptr, *data; | 87 | const u8 *tptr, *data; |
| 88 | 88 | ||
| 89 | /* get the compressed symbol length from the first symbol byte */ | 89 | /* get the compressed symbol length from the first symbol byte */ |
| 90 | data = &kallsyms_names[off]; | 90 | data = &kallsyms_names[off]; |
| @@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) | |||
| 132 | * kallsyms array */ | 132 | * kallsyms array */ |
| 133 | static unsigned int get_symbol_offset(unsigned long pos) | 133 | static unsigned int get_symbol_offset(unsigned long pos) |
| 134 | { | 134 | { |
| 135 | u8 *name; | 135 | const u8 *name; |
| 136 | int i; | 136 | int i; |
| 137 | 137 | ||
| 138 | /* use the closest marker we have. We have markers every 256 positions, | 138 | /* use the closest marker we have. We have markers every 256 positions, |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 8d2bea09a4ec..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
| 26 | #include <linux/smp_lock.h> | 26 | #include <linux/smp_lock.h> |
| 27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| 28 | #include <linux/namespace.h> | 28 | #include <linux/mnt_namespace.h> |
| 29 | #include <linux/completion.h> | 29 | #include <linux/completion.h> |
| 30 | #include <linux/file.h> | 30 | #include <linux/file.h> |
| 31 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b02032476dc2..01e750559034 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -43,13 +43,49 @@ | |||
| 43 | #include "lockdep_internals.h" | 43 | #include "lockdep_internals.h" |
| 44 | 44 | ||
| 45 | /* | 45 | /* |
| 46 | * hash_lock: protects the lockdep hashes and class/list/hash allocators. | 46 | * lockdep_lock: protects the lockdep graph, the hashes and the |
| 47 | * class/list/hash allocators. | ||
| 47 | * | 48 | * |
| 48 | * This is one of the rare exceptions where it's justified | 49 | * This is one of the rare exceptions where it's justified |
| 49 | * to use a raw spinlock - we really dont want the spinlock | 50 | * to use a raw spinlock - we really dont want the spinlock |
| 50 | * code to recurse back into the lockdep code. | 51 | * code to recurse back into the lockdep code... |
| 51 | */ | 52 | */ |
| 52 | static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | 53 | static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; |
| 54 | |||
| 55 | static int graph_lock(void) | ||
| 56 | { | ||
| 57 | __raw_spin_lock(&lockdep_lock); | ||
| 58 | /* | ||
| 59 | * Make sure that if another CPU detected a bug while | ||
| 60 | * walking the graph we dont change it (while the other | ||
| 61 | * CPU is busy printing out stuff with the graph lock | ||
| 62 | * dropped already) | ||
| 63 | */ | ||
| 64 | if (!debug_locks) { | ||
| 65 | __raw_spin_unlock(&lockdep_lock); | ||
| 66 | return 0; | ||
| 67 | } | ||
| 68 | return 1; | ||
| 69 | } | ||
| 70 | |||
| 71 | static inline int graph_unlock(void) | ||
| 72 | { | ||
| 73 | __raw_spin_unlock(&lockdep_lock); | ||
| 74 | return 0; | ||
| 75 | } | ||
| 76 | |||
| 77 | /* | ||
| 78 | * Turn lock debugging off and return with 0 if it was off already, | ||
| 79 | * and also release the graph lock: | ||
| 80 | */ | ||
| 81 | static inline int debug_locks_off_graph_unlock(void) | ||
| 82 | { | ||
| 83 | int ret = debug_locks_off(); | ||
| 84 | |||
| 85 | __raw_spin_unlock(&lockdep_lock); | ||
| 86 | |||
| 87 | return ret; | ||
| 88 | } | ||
| 53 | 89 | ||
| 54 | static int lockdep_initialized; | 90 | static int lockdep_initialized; |
| 55 | 91 | ||
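The block above renames hash_lock to lockdep_lock and funnels every graph operation through graph_lock(), which backs off if another CPU already disabled lock debugging while printing a report, with debug_locks_off_graph_unlock() combining the error path's "turn debugging off and drop the lock" step. A pthread-based userspace model of that pattern; the mutex and the flag are stand-ins for the raw spinlock and the real debug_locks:

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t graph_mutex = PTHREAD_MUTEX_INITIALIZER;
static int debug_locks = 1;	/* stand-in for the global debug_locks flag */

/* Take the graph lock, but refuse to proceed if debugging was already
 * turned off by a CPU that is busy printing a report. */
static int graph_lock(void)
{
	pthread_mutex_lock(&graph_mutex);
	if (!debug_locks) {
		pthread_mutex_unlock(&graph_mutex);
		return 0;
	}
	return 1;
}

static int graph_unlock(void)
{
	pthread_mutex_unlock(&graph_mutex);
	return 0;
}

/* Error path: disable debugging and drop the graph lock in one helper;
 * returns non-zero only for the caller that actually turned it off. */
static int debug_locks_off_graph_unlock(void)
{
	int ret = debug_locks;

	debug_locks = 0;
	pthread_mutex_unlock(&graph_mutex);
	return ret;
}

int main(void)
{
	if (graph_lock()) {
		/* ... walk or extend the dependency graph here ... */
		graph_unlock();
	}
	if (graph_lock()) {
		/* hit a limit: report once, then stay quiet */
		if (debug_locks_off_graph_unlock())
			printf("BUG: limit too low, validator disabled\n");
	}
	printf("debug_locks=%d\n", debug_locks);
	return 0;
}
```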
| @@ -57,14 +93,15 @@ unsigned long nr_list_entries; | |||
| 57 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | 93 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; |
| 58 | 94 | ||
| 59 | /* | 95 | /* |
| 60 | * Allocate a lockdep entry. (assumes hash_lock held, returns | 96 | * Allocate a lockdep entry. (assumes the graph_lock held, returns |
| 61 | * with NULL on failure) | 97 | * with NULL on failure) |
| 62 | */ | 98 | */ |
| 63 | static struct lock_list *alloc_list_entry(void) | 99 | static struct lock_list *alloc_list_entry(void) |
| 64 | { | 100 | { |
| 65 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | 101 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { |
| 66 | __raw_spin_unlock(&hash_lock); | 102 | if (!debug_locks_off_graph_unlock()) |
| 67 | debug_locks_off(); | 103 | return NULL; |
| 104 | |||
| 68 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | 105 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); |
| 69 | printk("turning off the locking correctness validator.\n"); | 106 | printk("turning off the locking correctness validator.\n"); |
| 70 | return NULL; | 107 | return NULL; |
| @@ -145,9 +182,7 @@ EXPORT_SYMBOL(lockdep_on); | |||
| 145 | */ | 182 | */ |
| 146 | 183 | ||
| 147 | #define VERBOSE 0 | 184 | #define VERBOSE 0 |
| 148 | #ifdef VERBOSE | 185 | #define VERY_VERBOSE 0 |
| 149 | # define VERY_VERBOSE 0 | ||
| 150 | #endif | ||
| 151 | 186 | ||
| 152 | #if VERBOSE | 187 | #if VERBOSE |
| 153 | # define HARDIRQ_VERBOSE 1 | 188 | # define HARDIRQ_VERBOSE 1 |
| @@ -172,8 +207,8 @@ static int class_filter(struct lock_class *class) | |||
| 172 | !strcmp(class->name, "&struct->lockfield")) | 207 | !strcmp(class->name, "&struct->lockfield")) |
| 173 | return 1; | 208 | return 1; |
| 174 | #endif | 209 | #endif |
| 175 | /* Allow everything else. 0 would be filter everything else */ | 210 | /* Filter everything else. 1 would be to allow everything else */ |
| 176 | return 1; | 211 | return 0; |
| 177 | } | 212 | } |
| 178 | #endif | 213 | #endif |
| 179 | 214 | ||
| @@ -207,7 +242,7 @@ static int softirq_verbose(struct lock_class *class) | |||
| 207 | 242 | ||
| 208 | /* | 243 | /* |
| 209 | * Stack-trace: tightly packed array of stack backtrace | 244 | * Stack-trace: tightly packed array of stack backtrace |
| 210 | * addresses. Protected by the hash_lock. | 245 | * addresses. Protected by the graph_lock. |
| 211 | */ | 246 | */ |
| 212 | unsigned long nr_stack_trace_entries; | 247 | unsigned long nr_stack_trace_entries; |
| 213 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | 248 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; |
| @@ -226,18 +261,15 @@ static int save_trace(struct stack_trace *trace) | |||
| 226 | trace->max_entries = trace->nr_entries; | 261 | trace->max_entries = trace->nr_entries; |
| 227 | 262 | ||
| 228 | nr_stack_trace_entries += trace->nr_entries; | 263 | nr_stack_trace_entries += trace->nr_entries; |
| 229 | if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) { | ||
| 230 | __raw_spin_unlock(&hash_lock); | ||
| 231 | return 0; | ||
| 232 | } | ||
| 233 | 264 | ||
| 234 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { | 265 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { |
| 235 | __raw_spin_unlock(&hash_lock); | 266 | if (!debug_locks_off_graph_unlock()) |
| 236 | if (debug_locks_off()) { | 267 | return 0; |
| 237 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); | 268 | |
| 238 | printk("turning off the locking correctness validator.\n"); | 269 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); |
| 239 | dump_stack(); | 270 | printk("turning off the locking correctness validator.\n"); |
| 240 | } | 271 | dump_stack(); |
| 272 | |||
| 241 | return 0; | 273 | return 0; |
| 242 | } | 274 | } |
| 243 | 275 | ||
| @@ -526,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) | |||
| 526 | { | 558 | { |
| 527 | struct task_struct *curr = current; | 559 | struct task_struct *curr = current; |
| 528 | 560 | ||
| 529 | __raw_spin_unlock(&hash_lock); | 561 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 530 | debug_locks_off(); | ||
| 531 | if (debug_locks_silent) | ||
| 532 | return 0; | 562 | return 0; |
| 533 | 563 | ||
| 534 | printk("\n=======================================================\n"); | 564 | printk("\n=======================================================\n"); |
| @@ -556,12 +586,10 @@ static noinline int print_circular_bug_tail(void) | |||
| 556 | if (debug_locks_silent) | 586 | if (debug_locks_silent) |
| 557 | return 0; | 587 | return 0; |
| 558 | 588 | ||
| 559 | /* hash_lock unlocked by the header */ | ||
| 560 | __raw_spin_lock(&hash_lock); | ||
| 561 | this.class = check_source->class; | 589 | this.class = check_source->class; |
| 562 | if (!save_trace(&this.trace)) | 590 | if (!save_trace(&this.trace)) |
| 563 | return 0; | 591 | return 0; |
| 564 | __raw_spin_unlock(&hash_lock); | 592 | |
| 565 | print_circular_bug_entry(&this, 0); | 593 | print_circular_bug_entry(&this, 0); |
| 566 | 594 | ||
| 567 | printk("\nother info that might help us debug this:\n\n"); | 595 | printk("\nother info that might help us debug this:\n\n"); |
| @@ -577,8 +605,10 @@ static noinline int print_circular_bug_tail(void) | |||
| 577 | 605 | ||
| 578 | static int noinline print_infinite_recursion_bug(void) | 606 | static int noinline print_infinite_recursion_bug(void) |
| 579 | { | 607 | { |
| 580 | __raw_spin_unlock(&hash_lock); | 608 | if (!debug_locks_off_graph_unlock()) |
| 581 | DEBUG_LOCKS_WARN_ON(1); | 609 | return 0; |
| 610 | |||
| 611 | WARN_ON(1); | ||
| 582 | 612 | ||
| 583 | return 0; | 613 | return 0; |
| 584 | } | 614 | } |
| @@ -713,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 713 | enum lock_usage_bit bit2, | 743 | enum lock_usage_bit bit2, |
| 714 | const char *irqclass) | 744 | const char *irqclass) |
| 715 | { | 745 | { |
| 716 | __raw_spin_unlock(&hash_lock); | 746 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 717 | debug_locks_off(); | ||
| 718 | if (debug_locks_silent) | ||
| 719 | return 0; | 747 | return 0; |
| 720 | 748 | ||
| 721 | printk("\n======================================================\n"); | 749 | printk("\n======================================================\n"); |
| @@ -796,9 +824,7 @@ static int | |||
| 796 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 824 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
| 797 | struct held_lock *next) | 825 | struct held_lock *next) |
| 798 | { | 826 | { |
| 799 | debug_locks_off(); | 827 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 800 | __raw_spin_unlock(&hash_lock); | ||
| 801 | if (debug_locks_silent) | ||
| 802 | return 0; | 828 | return 0; |
| 803 | 829 | ||
| 804 | printk("\n=============================================\n"); | 830 | printk("\n=============================================\n"); |
| @@ -974,14 +1000,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 974 | * Debugging printouts: | 1000 | * Debugging printouts: |
| 975 | */ | 1001 | */ |
| 976 | if (verbose(prev->class) || verbose(next->class)) { | 1002 | if (verbose(prev->class) || verbose(next->class)) { |
| 977 | __raw_spin_unlock(&hash_lock); | 1003 | graph_unlock(); |
| 978 | printk("\n new dependency: "); | 1004 | printk("\n new dependency: "); |
| 979 | print_lock_name(prev->class); | 1005 | print_lock_name(prev->class); |
| 980 | printk(" => "); | 1006 | printk(" => "); |
| 981 | print_lock_name(next->class); | 1007 | print_lock_name(next->class); |
| 982 | printk("\n"); | 1008 | printk("\n"); |
| 983 | dump_stack(); | 1009 | dump_stack(); |
| 984 | __raw_spin_lock(&hash_lock); | 1010 | return graph_lock(); |
| 985 | } | 1011 | } |
| 986 | return 1; | 1012 | return 1; |
| 987 | } | 1013 | } |
| @@ -1046,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
| 1046 | } | 1072 | } |
| 1047 | return 1; | 1073 | return 1; |
| 1048 | out_bug: | 1074 | out_bug: |
| 1049 | __raw_spin_unlock(&hash_lock); | 1075 | if (!debug_locks_off_graph_unlock()) |
| 1050 | DEBUG_LOCKS_WARN_ON(1); | 1076 | return 0; |
| 1077 | |||
| 1078 | WARN_ON(1); | ||
| 1051 | 1079 | ||
| 1052 | return 0; | 1080 | return 0; |
| 1053 | } | 1081 | } |
| @@ -1201,7 +1229,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1201 | hash_head = classhashentry(key); | 1229 | hash_head = classhashentry(key); |
| 1202 | 1230 | ||
| 1203 | raw_local_irq_save(flags); | 1231 | raw_local_irq_save(flags); |
| 1204 | __raw_spin_lock(&hash_lock); | 1232 | if (!graph_lock()) { |
| 1233 | raw_local_irq_restore(flags); | ||
| 1234 | return NULL; | ||
| 1235 | } | ||
| 1205 | /* | 1236 | /* |
| 1206 | * We have to do the hash-walk again, to avoid races | 1237 | * We have to do the hash-walk again, to avoid races |
| 1207 | * with another CPU: | 1238 | * with another CPU: |
| @@ -1214,9 +1245,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1214 | * the hash: | 1245 | * the hash: |
| 1215 | */ | 1246 | */ |
| 1216 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 1247 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { |
| 1217 | __raw_spin_unlock(&hash_lock); | 1248 | if (!debug_locks_off_graph_unlock()) { |
| 1249 | raw_local_irq_restore(flags); | ||
| 1250 | return NULL; | ||
| 1251 | } | ||
| 1218 | raw_local_irq_restore(flags); | 1252 | raw_local_irq_restore(flags); |
| 1219 | debug_locks_off(); | 1253 | |
| 1220 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | 1254 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); |
| 1221 | printk("turning off the locking correctness validator.\n"); | 1255 | printk("turning off the locking correctness validator.\n"); |
| 1222 | return NULL; | 1256 | return NULL; |
| @@ -1237,18 +1271,23 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1237 | list_add_tail_rcu(&class->hash_entry, hash_head); | 1271 | list_add_tail_rcu(&class->hash_entry, hash_head); |
| 1238 | 1272 | ||
| 1239 | if (verbose(class)) { | 1273 | if (verbose(class)) { |
| 1240 | __raw_spin_unlock(&hash_lock); | 1274 | graph_unlock(); |
| 1241 | raw_local_irq_restore(flags); | 1275 | raw_local_irq_restore(flags); |
| 1276 | |||
| 1242 | printk("\nnew class %p: %s", class->key, class->name); | 1277 | printk("\nnew class %p: %s", class->key, class->name); |
| 1243 | if (class->name_version > 1) | 1278 | if (class->name_version > 1) |
| 1244 | printk("#%d", class->name_version); | 1279 | printk("#%d", class->name_version); |
| 1245 | printk("\n"); | 1280 | printk("\n"); |
| 1246 | dump_stack(); | 1281 | dump_stack(); |
| 1282 | |||
| 1247 | raw_local_irq_save(flags); | 1283 | raw_local_irq_save(flags); |
| 1248 | __raw_spin_lock(&hash_lock); | 1284 | if (!graph_lock()) { |
| 1285 | raw_local_irq_restore(flags); | ||
| 1286 | return NULL; | ||
| 1287 | } | ||
| 1249 | } | 1288 | } |
| 1250 | out_unlock_set: | 1289 | out_unlock_set: |
| 1251 | __raw_spin_unlock(&hash_lock); | 1290 | graph_unlock(); |
| 1252 | raw_local_irq_restore(flags); | 1291 | raw_local_irq_restore(flags); |
| 1253 | 1292 | ||
| 1254 | if (!subclass || force) | 1293 | if (!subclass || force) |
| @@ -1264,7 +1303,7 @@ out_unlock_set: | |||
| 1264 | * add it and return 0 - in this case the new dependency chain is | 1303 | * add it and return 0 - in this case the new dependency chain is |
| 1265 | * validated. If the key is already hashed, return 1. | 1304 | * validated. If the key is already hashed, return 1. |
| 1266 | */ | 1305 | */ |
| 1267 | static inline int lookup_chain_cache(u64 chain_key) | 1306 | static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) |
| 1268 | { | 1307 | { |
| 1269 | struct list_head *hash_head = chainhashentry(chain_key); | 1308 | struct list_head *hash_head = chainhashentry(chain_key); |
| 1270 | struct lock_chain *chain; | 1309 | struct lock_chain *chain; |
| @@ -1278,34 +1317,32 @@ static inline int lookup_chain_cache(u64 chain_key) | |||
| 1278 | if (chain->chain_key == chain_key) { | 1317 | if (chain->chain_key == chain_key) { |
| 1279 | cache_hit: | 1318 | cache_hit: |
| 1280 | debug_atomic_inc(&chain_lookup_hits); | 1319 | debug_atomic_inc(&chain_lookup_hits); |
| 1281 | /* | 1320 | if (very_verbose(class)) |
| 1282 | * In the debugging case, force redundant checking | 1321 | printk("\nhash chain already cached, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name); |
| 1283 | * by returning 1: | ||
| 1284 | */ | ||
| 1285 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 1286 | __raw_spin_lock(&hash_lock); | ||
| 1287 | return 1; | ||
| 1288 | #endif | ||
| 1289 | return 0; | 1322 | return 0; |
| 1290 | } | 1323 | } |
| 1291 | } | 1324 | } |
| 1325 | if (very_verbose(class)) | ||
| 1326 | printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name); | ||
| 1292 | /* | 1327 | /* |
| 1293 | * Allocate a new chain entry from the static array, and add | 1328 | * Allocate a new chain entry from the static array, and add |
| 1294 | * it to the hash: | 1329 | * it to the hash: |
| 1295 | */ | 1330 | */ |
| 1296 | __raw_spin_lock(&hash_lock); | 1331 | if (!graph_lock()) |
| 1332 | return 0; | ||
| 1297 | /* | 1333 | /* |
| 1298 | * We have to walk the chain again locked - to avoid duplicates: | 1334 | * We have to walk the chain again locked - to avoid duplicates: |
| 1299 | */ | 1335 | */ |
| 1300 | list_for_each_entry(chain, hash_head, entry) { | 1336 | list_for_each_entry(chain, hash_head, entry) { |
| 1301 | if (chain->chain_key == chain_key) { | 1337 | if (chain->chain_key == chain_key) { |
| 1302 | __raw_spin_unlock(&hash_lock); | 1338 | graph_unlock(); |
| 1303 | goto cache_hit; | 1339 | goto cache_hit; |
| 1304 | } | 1340 | } |
| 1305 | } | 1341 | } |
| 1306 | if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { | 1342 | if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { |
| 1307 | __raw_spin_unlock(&hash_lock); | 1343 | if (!debug_locks_off_graph_unlock()) |
| 1308 | debug_locks_off(); | 1344 | return 0; |
| 1345 | |||
| 1309 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); | 1346 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); |
| 1310 | printk("turning off the locking correctness validator.\n"); | 1347 | printk("turning off the locking correctness validator.\n"); |
| 1311 | return 0; | 1348 | return 0; |
| @@ -1381,9 +1418,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, | |||
| 1381 | struct held_lock *this, int forwards, | 1418 | struct held_lock *this, int forwards, |
| 1382 | const char *irqclass) | 1419 | const char *irqclass) |
| 1383 | { | 1420 | { |
| 1384 | __raw_spin_unlock(&hash_lock); | 1421 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1385 | debug_locks_off(); | ||
| 1386 | if (debug_locks_silent) | ||
| 1387 | return 0; | 1422 | return 0; |
| 1388 | 1423 | ||
| 1389 | printk("\n=========================================================\n"); | 1424 | printk("\n=========================================================\n"); |
| @@ -1453,7 +1488,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
| 1453 | return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); | 1488 | return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); |
| 1454 | } | 1489 | } |
| 1455 | 1490 | ||
| 1456 | static inline void print_irqtrace_events(struct task_struct *curr) | 1491 | void print_irqtrace_events(struct task_struct *curr) |
| 1457 | { | 1492 | { |
| 1458 | printk("irq event stamp: %u\n", curr->irq_events); | 1493 | printk("irq event stamp: %u\n", curr->irq_events); |
| 1459 | printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); | 1494 | printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); |
| @@ -1466,19 +1501,13 @@ static inline void print_irqtrace_events(struct task_struct *curr) | |||
| 1466 | print_ip_sym(curr->softirq_disable_ip); | 1501 | print_ip_sym(curr->softirq_disable_ip); |
| 1467 | } | 1502 | } |
| 1468 | 1503 | ||
| 1469 | #else | ||
| 1470 | static inline void print_irqtrace_events(struct task_struct *curr) | ||
| 1471 | { | ||
| 1472 | } | ||
| 1473 | #endif | 1504 | #endif |
| 1474 | 1505 | ||
| 1475 | static int | 1506 | static int |
| 1476 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 1507 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
| 1477 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 1508 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
| 1478 | { | 1509 | { |
| 1479 | __raw_spin_unlock(&hash_lock); | 1510 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1480 | debug_locks_off(); | ||
| 1481 | if (debug_locks_silent) | ||
| 1482 | return 0; | 1511 | return 0; |
| 1483 | 1512 | ||
| 1484 | printk("\n=================================\n"); | 1513 | printk("\n=================================\n"); |
| @@ -1539,12 +1568,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 1539 | if (likely(this->class->usage_mask & new_mask)) | 1568 | if (likely(this->class->usage_mask & new_mask)) |
| 1540 | return 1; | 1569 | return 1; |
| 1541 | 1570 | ||
| 1542 | __raw_spin_lock(&hash_lock); | 1571 | if (!graph_lock()) |
| 1572 | return 0; | ||
| 1543 | /* | 1573 | /* |
| 1544 | * Make sure we didnt race: | 1574 | * Make sure we didnt race: |
| 1545 | */ | 1575 | */ |
| 1546 | if (unlikely(this->class->usage_mask & new_mask)) { | 1576 | if (unlikely(this->class->usage_mask & new_mask)) { |
| 1547 | __raw_spin_unlock(&hash_lock); | 1577 | graph_unlock(); |
| 1548 | return 1; | 1578 | return 1; |
| 1549 | } | 1579 | } |
| 1550 | 1580 | ||
| @@ -1730,16 +1760,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 1730 | debug_atomic_dec(&nr_unused_locks); | 1760 | debug_atomic_dec(&nr_unused_locks); |
| 1731 | break; | 1761 | break; |
| 1732 | default: | 1762 | default: |
| 1733 | __raw_spin_unlock(&hash_lock); | 1763 | if (!debug_locks_off_graph_unlock()) |
| 1734 | debug_locks_off(); | 1764 | return 0; |
| 1735 | WARN_ON(1); | 1765 | WARN_ON(1); |
| 1736 | return 0; | 1766 | return 0; |
| 1737 | } | 1767 | } |
| 1738 | 1768 | ||
| 1739 | __raw_spin_unlock(&hash_lock); | 1769 | graph_unlock(); |
| 1740 | 1770 | ||
| 1741 | /* | 1771 | /* |
| 1742 | * We must printk outside of the hash_lock: | 1772 | * We must printk outside of the graph_lock: |
| 1743 | */ | 1773 | */ |
| 1744 | if (ret == 2) { | 1774 | if (ret == 2) { |
| 1745 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | 1775 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); |
| @@ -2137,9 +2167,9 @@ out_calc_hash: | |||
| 2137 | * We look up the chain_key and do the O(N^2) check and update of | 2167 | * We look up the chain_key and do the O(N^2) check and update of |
| 2138 | * the dependencies only if this is a new dependency chain. | 2168 | * the dependencies only if this is a new dependency chain. |
| 2139 | * (If lookup_chain_cache() returns with 1 it acquires | 2169 | * (If lookup_chain_cache() returns with 1 it acquires |
| 2140 | * hash_lock for us) | 2170 | * graph_lock for us) |
| 2141 | */ | 2171 | */ |
| 2142 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { | 2172 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { |
| 2143 | /* | 2173 | /* |
| 2144 | * Check whether last held lock: | 2174 | * Check whether last held lock: |
| 2145 | * | 2175 | * |
| @@ -2170,7 +2200,7 @@ out_calc_hash: | |||
| 2170 | if (!chain_head && ret != 2) | 2200 | if (!chain_head && ret != 2) |
| 2171 | if (!check_prevs_add(curr, hlock)) | 2201 | if (!check_prevs_add(curr, hlock)) |
| 2172 | return 0; | 2202 | return 0; |
| 2173 | __raw_spin_unlock(&hash_lock); | 2203 | graph_unlock(); |
| 2174 | } | 2204 | } |
| 2175 | curr->lockdep_depth++; | 2205 | curr->lockdep_depth++; |
| 2176 | check_chain_key(curr); | 2206 | check_chain_key(curr); |
| @@ -2433,6 +2463,7 @@ EXPORT_SYMBOL_GPL(lock_release); | |||
| 2433 | void lockdep_reset(void) | 2463 | void lockdep_reset(void) |
| 2434 | { | 2464 | { |
| 2435 | unsigned long flags; | 2465 | unsigned long flags; |
| 2466 | int i; | ||
| 2436 | 2467 | ||
| 2437 | raw_local_irq_save(flags); | 2468 | raw_local_irq_save(flags); |
| 2438 | current->curr_chain_key = 0; | 2469 | current->curr_chain_key = 0; |
| @@ -2443,6 +2474,8 @@ void lockdep_reset(void) | |||
| 2443 | nr_softirq_chains = 0; | 2474 | nr_softirq_chains = 0; |
| 2444 | nr_process_chains = 0; | 2475 | nr_process_chains = 0; |
| 2445 | debug_locks = 1; | 2476 | debug_locks = 1; |
| 2477 | for (i = 0; i < CHAINHASH_SIZE; i++) | ||
| 2478 | INIT_LIST_HEAD(chainhash_table + i); | ||
| 2446 | raw_local_irq_restore(flags); | 2479 | raw_local_irq_restore(flags); |
| 2447 | } | 2480 | } |
| 2448 | 2481 | ||
| @@ -2479,7 +2512,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 2479 | int i; | 2512 | int i; |
| 2480 | 2513 | ||
| 2481 | raw_local_irq_save(flags); | 2514 | raw_local_irq_save(flags); |
| 2482 | __raw_spin_lock(&hash_lock); | 2515 | graph_lock(); |
| 2483 | 2516 | ||
| 2484 | /* | 2517 | /* |
| 2485 | * Unhash all classes that were created by this module: | 2518 | * Unhash all classes that were created by this module: |
| @@ -2493,7 +2526,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 2493 | zap_class(class); | 2526 | zap_class(class); |
| 2494 | } | 2527 | } |
| 2495 | 2528 | ||
| 2496 | __raw_spin_unlock(&hash_lock); | 2529 | graph_unlock(); |
| 2497 | raw_local_irq_restore(flags); | 2530 | raw_local_irq_restore(flags); |
| 2498 | } | 2531 | } |
| 2499 | 2532 | ||
| @@ -2521,20 +2554,20 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
| 2521 | * Debug check: in the end all mapped classes should | 2554 | * Debug check: in the end all mapped classes should |
| 2522 | * be gone. | 2555 | * be gone. |
| 2523 | */ | 2556 | */ |
| 2524 | __raw_spin_lock(&hash_lock); | 2557 | graph_lock(); |
| 2525 | for (i = 0; i < CLASSHASH_SIZE; i++) { | 2558 | for (i = 0; i < CLASSHASH_SIZE; i++) { |
| 2526 | head = classhash_table + i; | 2559 | head = classhash_table + i; |
| 2527 | if (list_empty(head)) | 2560 | if (list_empty(head)) |
| 2528 | continue; | 2561 | continue; |
| 2529 | list_for_each_entry_safe(class, next, head, hash_entry) { | 2562 | list_for_each_entry_safe(class, next, head, hash_entry) { |
| 2530 | if (unlikely(class == lock->class_cache)) { | 2563 | if (unlikely(class == lock->class_cache)) { |
| 2531 | __raw_spin_unlock(&hash_lock); | 2564 | if (debug_locks_off_graph_unlock()) |
| 2532 | DEBUG_LOCKS_WARN_ON(1); | 2565 | WARN_ON(1); |
| 2533 | goto out_restore; | 2566 | goto out_restore; |
| 2534 | } | 2567 | } |
| 2535 | } | 2568 | } |
| 2536 | } | 2569 | } |
| 2537 | __raw_spin_unlock(&hash_lock); | 2570 | graph_unlock(); |
| 2538 | 2571 | ||
| 2539 | out_restore: | 2572 | out_restore: |
| 2540 | raw_local_irq_restore(flags); | 2573 | raw_local_irq_restore(flags); |
diff --git a/kernel/module.c b/kernel/module.c index d9eae45d0145..b565eaeff7e6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -824,9 +824,34 @@ static inline void module_unload_init(struct module *mod) | |||
| 824 | } | 824 | } |
| 825 | #endif /* CONFIG_MODULE_UNLOAD */ | 825 | #endif /* CONFIG_MODULE_UNLOAD */ |
| 826 | 826 | ||
| 827 | static ssize_t show_initstate(struct module_attribute *mattr, | ||
| 828 | struct module *mod, char *buffer) | ||
| 829 | { | ||
| 830 | const char *state = "unknown"; | ||
| 831 | |||
| 832 | switch (mod->state) { | ||
| 833 | case MODULE_STATE_LIVE: | ||
| 834 | state = "live"; | ||
| 835 | break; | ||
| 836 | case MODULE_STATE_COMING: | ||
| 837 | state = "coming"; | ||
| 838 | break; | ||
| 839 | case MODULE_STATE_GOING: | ||
| 840 | state = "going"; | ||
| 841 | break; | ||
| 842 | } | ||
| 843 | return sprintf(buffer, "%s\n", state); | ||
| 844 | } | ||
| 845 | |||
| 846 | static struct module_attribute initstate = { | ||
| 847 | .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, | ||
| 848 | .show = show_initstate, | ||
| 849 | }; | ||
| 850 | |||
| 827 | static struct module_attribute *modinfo_attrs[] = { | 851 | static struct module_attribute *modinfo_attrs[] = { |
| 828 | &modinfo_version, | 852 | &modinfo_version, |
| 829 | &modinfo_srcversion, | 853 | &modinfo_srcversion, |
| 854 | &initstate, | ||
| 830 | #ifdef CONFIG_MODULE_UNLOAD | 855 | #ifdef CONFIG_MODULE_UNLOAD |
| 831 | &refcnt, | 856 | &refcnt, |
| 832 | #endif | 857 | #endif |
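
Editor's note: with this attribute registered, each loaded module gains /sys/module/<name>/initstate, which reads back "live", "coming" or "going". A small user-space reader, using the 'loop' module purely as an example name:

    #include <stdio.h>

    int main(void)
    {
        char state[32] = "";
        FILE *f = fopen("/sys/module/loop/initstate", "r");

        if (!f) {
            perror("initstate");
            return 1;
        }
        if (fgets(state, sizeof(state), f))
            printf("loop is %s", state);    /* e.g. "live" */
        fclose(f);
        return 0;
    }
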
diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf72a497..e7cbbb82765b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
| 206 | } | 206 | } |
| 207 | 207 | ||
| 208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
| 209 | |||
| 210 | int __sched | ||
| 211 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | ||
| 212 | { | ||
| 213 | might_sleep(); | ||
| 214 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); | ||
| 215 | } | ||
| 216 | |||
| 217 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | ||
| 209 | #endif | 218 | #endif |
| 210 | 219 | ||
| 211 | /* | 220 | /* |
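
Editor's note: mutex_lock_interruptible_nested() completes the _nested API: the subclass only feeds the lockdep annotation, and the return value is 0 or -EINTR exactly as with mutex_lock_interruptible(). A sketch of a hypothetical caller nesting two locks of the same class (struct foo and the locking order are made up for illustration):

    static int foo_transfer(struct foo *src, struct foo *dst)
    {
        int err;

        mutex_lock_nested(&src->lock, 0);       /* outer lock, subclass 0 */

        /* inner lock of the same class, distinct subclass for lockdep */
        err = mutex_lock_interruptible_nested(&dst->lock, SINGLE_DEPTH_NESTING);
        if (err) {                              /* -EINTR if a signal arrived */
            mutex_unlock(&src->lock);
            return err;
        }

        /* ... move state from src to dst ... */

        mutex_unlock(&dst->lock);
        mutex_unlock(&src->lock);
        return 0;
    }
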
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 674aceb7335a..f5b9ee6f6bbb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -17,8 +17,9 @@ | |||
| 17 | #include <linux/version.h> | 17 | #include <linux/version.h> |
| 18 | #include <linux/nsproxy.h> | 18 | #include <linux/nsproxy.h> |
| 19 | #include <linux/init_task.h> | 19 | #include <linux/init_task.h> |
| 20 | #include <linux/namespace.h> | 20 | #include <linux/mnt_namespace.h> |
| 21 | #include <linux/utsname.h> | 21 | #include <linux/utsname.h> |
| 22 | #include <linux/pid_namespace.h> | ||
| 22 | 23 | ||
| 23 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); | 24 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); |
| 24 | 25 | ||
| @@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) | |||
| 60 | struct nsproxy *ns = clone_namespaces(orig); | 61 | struct nsproxy *ns = clone_namespaces(orig); |
| 61 | 62 | ||
| 62 | if (ns) { | 63 | if (ns) { |
| 63 | if (ns->namespace) | 64 | if (ns->mnt_ns) |
| 64 | get_namespace(ns->namespace); | 65 | get_mnt_ns(ns->mnt_ns); |
| 65 | if (ns->uts_ns) | 66 | if (ns->uts_ns) |
| 66 | get_uts_ns(ns->uts_ns); | 67 | get_uts_ns(ns->uts_ns); |
| 67 | if (ns->ipc_ns) | 68 | if (ns->ipc_ns) |
| 68 | get_ipc_ns(ns->ipc_ns); | 69 | get_ipc_ns(ns->ipc_ns); |
| 70 | if (ns->pid_ns) | ||
| 71 | get_pid_ns(ns->pid_ns); | ||
| 69 | } | 72 | } |
| 70 | 73 | ||
| 71 | return ns; | 74 | return ns; |
| @@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
| 97 | 100 | ||
| 98 | tsk->nsproxy = new_ns; | 101 | tsk->nsproxy = new_ns; |
| 99 | 102 | ||
| 100 | err = copy_namespace(flags, tsk); | 103 | err = copy_mnt_ns(flags, tsk); |
| 101 | if (err) | 104 | if (err) |
| 102 | goto out_ns; | 105 | goto out_ns; |
| 103 | 106 | ||
| @@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
| 109 | if (err) | 112 | if (err) |
| 110 | goto out_ipc; | 113 | goto out_ipc; |
| 111 | 114 | ||
| 115 | err = copy_pid_ns(flags, tsk); | ||
| 116 | if (err) | ||
| 117 | goto out_pid; | ||
| 118 | |||
| 112 | out: | 119 | out: |
| 113 | put_nsproxy(old_ns); | 120 | put_nsproxy(old_ns); |
| 114 | return err; | 121 | return err; |
| 115 | 122 | ||
| 123 | out_pid: | ||
| 124 | if (new_ns->ipc_ns) | ||
| 125 | put_ipc_ns(new_ns->ipc_ns); | ||
| 116 | out_ipc: | 126 | out_ipc: |
| 117 | if (new_ns->uts_ns) | 127 | if (new_ns->uts_ns) |
| 118 | put_uts_ns(new_ns->uts_ns); | 128 | put_uts_ns(new_ns->uts_ns); |
| 119 | out_uts: | 129 | out_uts: |
| 120 | if (new_ns->namespace) | 130 | if (new_ns->mnt_ns) |
| 121 | put_namespace(new_ns->namespace); | 131 | put_mnt_ns(new_ns->mnt_ns); |
| 122 | out_ns: | 132 | out_ns: |
| 123 | tsk->nsproxy = old_ns; | 133 | tsk->nsproxy = old_ns; |
| 124 | kfree(new_ns); | 134 | kfree(new_ns); |
| @@ -127,11 +137,13 @@ out_ns: | |||
| 127 | 137 | ||
| 128 | void free_nsproxy(struct nsproxy *ns) | 138 | void free_nsproxy(struct nsproxy *ns) |
| 129 | { | 139 | { |
| 130 | if (ns->namespace) | 140 | if (ns->mnt_ns) |
| 131 | put_namespace(ns->namespace); | 141 | put_mnt_ns(ns->mnt_ns); |
| 132 | if (ns->uts_ns) | 142 | if (ns->uts_ns) |
| 133 | put_uts_ns(ns->uts_ns); | 143 | put_uts_ns(ns->uts_ns); |
| 134 | if (ns->ipc_ns) | 144 | if (ns->ipc_ns) |
| 135 | put_ipc_ns(ns->ipc_ns); | 145 | put_ipc_ns(ns->ipc_ns); |
| 136 | kfree(ns); | 146 | if (ns->pid_ns) |
| 147 | put_pid_ns(ns->pid_ns); | ||
| 148 | kfree(ns); | ||
| 137 | } | 149 | } |
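
Editor's note: the pid namespace pointer follows the same lifetime pattern as the other members of nsproxy: copy_pid_ns() takes a reference on clone, free_nsproxy() drops it, and the final put ends up in free_pid_ns() via the kref added to struct pid_namespace. A sketch of what the get/put helpers amount to, assuming they are the usual thin kref wrappers:

    static inline void get_pid_ns(struct pid_namespace *ns)
    {
        kref_get(&ns->kref);
    }

    static inline void put_pid_ns(struct pid_namespace *ns)
    {
        /* free_pid_ns() runs only when the last reference is dropped */
        kref_put(&ns->kref, free_pid_ns);
    }
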
diff --git a/kernel/pid.c b/kernel/pid.c index a48879b0b921..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -26,7 +26,7 @@ | |||
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
| 28 | #include <linux/hash.h> | 28 | #include <linux/hash.h> |
| 29 | #include <linux/pspace.h> | 29 | #include <linux/pid_namespace.h> |
| 30 | 30 | ||
| 31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) | 31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) |
| 32 | static struct hlist_head *pid_hash; | 32 | static struct hlist_head *pid_hash; |
| @@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; | |||
| 43 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 43 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
| 44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | 44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) |
| 45 | 45 | ||
| 46 | static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | 46 | static inline int mk_pid(struct pid_namespace *pid_ns, |
| 47 | struct pidmap *map, int off) | ||
| 47 | { | 48 | { |
| 48 | return (map - pspace->pidmap)*BITS_PER_PAGE + off; | 49 | return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; |
| 49 | } | 50 | } |
| 50 | 51 | ||
| 51 | #define find_next_offset(map, off) \ | 52 | #define find_next_offset(map, off) \ |
| @@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | |||
| 57 | * value does not cause lots of bitmaps to be allocated, but | 58 | * value does not cause lots of bitmaps to be allocated, but |
| 58 | * the scheme scales to up to 4 million PIDs, runtime. | 59 | * the scheme scales to up to 4 million PIDs, runtime. |
| 59 | */ | 60 | */ |
| 60 | struct pspace init_pspace = { | 61 | struct pid_namespace init_pid_ns = { |
| 62 | .kref = { | ||
| 63 | .refcount = ATOMIC_INIT(2), | ||
| 64 | }, | ||
| 61 | .pidmap = { | 65 | .pidmap = { |
| 62 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } | 66 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } |
| 63 | }, | 67 | }, |
| 64 | .last_pid = 0 | 68 | .last_pid = 0, |
| 69 | .child_reaper = &init_task | ||
| 65 | }; | 70 | }; |
| 66 | 71 | ||
| 67 | /* | 72 | /* |
| @@ -80,25 +85,25 @@ struct pspace init_pspace = { | |||
| 80 | 85 | ||
| 81 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 86 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
| 82 | 87 | ||
| 83 | static fastcall void free_pidmap(struct pspace *pspace, int pid) | 88 | static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) |
| 84 | { | 89 | { |
| 85 | struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; | 90 | struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; |
| 86 | int offset = pid & BITS_PER_PAGE_MASK; | 91 | int offset = pid & BITS_PER_PAGE_MASK; |
| 87 | 92 | ||
| 88 | clear_bit(offset, map->page); | 93 | clear_bit(offset, map->page); |
| 89 | atomic_inc(&map->nr_free); | 94 | atomic_inc(&map->nr_free); |
| 90 | } | 95 | } |
| 91 | 96 | ||
| 92 | static int alloc_pidmap(struct pspace *pspace) | 97 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
| 93 | { | 98 | { |
| 94 | int i, offset, max_scan, pid, last = pspace->last_pid; | 99 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
| 95 | struct pidmap *map; | 100 | struct pidmap *map; |
| 96 | 101 | ||
| 97 | pid = last + 1; | 102 | pid = last + 1; |
| 98 | if (pid >= pid_max) | 103 | if (pid >= pid_max) |
| 99 | pid = RESERVED_PIDS; | 104 | pid = RESERVED_PIDS; |
| 100 | offset = pid & BITS_PER_PAGE_MASK; | 105 | offset = pid & BITS_PER_PAGE_MASK; |
| 101 | map = &pspace->pidmap[pid/BITS_PER_PAGE]; | 106 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
| 102 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | 107 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; |
| 103 | for (i = 0; i <= max_scan; ++i) { | 108 | for (i = 0; i <= max_scan; ++i) { |
| 104 | if (unlikely(!map->page)) { | 109 | if (unlikely(!map->page)) { |
| @@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace) | |||
| 120 | do { | 125 | do { |
| 121 | if (!test_and_set_bit(offset, map->page)) { | 126 | if (!test_and_set_bit(offset, map->page)) { |
| 122 | atomic_dec(&map->nr_free); | 127 | atomic_dec(&map->nr_free); |
| 123 | pspace->last_pid = pid; | 128 | pid_ns->last_pid = pid; |
| 124 | return pid; | 129 | return pid; |
| 125 | } | 130 | } |
| 126 | offset = find_next_offset(map, offset); | 131 | offset = find_next_offset(map, offset); |
| 127 | pid = mk_pid(pspace, map, offset); | 132 | pid = mk_pid(pid_ns, map, offset); |
| 128 | /* | 133 | /* |
| 129 | * find_next_offset() found a bit, the pid from it | 134 | * find_next_offset() found a bit, the pid from it |
| 130 | * is in-bounds, and if we fell back to the last | 135 | * is in-bounds, and if we fell back to the last |
| @@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace) | |||
| 135 | (i != max_scan || pid < last || | 140 | (i != max_scan || pid < last || |
| 136 | !((last+1) & BITS_PER_PAGE_MASK))); | 141 | !((last+1) & BITS_PER_PAGE_MASK))); |
| 137 | } | 142 | } |
| 138 | if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 143 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
| 139 | ++map; | 144 | ++map; |
| 140 | offset = 0; | 145 | offset = 0; |
| 141 | } else { | 146 | } else { |
| 142 | map = &pspace->pidmap[0]; | 147 | map = &pid_ns->pidmap[0]; |
| 143 | offset = RESERVED_PIDS; | 148 | offset = RESERVED_PIDS; |
| 144 | if (unlikely(last == offset)) | 149 | if (unlikely(last == offset)) |
| 145 | break; | 150 | break; |
| 146 | } | 151 | } |
| 147 | pid = mk_pid(pspace, map, offset); | 152 | pid = mk_pid(pid_ns, map, offset); |
| 148 | } | 153 | } |
| 149 | return -1; | 154 | return -1; |
| 150 | } | 155 | } |
| 151 | 156 | ||
| 152 | static int next_pidmap(struct pspace *pspace, int last) | 157 | static int next_pidmap(struct pid_namespace *pid_ns, int last) |
| 153 | { | 158 | { |
| 154 | int offset; | 159 | int offset; |
| 155 | struct pidmap *map, *end; | 160 | struct pidmap *map, *end; |
| 156 | 161 | ||
| 157 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 162 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
| 158 | map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; | 163 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
| 159 | end = &pspace->pidmap[PIDMAP_ENTRIES]; | 164 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
| 160 | for (; map < end; map++, offset = 0) { | 165 | for (; map < end; map++, offset = 0) { |
| 161 | if (unlikely(!map->page)) | 166 | if (unlikely(!map->page)) |
| 162 | continue; | 167 | continue; |
| 163 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); | 168 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); |
| 164 | if (offset < BITS_PER_PAGE) | 169 | if (offset < BITS_PER_PAGE) |
| 165 | return mk_pid(pspace, map, offset); | 170 | return mk_pid(pid_ns, map, offset); |
| 166 | } | 171 | } |
| 167 | return -1; | 172 | return -1; |
| 168 | } | 173 | } |
| @@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid) | |||
| 192 | hlist_del_rcu(&pid->pid_chain); | 197 | hlist_del_rcu(&pid->pid_chain); |
| 193 | spin_unlock_irqrestore(&pidmap_lock, flags); | 198 | spin_unlock_irqrestore(&pidmap_lock, flags); |
| 194 | 199 | ||
| 195 | free_pidmap(&init_pspace, pid->nr); | 200 | free_pidmap(current->nsproxy->pid_ns, pid->nr); |
| 196 | call_rcu(&pid->rcu, delayed_put_pid); | 201 | call_rcu(&pid->rcu, delayed_put_pid); |
| 197 | } | 202 | } |
| 198 | 203 | ||
| @@ -206,7 +211,7 @@ struct pid *alloc_pid(void) | |||
| 206 | if (!pid) | 211 | if (!pid) |
| 207 | goto out; | 212 | goto out; |
| 208 | 213 | ||
| 209 | nr = alloc_pidmap(&init_pspace); | 214 | nr = alloc_pidmap(current->nsproxy->pid_ns); |
| 210 | if (nr < 0) | 215 | if (nr < 0) |
| 211 | goto out_free; | 216 | goto out_free; |
| 212 | 217 | ||
| @@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr) | |||
| 348 | pid = find_pid(nr); | 353 | pid = find_pid(nr); |
| 349 | if (pid) | 354 | if (pid) |
| 350 | break; | 355 | break; |
| 351 | nr = next_pidmap(&init_pspace, nr); | 356 | nr = next_pidmap(current->nsproxy->pid_ns, nr); |
| 352 | } while (nr > 0); | 357 | } while (nr > 0); |
| 353 | 358 | ||
| 354 | return pid; | 359 | return pid; |
| 355 | } | 360 | } |
| 356 | EXPORT_SYMBOL_GPL(find_get_pid); | 361 | EXPORT_SYMBOL_GPL(find_get_pid); |
| 357 | 362 | ||
| 363 | int copy_pid_ns(int flags, struct task_struct *tsk) | ||
| 364 | { | ||
| 365 | struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; | ||
| 366 | int err = 0; | ||
| 367 | |||
| 368 | if (!old_ns) | ||
| 369 | return 0; | ||
| 370 | |||
| 371 | get_pid_ns(old_ns); | ||
| 372 | return err; | ||
| 373 | } | ||
| 374 | |||
| 375 | void free_pid_ns(struct kref *kref) | ||
| 376 | { | ||
| 377 | struct pid_namespace *ns; | ||
| 378 | |||
| 379 | ns = container_of(kref, struct pid_namespace, kref); | ||
| 380 | kfree(ns); | ||
| 381 | } | ||
| 382 | |||
| 358 | /* | 383 | /* |
| 359 | * The pid hash table is scaled according to the amount of memory in the | 384 | * The pid hash table is scaled according to the amount of memory in the |
| 360 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or | 385 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or |
| @@ -382,10 +407,10 @@ void __init pidhash_init(void) | |||
| 382 | 407 | ||
| 383 | void __init pidmap_init(void) | 408 | void __init pidmap_init(void) |
| 384 | { | 409 | { |
| 385 | init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 410 | init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
| 386 | /* Reserve PID 0. We never call free_pidmap(0) */ | 411 | /* Reserve PID 0. We never call free_pidmap(0) */ |
| 387 | set_bit(0, init_pspace.pidmap[0].page); | 412 | set_bit(0, init_pid_ns.pidmap[0].page); |
| 388 | atomic_dec(&init_pspace.pidmap[0].nr_free); | 413 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
| 389 | 414 | ||
| 390 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), | 415 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), |
| 391 | __alignof__(struct pid), | 416 | __alignof__(struct pid), |
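
Editor's note: apart from the pspace -> pid_namespace rename, the allocator itself is untouched: each namespace owns an array of page-sized bitmaps, and a PID is simply the bitmap index scaled by BITS_PER_PAGE plus the bit offset, which is all mk_pid() computes. A stand-alone user-space analogue of that encoding, with the sizes shrunk and error handling omitted:

    #include <stdio.h>

    #define BITS_PER_MAP 64                 /* the kernel uses PAGE_SIZE * 8 */
    #define NR_MAPS      4

    static unsigned long maps[NR_MAPS];     /* one word per "pidmap page" */

    static int mk_id(int map, int off)
    {
        return map * BITS_PER_MAP + off;    /* same idea as mk_pid() */
    }

    static int alloc_id(void)
    {
        for (int map = 0; map < NR_MAPS; map++)
            for (int off = 0; off < BITS_PER_MAP; off++)
                if (!(maps[map] & (1UL << off))) {
                    maps[map] |= 1UL << off;
                    return mk_id(map, off);
                }
        return -1;                          /* all ids in use */
    }

    int main(void)
    {
        maps[0] |= 1;                       /* reserve id 0, like PID 0 */
        printf("%d %d\n", alloc_id(), alloc_id());  /* prints "1 2" */
        return 0;
    }
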
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 710ed084e7c5..ed296225dcd4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -20,13 +20,14 @@ config PM | |||
| 20 | sending the processor to sleep and saving power. | 20 | sending the processor to sleep and saving power. |
| 21 | 21 | ||
| 22 | config PM_LEGACY | 22 | config PM_LEGACY |
| 23 | bool "Legacy Power Management API" | 23 | bool "Legacy Power Management API (DEPRECATED)" |
| 24 | depends on PM | 24 | depends on PM |
| 25 | default y | 25 | default n |
| 26 | ---help--- | 26 | ---help--- |
| 27 | Support for pm_register() and friends. | 27 | Support for pm_register() and friends. This old API is obsoleted |
| 28 | by the driver model. | ||
| 28 | 29 | ||
| 29 | If unsure, say Y. | 30 | If unsure, say N. |
| 30 | 31 | ||
| 31 | config PM_DEBUG | 32 | config PM_DEBUG |
| 32 | bool "Power Management Debug Support" | 33 | bool "Power Management Debug Support" |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0b00f56c2ad0..88fc5d7ac737 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -60,9 +60,11 @@ static void power_down(suspend_disk_method_t mode) | |||
| 60 | { | 60 | { |
| 61 | switch(mode) { | 61 | switch(mode) { |
| 62 | case PM_DISK_PLATFORM: | 62 | case PM_DISK_PLATFORM: |
| 63 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | 63 | if (pm_ops && pm_ops->enter) { |
| 64 | pm_ops->enter(PM_SUSPEND_DISK); | 64 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); |
| 65 | break; | 65 | pm_ops->enter(PM_SUSPEND_DISK); |
| 66 | break; | ||
| 67 | } | ||
| 66 | case PM_DISK_SHUTDOWN: | 68 | case PM_DISK_SHUTDOWN: |
| 67 | kernel_power_off(); | 69 | kernel_power_off(); |
| 68 | break; | 70 | break; |
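
Editor's note: the PM_DISK_PLATFORM case now only calls the platform hook when pm_ops->enter is actually set; otherwise it deliberately falls through to PM_DISK_SHUTDOWN so the machine still powers off. The control flow in miniature (a sketch, not kernel code):

    #include <stdio.h>

    enum mode { PLATFORM, SHUTDOWN };

    static void power_down(enum mode mode, int have_platform_hook)
    {
        switch (mode) {
        case PLATFORM:
            if (have_platform_hook) {
                puts("enter platform sleep");
                break;
            }
            /* no hook registered: fall through and shut down instead */
        case SHUTDOWN:
            puts("power off");
            break;
        }
    }

    int main(void)
    {
        power_down(PLATFORM, 0);    /* prints "power off" */
        power_down(PLATFORM, 1);    /* prints "enter platform sleep" */
        return 0;
    }
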
diff --git a/kernel/power/main.c b/kernel/power/main.c index 500eb87f643d..ff3a6182f5f0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -29,7 +29,7 @@ | |||
| 29 | DEFINE_MUTEX(pm_mutex); | 29 | DEFINE_MUTEX(pm_mutex); |
| 30 | 30 | ||
| 31 | struct pm_ops *pm_ops; | 31 | struct pm_ops *pm_ops; |
| 32 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | 32 | suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM; |
| 33 | 33 | ||
| 34 | /** | 34 | /** |
| 35 | * pm_set_ops - Set the global power method table. | 35 | * pm_set_ops - Set the global power method table. |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 99eeb119b06d..6d566bf7085c 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -28,8 +28,7 @@ static inline int freezeable(struct task_struct * p) | |||
| 28 | if ((p == current) || | 28 | if ((p == current) || |
| 29 | (p->flags & PF_NOFREEZE) || | 29 | (p->flags & PF_NOFREEZE) || |
| 30 | (p->exit_state == EXIT_ZOMBIE) || | 30 | (p->exit_state == EXIT_ZOMBIE) || |
| 31 | (p->exit_state == EXIT_DEAD) || | 31 | (p->exit_state == EXIT_DEAD)) |
| 32 | (p->state == TASK_STOPPED)) | ||
| 33 | return 0; | 32 | return 0; |
| 34 | return 1; | 33 | return 1; |
| 35 | } | 34 | } |
| @@ -61,10 +60,16 @@ static inline void freeze_process(struct task_struct *p) | |||
| 61 | unsigned long flags; | 60 | unsigned long flags; |
| 62 | 61 | ||
| 63 | if (!freezing(p)) { | 62 | if (!freezing(p)) { |
| 64 | freeze(p); | 63 | rmb(); |
| 65 | spin_lock_irqsave(&p->sighand->siglock, flags); | 64 | if (!frozen(p)) { |
| 66 | signal_wake_up(p, 0); | 65 | if (p->state == TASK_STOPPED) |
| 67 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 66 | force_sig_specific(SIGSTOP, p); |
| 67 | |||
| 68 | freeze(p); | ||
| 69 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 70 | signal_wake_up(p, p->state == TASK_STOPPED); | ||
| 71 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 72 | } | ||
| 68 | } | 73 | } |
| 69 | } | 74 | } |
| 70 | 75 | ||
| @@ -103,9 +108,7 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) | |||
| 103 | if (frozen(p)) | 108 | if (frozen(p)) |
| 104 | continue; | 109 | continue; |
| 105 | 110 | ||
| 106 | if (p->state == TASK_TRACED && | 111 | if (p->state == TASK_TRACED && frozen(p->parent)) { |
| 107 | (frozen(p->parent) || | ||
| 108 | p->parent->state == TASK_STOPPED)) { | ||
| 109 | cancel_freezing(p); | 112 | cancel_freezing(p); |
| 110 | continue; | 113 | continue; |
| 111 | } | 114 | } |
diff --git a/kernel/relay.c b/kernel/relay.c index 75a3a9a7efc2..a4701e7ba7d0 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -138,7 +138,7 @@ depopulate: | |||
| 138 | */ | 138 | */ |
| 139 | struct rchan_buf *relay_create_buf(struct rchan *chan) | 139 | struct rchan_buf *relay_create_buf(struct rchan *chan) |
| 140 | { | 140 | { |
| 141 | struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); | 141 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); |
| 142 | if (!buf) | 142 | if (!buf) |
| 143 | return NULL; | 143 | return NULL; |
| 144 | 144 | ||
| @@ -479,7 +479,7 @@ struct rchan *relay_open(const char *base_filename, | |||
| 479 | if (!(subbuf_size && n_subbufs)) | 479 | if (!(subbuf_size && n_subbufs)) |
| 480 | return NULL; | 480 | return NULL; |
| 481 | 481 | ||
| 482 | chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); | 482 | chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); |
| 483 | if (!chan) | 483 | if (!chan) |
| 484 | return NULL; | 484 | return NULL; |
| 485 | 485 | ||
| @@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
| 959 | if (!desc->count) | 959 | if (!desc->count) |
| 960 | return 0; | 960 | return 0; |
| 961 | 961 | ||
| 962 | mutex_lock(&filp->f_dentry->d_inode->i_mutex); | 962 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); |
| 963 | do { | 963 | do { |
| 964 | if (!relay_file_read_avail(buf, *ppos)) | 964 | if (!relay_file_read_avail(buf, *ppos)) |
| 965 | break; | 965 | break; |
| @@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
| 979 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 979 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
| 980 | } | 980 | } |
| 981 | } while (desc->count && ret); | 981 | } while (desc->count && ret); |
| 982 | mutex_unlock(&filp->f_dentry->d_inode->i_mutex); | 982 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); |
| 983 | 983 | ||
| 984 | return desc->written; | 984 | return desc->written; |
| 985 | } | 985 | } |
diff --git a/kernel/sched.c b/kernel/sched.c index f385eff4682d..5cd833bc2173 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -225,8 +225,10 @@ struct rq { | |||
| 225 | unsigned long nr_uninterruptible; | 225 | unsigned long nr_uninterruptible; |
| 226 | 226 | ||
| 227 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
| 228 | unsigned long long timestamp_last_tick; | 228 | /* Cached timestamp set by update_cpu_clock() */ |
| 229 | unsigned long long most_recent_timestamp; | ||
| 229 | struct task_struct *curr, *idle; | 230 | struct task_struct *curr, *idle; |
| 231 | unsigned long next_balance; | ||
| 230 | struct mm_struct *prev_mm; | 232 | struct mm_struct *prev_mm; |
| 231 | struct prio_array *active, *expired, arrays[2]; | 233 | struct prio_array *active, *expired, arrays[2]; |
| 232 | int best_expired_prio; | 234 | int best_expired_prio; |
| @@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
| 426 | * bump this up when changing the output format or the meaning of an existing | 428 | * bump this up when changing the output format or the meaning of an existing |
| 427 | * format, so that tools can adapt (or abort) | 429 | * format, so that tools can adapt (or abort) |
| 428 | */ | 430 | */ |
| 429 | #define SCHEDSTAT_VERSION 12 | 431 | #define SCHEDSTAT_VERSION 14 |
| 430 | 432 | ||
| 431 | static int show_schedstat(struct seq_file *seq, void *v) | 433 | static int show_schedstat(struct seq_file *seq, void *v) |
| 432 | { | 434 | { |
| @@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 464 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 466 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); |
| 465 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | 467 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; |
| 466 | itype++) { | 468 | itype++) { |
| 467 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", | 469 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
| 470 | "%lu", | ||
| 468 | sd->lb_cnt[itype], | 471 | sd->lb_cnt[itype], |
| 469 | sd->lb_balanced[itype], | 472 | sd->lb_balanced[itype], |
| 470 | sd->lb_failed[itype], | 473 | sd->lb_failed[itype], |
| @@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 474 | sd->lb_nobusyq[itype], | 477 | sd->lb_nobusyq[itype], |
| 475 | sd->lb_nobusyg[itype]); | 478 | sd->lb_nobusyg[itype]); |
| 476 | } | 479 | } |
| 477 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", | 480 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
| 481 | " %lu %lu %lu\n", | ||
| 478 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 482 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
| 479 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 483 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
| 480 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 484 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, |
| 481 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 485 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
| 486 | sd->ttwu_move_balance); | ||
| 482 | } | 487 | } |
| 483 | preempt_enable(); | 488 | preempt_enable(); |
| 484 | #endif | 489 | #endif |
| @@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | |||
| 547 | #endif | 552 | #endif |
| 548 | 553 | ||
| 549 | /* | 554 | /* |
| 550 | * rq_lock - lock a given runqueue and disable interrupts. | 555 | * this_rq_lock - lock this runqueue and disable interrupts. |
| 551 | */ | 556 | */ |
| 552 | static inline struct rq *this_rq_lock(void) | 557 | static inline struct rq *this_rq_lock(void) |
| 553 | __acquires(rq->lock) | 558 | __acquires(rq->lock) |
| @@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
| 938 | { | 943 | { |
| 939 | unsigned long long now; | 944 | unsigned long long now; |
| 940 | 945 | ||
| 946 | if (rt_task(p)) | ||
| 947 | goto out; | ||
| 948 | |||
| 941 | now = sched_clock(); | 949 | now = sched_clock(); |
| 942 | #ifdef CONFIG_SMP | 950 | #ifdef CONFIG_SMP |
| 943 | if (!local) { | 951 | if (!local) { |
| 944 | /* Compensate for drifting sched_clock */ | 952 | /* Compensate for drifting sched_clock */ |
| 945 | struct rq *this_rq = this_rq(); | 953 | struct rq *this_rq = this_rq(); |
| 946 | now = (now - this_rq->timestamp_last_tick) | 954 | now = (now - this_rq->most_recent_timestamp) |
| 947 | + rq->timestamp_last_tick; | 955 | + rq->most_recent_timestamp; |
| 948 | } | 956 | } |
| 949 | #endif | 957 | #endif |
| 950 | 958 | ||
| @@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
| 959 | (now - p->timestamp) >> 20); | 967 | (now - p->timestamp) >> 20); |
| 960 | } | 968 | } |
| 961 | 969 | ||
| 962 | if (!rt_task(p)) | 970 | p->prio = recalc_task_prio(p, now); |
| 963 | p->prio = recalc_task_prio(p, now); | ||
| 964 | 971 | ||
| 965 | /* | 972 | /* |
| 966 | * This checks to make sure it's not an uninterruptible task | 973 | * This checks to make sure it's not an uninterruptible task |
| @@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
| 985 | } | 992 | } |
| 986 | } | 993 | } |
| 987 | p->timestamp = now; | 994 | p->timestamp = now; |
| 988 | 995 | out: | |
| 989 | __activate_task(p, rq); | 996 | __activate_task(p, rq); |
| 990 | } | 997 | } |
| 991 | 998 | ||
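
Editor's note: the rename from timestamp_last_tick to most_recent_timestamp does not change the compensation used here and in pull_task()/wake_up_new_task(): sched_clock() can drift between CPUs, so a value read against one runqueue's clock is rebased onto another's by subtracting the local runqueue's latest timestamp and adding the remote one's. The arithmetic with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical per-cpu clocks, drifted apart by 300 units */
        unsigned long long local_ts  = 10300;   /* this_rq->most_recent_timestamp */
        unsigned long long remote_ts = 10000;   /* rq->most_recent_timestamp */
        unsigned long long now       = 10350;   /* sched_clock() on this CPU */

        /* "50 units after the last update", expressed in the remote clock */
        unsigned long long rebased = (now - local_ts) + remote_ts;

        printf("%llu\n", rebased);              /* prints 10050 */
        return 0;
    }
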
| @@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1450 | 1457 | ||
| 1451 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1458 | if (this_sd->flags & SD_WAKE_AFFINE) { |
| 1452 | unsigned long tl = this_load; | 1459 | unsigned long tl = this_load; |
| 1453 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | 1460 | unsigned long tl_per_task; |
| 1461 | |||
| 1462 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1454 | 1463 | ||
| 1455 | /* | 1464 | /* |
| 1456 | * If sync wakeup then subtract the (maximum possible) | 1465 | * If sync wakeup then subtract the (maximum possible) |
| @@ -1688,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 1688 | * Not the local CPU - must adjust timestamp. This should | 1697 | * Not the local CPU - must adjust timestamp. This should |
| 1689 | * get optimised away in the !CONFIG_SMP case. | 1698 | * get optimised away in the !CONFIG_SMP case. |
| 1690 | */ | 1699 | */ |
| 1691 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | 1700 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) |
| 1692 | + rq->timestamp_last_tick; | 1701 | + rq->most_recent_timestamp; |
| 1693 | __activate_task(p, rq); | 1702 | __activate_task(p, rq); |
| 1694 | if (TASK_PREEMPTS_CURR(p, rq)) | 1703 | if (TASK_PREEMPTS_CURR(p, rq)) |
| 1695 | resched_task(rq->curr); | 1704 | resched_task(rq->curr); |
| @@ -1952,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
| 1952 | __acquires(rq1->lock) | 1961 | __acquires(rq1->lock) |
| 1953 | __acquires(rq2->lock) | 1962 | __acquires(rq2->lock) |
| 1954 | { | 1963 | { |
| 1964 | BUG_ON(!irqs_disabled()); | ||
| 1955 | if (rq1 == rq2) { | 1965 | if (rq1 == rq2) { |
| 1956 | spin_lock(&rq1->lock); | 1966 | spin_lock(&rq1->lock); |
| 1957 | __acquire(rq2->lock); /* Fake it out ;) */ | 1967 | __acquire(rq2->lock); /* Fake it out ;) */ |
| @@ -1991,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1991 | __acquires(busiest->lock) | 2001 | __acquires(busiest->lock) |
| 1992 | __acquires(this_rq->lock) | 2002 | __acquires(this_rq->lock) |
| 1993 | { | 2003 | { |
| 2004 | if (unlikely(!irqs_disabled())) { | ||
| 2005 | /* printk() doesn't work good under rq->lock */ | ||
| 2006 | spin_unlock(&this_rq->lock); | ||
| 2007 | BUG_ON(1); | ||
| 2008 | } | ||
| 1994 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2009 | if (unlikely(!spin_trylock(&busiest->lock))) { |
| 1995 | if (busiest < this_rq) { | 2010 | if (busiest < this_rq) { |
| 1996 | spin_unlock(&this_rq->lock); | 2011 | spin_unlock(&this_rq->lock); |
| @@ -2061,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, | |||
| 2061 | set_task_cpu(p, this_cpu); | 2076 | set_task_cpu(p, this_cpu); |
| 2062 | inc_nr_running(p, this_rq); | 2077 | inc_nr_running(p, this_rq); |
| 2063 | enqueue_task(p, this_array); | 2078 | enqueue_task(p, this_array); |
| 2064 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2079 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) |
| 2065 | + this_rq->timestamp_last_tick; | 2080 | + this_rq->most_recent_timestamp; |
| 2066 | /* | 2081 | /* |
| 2067 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2082 | * Note that idle threads have a prio of MAX_PRIO, for this test |
| 2068 | * to be always true for them. | 2083 | * to be always true for them. |
| @@ -2098,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 2098 | * 2) too many balance attempts have failed. | 2113 | * 2) too many balance attempts have failed. |
| 2099 | */ | 2114 | */ |
| 2100 | 2115 | ||
| 2101 | if (sd->nr_balance_failed > sd->cache_nice_tries) | 2116 | if (sd->nr_balance_failed > sd->cache_nice_tries) { |
| 2117 | #ifdef CONFIG_SCHEDSTATS | ||
| 2118 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
| 2119 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2120 | #endif | ||
| 2102 | return 1; | 2121 | return 1; |
| 2122 | } | ||
| 2103 | 2123 | ||
| 2104 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 2124 | if (task_hot(p, rq->most_recent_timestamp, sd)) |
| 2105 | return 0; | 2125 | return 0; |
| 2106 | return 1; | 2126 | return 1; |
| 2107 | } | 2127 | } |
| @@ -2199,11 +2219,6 @@ skip_queue: | |||
| 2199 | goto skip_bitmap; | 2219 | goto skip_bitmap; |
| 2200 | } | 2220 | } |
| 2201 | 2221 | ||
| 2202 | #ifdef CONFIG_SCHEDSTATS | ||
| 2203 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) | ||
| 2204 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2205 | #endif | ||
| 2206 | |||
| 2207 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2222 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
| 2208 | pulled++; | 2223 | pulled++; |
| 2209 | rem_load_move -= tmp->load_weight; | 2224 | rem_load_move -= tmp->load_weight; |
| @@ -2241,7 +2256,7 @@ out: | |||
| 2241 | static struct sched_group * | 2256 | static struct sched_group * |
| 2242 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2257 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 2243 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, | 2258 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
| 2244 | cpumask_t *cpus) | 2259 | cpumask_t *cpus, int *balance) |
| 2245 | { | 2260 | { |
| 2246 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2261 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 2247 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2262 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| @@ -2270,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2270 | unsigned long load, group_capacity; | 2285 | unsigned long load, group_capacity; |
| 2271 | int local_group; | 2286 | int local_group; |
| 2272 | int i; | 2287 | int i; |
| 2288 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 2273 | unsigned long sum_nr_running, sum_weighted_load; | 2289 | unsigned long sum_nr_running, sum_weighted_load; |
| 2274 | 2290 | ||
| 2275 | local_group = cpu_isset(this_cpu, group->cpumask); | 2291 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 2276 | 2292 | ||
| 2293 | if (local_group) | ||
| 2294 | balance_cpu = first_cpu(group->cpumask); | ||
| 2295 | |||
| 2277 | /* Tally up the load of all CPUs in the group */ | 2296 | /* Tally up the load of all CPUs in the group */ |
| 2278 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2297 | sum_weighted_load = sum_nr_running = avg_load = 0; |
| 2279 | 2298 | ||
| @@ -2289,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2289 | *sd_idle = 0; | 2308 | *sd_idle = 0; |
| 2290 | 2309 | ||
| 2291 | /* Bias balancing toward cpus of our domain */ | 2310 | /* Bias balancing toward cpus of our domain */ |
| 2292 | if (local_group) | 2311 | if (local_group) { |
| 2312 | if (idle_cpu(i) && !first_idle_cpu) { | ||
| 2313 | first_idle_cpu = 1; | ||
| 2314 | balance_cpu = i; | ||
| 2315 | } | ||
| 2316 | |||
| 2293 | load = target_load(i, load_idx); | 2317 | load = target_load(i, load_idx); |
| 2294 | else | 2318 | } else |
| 2295 | load = source_load(i, load_idx); | 2319 | load = source_load(i, load_idx); |
| 2296 | 2320 | ||
| 2297 | avg_load += load; | 2321 | avg_load += load; |
| @@ -2299,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2299 | sum_weighted_load += rq->raw_weighted_load; | 2323 | sum_weighted_load += rq->raw_weighted_load; |
| 2300 | } | 2324 | } |
| 2301 | 2325 | ||
| 2326 | /* | ||
| 2327 | * First idle cpu or the first cpu (busiest) in this sched group | ||
| 2328 | * is eligible for doing load balancing at this and above | ||
| 2329 | * domains. | ||
| 2330 | */ | ||
| 2331 | if (local_group && balance_cpu != this_cpu && balance) { | ||
| 2332 | *balance = 0; | ||
| 2333 | goto ret; | ||
| 2334 | } | ||
| 2335 | |||
| 2302 | total_load += avg_load; | 2336 | total_load += avg_load; |
| 2303 | total_pwr += group->cpu_power; | 2337 | total_pwr += group->cpu_power; |
| 2304 | 2338 | ||
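The find_busiest_group() hunks above introduce a balance_cpu election: within the local group the first idle CPU, or failing that the first CPU of the group, is the only one allowed to balance at this domain level; any other CPU sets *balance = 0 and backs off. A rough sketch of that election with plain arrays standing in for cpumask_t (not the kernel API):

#include <stdio.h>

/* Toy model: group[] lists the CPUs of the local sched group,
 * idle[] says which of them are idle. */
static int pick_balance_cpu(const int *group, const int *idle, int n)
{
    int balance_cpu = group[0];          /* default: first CPU in the group */
    int first_idle_seen = 0;

    for (int i = 0; i < n; i++) {
        if (idle[i] && !first_idle_seen) {
            first_idle_seen = 1;
            balance_cpu = group[i];      /* prefer the first idle CPU */
        }
    }
    return balance_cpu;
}

int main(void)
{
    int group[] = { 4, 5, 6, 7 };
    int idle[]  = { 0, 0, 1, 1 };
    int this_cpu = 7;
    int balance_cpu = pick_balance_cpu(group, idle, 4);

    /* Mirrors: if (local_group && balance_cpu != this_cpu) { *balance = 0; } */
    printf("balance_cpu=%d, cpu %d %s balance here\n", balance_cpu, this_cpu,
           balance_cpu == this_cpu ? "may" : "must not");
    return 0;
}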
| @@ -2458,18 +2492,21 @@ small_imbalance: | |||
| 2458 | pwr_now /= SCHED_LOAD_SCALE; | 2492 | pwr_now /= SCHED_LOAD_SCALE; |
| 2459 | 2493 | ||
| 2460 | /* Amount of load we'd subtract */ | 2494 | /* Amount of load we'd subtract */ |
| 2461 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; | 2495 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
| 2496 | busiest->cpu_power; | ||
| 2462 | if (max_load > tmp) | 2497 | if (max_load > tmp) |
| 2463 | pwr_move += busiest->cpu_power * | 2498 | pwr_move += busiest->cpu_power * |
| 2464 | min(busiest_load_per_task, max_load - tmp); | 2499 | min(busiest_load_per_task, max_load - tmp); |
| 2465 | 2500 | ||
| 2466 | /* Amount of load we'd add */ | 2501 | /* Amount of load we'd add */ |
| 2467 | if (max_load*busiest->cpu_power < | 2502 | if (max_load * busiest->cpu_power < |
| 2468 | busiest_load_per_task*SCHED_LOAD_SCALE) | 2503 | busiest_load_per_task * SCHED_LOAD_SCALE) |
| 2469 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2504 | tmp = max_load * busiest->cpu_power / this->cpu_power; |
| 2470 | else | 2505 | else |
| 2471 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; | 2506 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
| 2472 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); | 2507 | this->cpu_power; |
| 2508 | pwr_move += this->cpu_power * | ||
| 2509 | min(this_load_per_task, this_load + tmp); | ||
| 2473 | pwr_move /= SCHED_LOAD_SCALE; | 2510 | pwr_move /= SCHED_LOAD_SCALE; |
| 2474 | 2511 | ||
| 2475 | /* Move if we gain throughput */ | 2512 | /* Move if we gain throughput */ |
| @@ -2490,8 +2527,8 @@ out_balanced: | |||
| 2490 | *imbalance = min_load_per_task; | 2527 | *imbalance = min_load_per_task; |
| 2491 | return group_min; | 2528 | return group_min; |
| 2492 | } | 2529 | } |
| 2493 | ret: | ||
| 2494 | #endif | 2530 | #endif |
| 2531 | ret: | ||
| 2495 | *imbalance = 0; | 2532 | *imbalance = 0; |
| 2496 | return NULL; | 2533 | return NULL; |
| 2497 | } | 2534 | } |
| @@ -2540,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n) | |||
| 2540 | /* | 2577 | /* |
| 2541 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2578 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 2542 | * tasks if there is an imbalance. | 2579 | * tasks if there is an imbalance. |
| 2543 | * | ||
| 2544 | * Called with this_rq unlocked. | ||
| 2545 | */ | 2580 | */ |
| 2546 | static int load_balance(int this_cpu, struct rq *this_rq, | 2581 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 2547 | struct sched_domain *sd, enum idle_type idle) | 2582 | struct sched_domain *sd, enum idle_type idle, |
| 2583 | int *balance) | ||
| 2548 | { | 2584 | { |
| 2549 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2585 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
| 2550 | struct sched_group *group; | 2586 | struct sched_group *group; |
| 2551 | unsigned long imbalance; | 2587 | unsigned long imbalance; |
| 2552 | struct rq *busiest; | 2588 | struct rq *busiest; |
| 2553 | cpumask_t cpus = CPU_MASK_ALL; | 2589 | cpumask_t cpus = CPU_MASK_ALL; |
| 2590 | unsigned long flags; | ||
| 2554 | 2591 | ||
| 2555 | /* | 2592 | /* |
| 2556 | * When power savings policy is enabled for the parent domain, idle | 2593 | * When power savings policy is enabled for the parent domain, idle |
| @@ -2566,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 2566 | 2603 | ||
| 2567 | redo: | 2604 | redo: |
| 2568 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2605 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| 2569 | &cpus); | 2606 | &cpus, balance); |
| 2607 | |||
| 2608 | if (*balance == 0) | ||
| 2609 | goto out_balanced; | ||
| 2610 | |||
| 2570 | if (!group) { | 2611 | if (!group) { |
| 2571 | schedstat_inc(sd, lb_nobusyg[idle]); | 2612 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 2572 | goto out_balanced; | 2613 | goto out_balanced; |
| @@ -2590,11 +2631,13 @@ redo: | |||
| 2590 | * still unbalanced. nr_moved simply stays zero, so it is | 2631 | * still unbalanced. nr_moved simply stays zero, so it is |
| 2591 | * correctly treated as an imbalance. | 2632 | * correctly treated as an imbalance. |
| 2592 | */ | 2633 | */ |
| 2634 | local_irq_save(flags); | ||
| 2593 | double_rq_lock(this_rq, busiest); | 2635 | double_rq_lock(this_rq, busiest); |
| 2594 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2636 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2595 | minus_1_or_zero(busiest->nr_running), | 2637 | minus_1_or_zero(busiest->nr_running), |
| 2596 | imbalance, sd, idle, &all_pinned); | 2638 | imbalance, sd, idle, &all_pinned); |
| 2597 | double_rq_unlock(this_rq, busiest); | 2639 | double_rq_unlock(this_rq, busiest); |
| 2640 | local_irq_restore(flags); | ||
| 2598 | 2641 | ||
| 2599 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2642 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 2600 | if (unlikely(all_pinned)) { | 2643 | if (unlikely(all_pinned)) { |
| @@ -2611,13 +2654,13 @@ redo: | |||
| 2611 | 2654 | ||
| 2612 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2655 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
| 2613 | 2656 | ||
| 2614 | spin_lock(&busiest->lock); | 2657 | spin_lock_irqsave(&busiest->lock, flags); |
| 2615 | 2658 | ||
| 2616 | /* don't kick the migration_thread, if the curr | 2659 | /* don't kick the migration_thread, if the curr |
| 2617 | * task on busiest cpu can't be moved to this_cpu | 2660 | * task on busiest cpu can't be moved to this_cpu |
| 2618 | */ | 2661 | */ |
| 2619 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2662 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
| 2620 | spin_unlock(&busiest->lock); | 2663 | spin_unlock_irqrestore(&busiest->lock, flags); |
| 2621 | all_pinned = 1; | 2664 | all_pinned = 1; |
| 2622 | goto out_one_pinned; | 2665 | goto out_one_pinned; |
| 2623 | } | 2666 | } |
| @@ -2627,7 +2670,7 @@ redo: | |||
| 2627 | busiest->push_cpu = this_cpu; | 2670 | busiest->push_cpu = this_cpu; |
| 2628 | active_balance = 1; | 2671 | active_balance = 1; |
| 2629 | } | 2672 | } |
| 2630 | spin_unlock(&busiest->lock); | 2673 | spin_unlock_irqrestore(&busiest->lock, flags); |
| 2631 | if (active_balance) | 2674 | if (active_balance) |
| 2632 | wake_up_process(busiest->migration_thread); | 2675 | wake_up_process(busiest->migration_thread); |
| 2633 | 2676 | ||
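The load_balance() hunk above adds explicit interrupt disabling: local_irq_save() around double_rq_lock()/move_tasks(), and spin_lock_irqsave() for busiest->lock. The likely reason, given the softirq-driven rebalancing introduced later in this patch, is that the caller no longer guarantees IRQs are off. A toy model of the save/restore bracketing only; these stand-ins are not the kernel primitives, which are macros operating on the flags variable directly:

#include <stdio.h>

static int irqs_enabled = 1;    /* fake CPU interrupt state */

static void local_irq_save(unsigned long *flags) { *flags = irqs_enabled; irqs_enabled = 0; }
static void local_irq_restore(unsigned long flags) { irqs_enabled = (int)flags; }

int main(void)
{
    unsigned long flags;

    /* load_balance() now brackets the runqueue locking itself: */
    local_irq_save(&flags);
    /* double_rq_lock(this_rq, busiest); move_tasks(...); double_rq_unlock(...); */
    local_irq_restore(flags);

    printf("irqs re-enabled: %d\n", irqs_enabled);
    return 0;
}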
| @@ -2706,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 2706 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2749 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
| 2707 | redo: | 2750 | redo: |
| 2708 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | 2751 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, |
| 2709 | &sd_idle, &cpus); | 2752 | &sd_idle, &cpus, NULL); |
| 2710 | if (!group) { | 2753 | if (!group) { |
| 2711 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2754 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
| 2712 | goto out_balanced; | 2755 | goto out_balanced; |
| @@ -2766,14 +2809,28 @@ out_balanced: | |||
| 2766 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2809 | static void idle_balance(int this_cpu, struct rq *this_rq) |
| 2767 | { | 2810 | { |
| 2768 | struct sched_domain *sd; | 2811 | struct sched_domain *sd; |
| 2812 | int pulled_task = 0; | ||
| 2813 | unsigned long next_balance = jiffies + 60 * HZ; | ||
| 2769 | 2814 | ||
| 2770 | for_each_domain(this_cpu, sd) { | 2815 | for_each_domain(this_cpu, sd) { |
| 2771 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2816 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 2772 | /* If we've pulled tasks over stop searching: */ | 2817 | /* If we've pulled tasks over stop searching: */ |
| 2773 | if (load_balance_newidle(this_cpu, this_rq, sd)) | 2818 | pulled_task = load_balance_newidle(this_cpu, |
| 2819 | this_rq, sd); | ||
| 2820 | if (time_after(next_balance, | ||
| 2821 | sd->last_balance + sd->balance_interval)) | ||
| 2822 | next_balance = sd->last_balance | ||
| 2823 | + sd->balance_interval; | ||
| 2824 | if (pulled_task) | ||
| 2774 | break; | 2825 | break; |
| 2775 | } | 2826 | } |
| 2776 | } | 2827 | } |
| 2828 | if (!pulled_task) | ||
| 2829 | /* | ||
| 2830 | * We are going idle. next_balance may be set based on | ||
| 2831 | * a busy processor. So reset next_balance. | ||
| 2832 | */ | ||
| 2833 | this_rq->next_balance = next_balance; | ||
| 2777 | } | 2834 | } |
| 2778 | 2835 | ||
| 2779 | /* | 2836 | /* |
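idle_balance() above now also computes the earliest (last_balance + balance_interval) over its new-idle domains and, when nothing was pulled, stores it in rq->next_balance so the tick knows when balancing is next due. A small sketch of that minimum-of-deadlines computation, ignoring jiffies wraparound and using made-up values:

#include <stdio.h>

struct dom { unsigned long last_balance, balance_interval; };

/* Earliest (last_balance + interval) over all new-idle domains; mirrors the
 * time_after() comparison in the hunk above. */
static unsigned long earliest_next_balance(const struct dom *d, int n,
                                           unsigned long jiffies, unsigned long hz)
{
    unsigned long next = jiffies + 60 * hz;   /* same "far future" default */

    for (int i = 0; i < n; i++)
        if (d[i].last_balance + d[i].balance_interval < next)
            next = d[i].last_balance + d[i].balance_interval;
    return next;
}

int main(void)
{
    struct dom doms[] = { { 1000, 8 }, { 1000, 64 } };

    printf("next_balance = %lu\n", earliest_next_balance(doms, 2, 1000, 250));
    return 0;
}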
| @@ -2826,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 2826 | spin_unlock(&target_rq->lock); | 2883 | spin_unlock(&target_rq->lock); |
| 2827 | } | 2884 | } |
| 2828 | 2885 | ||
| 2829 | /* | 2886 | static void update_load(struct rq *this_rq) |
| 2830 | * rebalance_tick will get called every timer tick, on every CPU. | ||
| 2831 | * | ||
| 2832 | * It checks each scheduling domain to see if it is due to be balanced, | ||
| 2833 | * and initiates a balancing operation if so. | ||
| 2834 | * | ||
| 2835 | * Balancing parameters are set up in arch_init_sched_domains. | ||
| 2836 | */ | ||
| 2837 | |||
| 2838 | /* Don't have all balancing operations going off at once: */ | ||
| 2839 | static inline unsigned long cpu_offset(int cpu) | ||
| 2840 | { | 2887 | { |
| 2841 | return jiffies + cpu * HZ / NR_CPUS; | 2888 | unsigned long this_load; |
| 2842 | } | ||
| 2843 | |||
| 2844 | static void | ||
| 2845 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | ||
| 2846 | { | ||
| 2847 | unsigned long this_load, interval, j = cpu_offset(this_cpu); | ||
| 2848 | struct sched_domain *sd; | ||
| 2849 | int i, scale; | 2889 | int i, scale; |
| 2850 | 2890 | ||
| 2851 | this_load = this_rq->raw_weighted_load; | 2891 | this_load = this_rq->raw_weighted_load; |
| @@ -2865,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
| 2865 | new_load += scale-1; | 2905 | new_load += scale-1; |
| 2866 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | 2906 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; |
| 2867 | } | 2907 | } |
| 2908 | } | ||
| 2909 | |||
| 2910 | /* | ||
| 2911 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
| 2912 | * | ||
| 2913 | * It checks each scheduling domain to see if it is due to be balanced, | ||
| 2914 | * and initiates a balancing operation if so. | ||
| 2915 | * | ||
| 2916 | * Balancing parameters are set up in arch_init_sched_domains. | ||
| 2917 | */ | ||
| 2918 | static DEFINE_SPINLOCK(balancing); | ||
| 2919 | |||
| 2920 | static void run_rebalance_domains(struct softirq_action *h) | ||
| 2921 | { | ||
| 2922 | int this_cpu = smp_processor_id(), balance = 1; | ||
| 2923 | struct rq *this_rq = cpu_rq(this_cpu); | ||
| 2924 | unsigned long interval; | ||
| 2925 | struct sched_domain *sd; | ||
| 2926 | /* | ||
| 2927 | * We are idle if there are no processes running. This | ||
| 2928 | * is valid even if we are the idle process (SMT). | ||
| 2929 | */ | ||
| 2930 | enum idle_type idle = !this_rq->nr_running ? | ||
| 2931 | SCHED_IDLE : NOT_IDLE; | ||
| 2932 | /* Earliest time when we have to call run_rebalance_domains again */ | ||
| 2933 | unsigned long next_balance = jiffies + 60*HZ; | ||
| 2868 | 2934 | ||
| 2869 | for_each_domain(this_cpu, sd) { | 2935 | for_each_domain(this_cpu, sd) { |
| 2870 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2936 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| @@ -2879,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
| 2879 | if (unlikely(!interval)) | 2945 | if (unlikely(!interval)) |
| 2880 | interval = 1; | 2946 | interval = 1; |
| 2881 | 2947 | ||
| 2882 | if (j - sd->last_balance >= interval) { | 2948 | if (sd->flags & SD_SERIALIZE) { |
| 2883 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2949 | if (!spin_trylock(&balancing)) |
| 2950 | goto out; | ||
| 2951 | } | ||
| 2952 | |||
| 2953 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
| 2954 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { | ||
| 2884 | /* | 2955 | /* |
| 2885 | * We've pulled tasks over so either we're no | 2956 | * We've pulled tasks over so either we're no |
| 2886 | * longer idle, or one of our SMT siblings is | 2957 | * longer idle, or one of our SMT siblings is |
| @@ -2888,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
| 2888 | */ | 2959 | */ |
| 2889 | idle = NOT_IDLE; | 2960 | idle = NOT_IDLE; |
| 2890 | } | 2961 | } |
| 2891 | sd->last_balance += interval; | 2962 | sd->last_balance = jiffies; |
| 2892 | } | 2963 | } |
| 2964 | if (sd->flags & SD_SERIALIZE) | ||
| 2965 | spin_unlock(&balancing); | ||
| 2966 | out: | ||
| 2967 | if (time_after(next_balance, sd->last_balance + interval)) | ||
| 2968 | next_balance = sd->last_balance + interval; | ||
| 2969 | |||
| 2970 | /* | ||
| 2971 | * Stop the load balance at this level. There is another | ||
| 2972 | * CPU in our sched group which is doing load balancing more | ||
| 2973 | * actively. | ||
| 2974 | */ | ||
| 2975 | if (!balance) | ||
| 2976 | break; | ||
| 2893 | } | 2977 | } |
| 2978 | this_rq->next_balance = next_balance; | ||
| 2894 | } | 2979 | } |
| 2895 | #else | 2980 | #else |
| 2896 | /* | 2981 | /* |
| 2897 | * on UP we do not need to balance between CPUs: | 2982 | * on UP we do not need to balance between CPUs: |
| 2898 | */ | 2983 | */ |
| 2899 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) | ||
| 2900 | { | ||
| 2901 | } | ||
| 2902 | static inline void idle_balance(int cpu, struct rq *rq) | 2984 | static inline void idle_balance(int cpu, struct rq *rq) |
| 2903 | { | 2985 | { |
| 2904 | } | 2986 | } |
| 2905 | #endif | 2987 | #endif |
| 2906 | 2988 | ||
| 2907 | static inline int wake_priority_sleeper(struct rq *rq) | 2989 | static inline void wake_priority_sleeper(struct rq *rq) |
| 2908 | { | 2990 | { |
| 2909 | int ret = 0; | ||
| 2910 | |||
| 2911 | #ifdef CONFIG_SCHED_SMT | 2991 | #ifdef CONFIG_SCHED_SMT |
| 2992 | if (!rq->nr_running) | ||
| 2993 | return; | ||
| 2994 | |||
| 2912 | spin_lock(&rq->lock); | 2995 | spin_lock(&rq->lock); |
| 2913 | /* | 2996 | /* |
| 2914 | * If an SMT sibling task has been put to sleep for priority | 2997 | * If an SMT sibling task has been put to sleep for priority |
| 2915 | * reasons reschedule the idle task to see if it can now run. | 2998 | * reasons reschedule the idle task to see if it can now run. |
| 2916 | */ | 2999 | */ |
| 2917 | if (rq->nr_running) { | 3000 | if (rq->nr_running) |
| 2918 | resched_task(rq->idle); | 3001 | resched_task(rq->idle); |
| 2919 | ret = 1; | ||
| 2920 | } | ||
| 2921 | spin_unlock(&rq->lock); | 3002 | spin_unlock(&rq->lock); |
| 2922 | #endif | 3003 | #endif |
| 2923 | return ret; | ||
| 2924 | } | 3004 | } |
| 2925 | 3005 | ||
| 2926 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3006 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
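Taken together, the hunks above replace rebalance_tick() with a SCHED_SOFTIRQ handler: run_rebalance_domains() walks the CPU's domains, serializes SD_SERIALIZE domains through a global trylock, balances any domain whose interval has expired, tracks the earliest future deadline in next_balance, and stops early once another CPU in the local group owns balancing. A condensed user-space model of that loop; the lock, types, and HZ value are stand-ins:

#include <stdio.h>

struct dom {
    unsigned long last_balance, interval;
    int serialize;                 /* models SD_SERIALIZE */
};

static int balancing_lock;         /* models the global "balancing" spinlock */

static int trylock(int *l) { if (*l) return 0; *l = 1; return 1; }
static void unlock(int *l)  { *l = 0; }

/* One pass of the handler over one CPU's domain list. */
static unsigned long rebalance(struct dom *d, int n, unsigned long jiffies)
{
    unsigned long next_balance = jiffies + 60 * 250;   /* far-future default */
    int balance = 1;               /* would be cleared via find_busiest_group() */

    for (int i = 0; i < n; i++) {
        int locked = 0;

        if (d[i].serialize) {
            if (!trylock(&balancing_lock))
                goto out;          /* someone else is serializing: skip */
            locked = 1;
        }
        if (jiffies >= d[i].last_balance + d[i].interval) {
            /* load_balance(..., &balance) would run here */
            d[i].last_balance = jiffies;
        }
        if (locked)
            unlock(&balancing_lock);
out:
        if (d[i].last_balance + d[i].interval < next_balance)
            next_balance = d[i].last_balance + d[i].interval;
        if (!balance)
            break;                 /* another CPU in our group balances */
    }
    return next_balance;           /* stored in rq->next_balance */
}

int main(void)
{
    struct dom doms[] = { { 0, 4, 0 }, { 0, 64, 1 } };

    printf("next_balance = %lu\n", rebalance(doms, 2, 100));
    return 0;
}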
| @@ -2934,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
| 2934 | static inline void | 3014 | static inline void |
| 2935 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) | 3015 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
| 2936 | { | 3016 | { |
| 2937 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); | 3017 | p->sched_time += now - p->last_ran; |
| 3018 | p->last_ran = rq->most_recent_timestamp = now; | ||
| 2938 | } | 3019 | } |
| 2939 | 3020 | ||
| 2940 | /* | 3021 | /* |
| @@ -2947,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p) | |||
| 2947 | unsigned long flags; | 3028 | unsigned long flags; |
| 2948 | 3029 | ||
| 2949 | local_irq_save(flags); | 3030 | local_irq_save(flags); |
| 2950 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); | 3031 | ns = p->sched_time + sched_clock() - p->last_ran; |
| 2951 | ns = p->sched_time + sched_clock() - ns; | ||
| 2952 | local_irq_restore(flags); | 3032 | local_irq_restore(flags); |
| 2953 | 3033 | ||
| 2954 | return ns; | 3034 | return ns; |
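The two hunks above simplify per-task time accounting: each task keeps last_ran, the runqueue keeps most_recent_timestamp, and current_sched_time() reduces to sched_time + (sched_clock() - last_ran). A toy model of that bookkeeping, with sched_clock() faked as a settable counter:

#include <stdio.h>

static unsigned long long clock_ns;                      /* fake monotonic clock */
static unsigned long long sched_clock(void) { return clock_ns; }

struct task { unsigned long long sched_time, last_ran; };

/* Mirrors update_cpu_clock(): charge the delta since last_ran, advance last_ran. */
static void update_cpu_clock(struct task *p, unsigned long long now)
{
    p->sched_time += now - p->last_ran;
    p->last_ran = now;
}

/* Mirrors the simplified current_sched_time(). */
static unsigned long long current_sched_time(const struct task *p)
{
    return p->sched_time + sched_clock() - p->last_ran;
}

int main(void)
{
    struct task t = { 0, 0 };

    clock_ns = 1000; update_cpu_clock(&t, clock_ns);     /* tick at t=1000ns */
    clock_ns = 1600;                                     /* 600ns later, no tick yet */
    printf("sched_time now: %llu\n", current_sched_time(&t));
    return 0;
}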
| @@ -3048,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 3048 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3128 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
| 3049 | } | 3129 | } |
| 3050 | 3130 | ||
| 3051 | /* | 3131 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
| 3052 | * This function gets called by the timer code, with HZ frequency. | ||
| 3053 | * We call it with interrupts disabled. | ||
| 3054 | * | ||
| 3055 | * It also gets called by the fork code, when changing the parent's | ||
| 3056 | * timeslices. | ||
| 3057 | */ | ||
| 3058 | void scheduler_tick(void) | ||
| 3059 | { | 3132 | { |
| 3060 | unsigned long long now = sched_clock(); | ||
| 3061 | struct task_struct *p = current; | ||
| 3062 | int cpu = smp_processor_id(); | ||
| 3063 | struct rq *rq = cpu_rq(cpu); | ||
| 3064 | |||
| 3065 | update_cpu_clock(p, rq, now); | ||
| 3066 | |||
| 3067 | rq->timestamp_last_tick = now; | ||
| 3068 | |||
| 3069 | if (p == rq->idle) { | ||
| 3070 | if (wake_priority_sleeper(rq)) | ||
| 3071 | goto out; | ||
| 3072 | rebalance_tick(cpu, rq, SCHED_IDLE); | ||
| 3073 | return; | ||
| 3074 | } | ||
| 3075 | |||
| 3076 | /* Task might have expired already, but not scheduled off yet */ | ||
| 3077 | if (p->array != rq->active) { | 3133 | if (p->array != rq->active) { |
| 3134 | /* Task has expired but was not scheduled yet */ | ||
| 3078 | set_tsk_need_resched(p); | 3135 | set_tsk_need_resched(p); |
| 3079 | goto out; | 3136 | return; |
| 3080 | } | 3137 | } |
| 3081 | spin_lock(&rq->lock); | 3138 | spin_lock(&rq->lock); |
| 3082 | /* | 3139 | /* |
| @@ -3144,8 +3201,34 @@ void scheduler_tick(void) | |||
| 3144 | } | 3201 | } |
| 3145 | out_unlock: | 3202 | out_unlock: |
| 3146 | spin_unlock(&rq->lock); | 3203 | spin_unlock(&rq->lock); |
| 3147 | out: | 3204 | } |
| 3148 | rebalance_tick(cpu, rq, NOT_IDLE); | 3205 | |
| 3206 | /* | ||
| 3207 | * This function gets called by the timer code, with HZ frequency. | ||
| 3208 | * We call it with interrupts disabled. | ||
| 3209 | * | ||
| 3210 | * It also gets called by the fork code, when changing the parent's | ||
| 3211 | * timeslices. | ||
| 3212 | */ | ||
| 3213 | void scheduler_tick(void) | ||
| 3214 | { | ||
| 3215 | unsigned long long now = sched_clock(); | ||
| 3216 | struct task_struct *p = current; | ||
| 3217 | int cpu = smp_processor_id(); | ||
| 3218 | struct rq *rq = cpu_rq(cpu); | ||
| 3219 | |||
| 3220 | update_cpu_clock(p, rq, now); | ||
| 3221 | |||
| 3222 | if (p == rq->idle) | ||
| 3223 | /* Task on the idle queue */ | ||
| 3224 | wake_priority_sleeper(rq); | ||
| 3225 | else | ||
| 3226 | task_running_tick(rq, p); | ||
| 3227 | #ifdef CONFIG_SMP | ||
| 3228 | update_load(rq); | ||
| 3229 | if (time_after_eq(jiffies, rq->next_balance)) | ||
| 3230 | raise_softirq(SCHED_SOFTIRQ); | ||
| 3231 | #endif | ||
| 3149 | } | 3232 | } |
| 3150 | 3233 | ||
| 3151 | #ifdef CONFIG_SCHED_SMT | 3234 | #ifdef CONFIG_SCHED_SMT |
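With the split above, scheduler_tick() only updates the clock, runs task_running_tick() (or wake_priority_sleeper() for the idle task), and on SMP raises SCHED_SOFTIRQ once jiffies reaches rq->next_balance; the handler is registered via open_softirq() in sched_init() near the end of this diff. A bare-bones sketch of that tick shape, with every name here a stand-in:

#include <stdio.h>

static int sched_softirq_pending;
static void raise_softirq(void) { sched_softirq_pending = 1; }

struct rq { unsigned long next_balance; int curr_is_idle; };

/* Per-task accounting stays in the tick; domain balancing is deferred. */
static void scheduler_tick(struct rq *rq, unsigned long jiffies)
{
    /* update_cpu_clock(curr, rq, now); */
    if (rq->curr_is_idle) {
        /* wake_priority_sleeper(rq); */
    } else {
        /* task_running_tick(rq, curr); */
    }

    /* update_load(rq); */
    if (jiffies >= rq->next_balance)
        raise_softirq();           /* run_rebalance_domains() runs later */
}

int main(void)
{
    struct rq rq = { .next_balance = 100, .curr_is_idle = 0 };

    scheduler_tick(&rq, 120);
    printf("SCHED_SOFTIRQ pending: %d\n", sched_softirq_pending);
    return 0;
}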
| @@ -3291,7 +3374,8 @@ void fastcall add_preempt_count(int val) | |||
| 3291 | /* | 3374 | /* |
| 3292 | * Spinlock count overflowing soon? | 3375 | * Spinlock count overflowing soon? |
| 3293 | */ | 3376 | */ |
| 3294 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3377 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
| 3378 | PREEMPT_MASK - 10); | ||
| 3295 | } | 3379 | } |
| 3296 | EXPORT_SYMBOL(add_preempt_count); | 3380 | EXPORT_SYMBOL(add_preempt_count); |
| 3297 | 3381 | ||
| @@ -3345,6 +3429,8 @@ asmlinkage void __sched schedule(void) | |||
| 3345 | "%s/0x%08x/%d\n", | 3429 | "%s/0x%08x/%d\n", |
| 3346 | current->comm, preempt_count(), current->pid); | 3430 | current->comm, preempt_count(), current->pid); |
| 3347 | debug_show_held_locks(current); | 3431 | debug_show_held_locks(current); |
| 3432 | if (irqs_disabled()) | ||
| 3433 | print_irqtrace_events(current); | ||
| 3348 | dump_stack(); | 3434 | dump_stack(); |
| 3349 | } | 3435 | } |
| 3350 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3436 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| @@ -4990,8 +5076,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4990 | * afterwards, and pretending it was a local activate. | 5076 | * afterwards, and pretending it was a local activate. |
| 4991 | * This way is cleaner and logically correct. | 5077 | * This way is cleaner and logically correct. |
| 4992 | */ | 5078 | */ |
| 4993 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 5079 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp |
| 4994 | + rq_dest->timestamp_last_tick; | 5080 | + rq_dest->most_recent_timestamp; |
| 4995 | deactivate_task(p, rq_src); | 5081 | deactivate_task(p, rq_src); |
| 4996 | __activate_task(p, rq_dest); | 5082 | __activate_task(p, rq_dest); |
| 4997 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 5083 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
| @@ -5067,7 +5153,10 @@ wait_to_die: | |||
| 5067 | } | 5153 | } |
| 5068 | 5154 | ||
| 5069 | #ifdef CONFIG_HOTPLUG_CPU | 5155 | #ifdef CONFIG_HOTPLUG_CPU |
| 5070 | /* Figure out where task on dead CPU should go, use force if neccessary. */ | 5156 | /* |
| 5157 | * Figure out where task on dead CPU should go, use force if neccessary. | ||
| 5158 | * NOTE: interrupts should be disabled by the caller | ||
| 5159 | */ | ||
| 5071 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5160 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
| 5072 | { | 5161 | { |
| 5073 | unsigned long flags; | 5162 | unsigned long flags; |
| @@ -5187,6 +5276,7 @@ void idle_task_exit(void) | |||
| 5187 | mmdrop(mm); | 5276 | mmdrop(mm); |
| 5188 | } | 5277 | } |
| 5189 | 5278 | ||
| 5279 | /* called under rq->lock with disabled interrupts */ | ||
| 5190 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5280 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
| 5191 | { | 5281 | { |
| 5192 | struct rq *rq = cpu_rq(dead_cpu); | 5282 | struct rq *rq = cpu_rq(dead_cpu); |
| @@ -5203,10 +5293,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
| 5203 | * Drop lock around migration; if someone else moves it, | 5293 | * Drop lock around migration; if someone else moves it, |
| 5204 | * that's OK. No task can be added to this CPU, so iteration is | 5294 | * that's OK. No task can be added to this CPU, so iteration is |
| 5205 | * fine. | 5295 | * fine. |
| 5296 | * NOTE: interrupts should be left disabled --dev@ | ||
| 5206 | */ | 5297 | */ |
| 5207 | spin_unlock_irq(&rq->lock); | 5298 | spin_unlock(&rq->lock); |
| 5208 | move_task_off_dead_cpu(dead_cpu, p); | 5299 | move_task_off_dead_cpu(dead_cpu, p); |
| 5209 | spin_lock_irq(&rq->lock); | 5300 | spin_lock(&rq->lock); |
| 5210 | 5301 | ||
| 5211 | put_task_struct(p); | 5302 | put_task_struct(p); |
| 5212 | } | 5303 | } |
| @@ -5359,16 +5450,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5359 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5450 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
| 5360 | printk("does not load-balance\n"); | 5451 | printk("does not load-balance\n"); |
| 5361 | if (sd->parent) | 5452 | if (sd->parent) |
| 5362 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); | 5453 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
| 5454 | " has parent"); | ||
| 5363 | break; | 5455 | break; |
| 5364 | } | 5456 | } |
| 5365 | 5457 | ||
| 5366 | printk("span %s\n", str); | 5458 | printk("span %s\n", str); |
| 5367 | 5459 | ||
| 5368 | if (!cpu_isset(cpu, sd->span)) | 5460 | if (!cpu_isset(cpu, sd->span)) |
| 5369 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); | 5461 | printk(KERN_ERR "ERROR: domain->span does not contain " |
| 5462 | "CPU%d\n", cpu); | ||
| 5370 | if (!cpu_isset(cpu, group->cpumask)) | 5463 | if (!cpu_isset(cpu, group->cpumask)) |
| 5371 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); | 5464 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
| 5465 | " CPU%d\n", cpu); | ||
| 5372 | 5466 | ||
| 5373 | printk(KERN_DEBUG); | 5467 | printk(KERN_DEBUG); |
| 5374 | for (i = 0; i < level + 2; i++) | 5468 | for (i = 0; i < level + 2; i++) |
| @@ -5383,7 +5477,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5383 | 5477 | ||
| 5384 | if (!group->cpu_power) { | 5478 | if (!group->cpu_power) { |
| 5385 | printk("\n"); | 5479 | printk("\n"); |
| 5386 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); | 5480 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 5481 | "set\n"); | ||
| 5387 | } | 5482 | } |
| 5388 | 5483 | ||
| 5389 | if (!cpus_weight(group->cpumask)) { | 5484 | if (!cpus_weight(group->cpumask)) { |
| @@ -5406,15 +5501,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5406 | printk("\n"); | 5501 | printk("\n"); |
| 5407 | 5502 | ||
| 5408 | if (!cpus_equal(sd->span, groupmask)) | 5503 | if (!cpus_equal(sd->span, groupmask)) |
| 5409 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 5504 | printk(KERN_ERR "ERROR: groups don't span " |
| 5505 | "domain->span\n"); | ||
| 5410 | 5506 | ||
| 5411 | level++; | 5507 | level++; |
| 5412 | sd = sd->parent; | 5508 | sd = sd->parent; |
| 5509 | if (!sd) | ||
| 5510 | continue; | ||
| 5413 | 5511 | ||
| 5414 | if (sd) { | 5512 | if (!cpus_subset(groupmask, sd->span)) |
| 5415 | if (!cpus_subset(groupmask, sd->span)) | 5513 | printk(KERN_ERR "ERROR: parent span is not a superset " |
| 5416 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); | 5514 | "of domain->span\n"); |
| 5417 | } | ||
| 5418 | 5515 | ||
| 5419 | } while (sd); | 5516 | } while (sd); |
| 5420 | } | 5517 | } |
| @@ -5528,28 +5625,27 @@ static int __init isolated_cpu_setup(char *str) | |||
| 5528 | __setup ("isolcpus=", isolated_cpu_setup); | 5625 | __setup ("isolcpus=", isolated_cpu_setup); |
| 5529 | 5626 | ||
| 5530 | /* | 5627 | /* |
| 5531 | * init_sched_build_groups takes an array of groups, the cpumask we wish | 5628 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
| 5532 | * to span, and a pointer to a function which identifies what group a CPU | 5629 | * to a function which identifies what group (along with sched group) a CPU |
| 5533 | * belongs to. The return value of group_fn must be a valid index into the | 5630 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS |
| 5534 | * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we | 5631 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
| 5535 | * keep track of groups covered with a cpumask_t). | ||
| 5536 | * | 5632 | * |
| 5537 | * init_sched_build_groups will build a circular linked list of the groups | 5633 | * init_sched_build_groups will build a circular linked list of the groups |
| 5538 | * covered by the given span, and will set each group's ->cpumask correctly, | 5634 | * covered by the given span, and will set each group's ->cpumask correctly, |
| 5539 | * and ->cpu_power to 0. | 5635 | * and ->cpu_power to 0. |
| 5540 | */ | 5636 | */ |
| 5541 | static void | 5637 | static void |
| 5542 | init_sched_build_groups(struct sched_group groups[], cpumask_t span, | 5638 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
| 5543 | const cpumask_t *cpu_map, | 5639 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
| 5544 | int (*group_fn)(int cpu, const cpumask_t *cpu_map)) | 5640 | struct sched_group **sg)) |
| 5545 | { | 5641 | { |
| 5546 | struct sched_group *first = NULL, *last = NULL; | 5642 | struct sched_group *first = NULL, *last = NULL; |
| 5547 | cpumask_t covered = CPU_MASK_NONE; | 5643 | cpumask_t covered = CPU_MASK_NONE; |
| 5548 | int i; | 5644 | int i; |
| 5549 | 5645 | ||
| 5550 | for_each_cpu_mask(i, span) { | 5646 | for_each_cpu_mask(i, span) { |
| 5551 | int group = group_fn(i, cpu_map); | 5647 | struct sched_group *sg; |
| 5552 | struct sched_group *sg = &groups[group]; | 5648 | int group = group_fn(i, cpu_map, &sg); |
| 5553 | int j; | 5649 | int j; |
| 5554 | 5650 | ||
| 5555 | if (cpu_isset(i, covered)) | 5651 | if (cpu_isset(i, covered)) |
| @@ -5559,7 +5655,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
| 5559 | sg->cpu_power = 0; | 5655 | sg->cpu_power = 0; |
| 5560 | 5656 | ||
| 5561 | for_each_cpu_mask(j, span) { | 5657 | for_each_cpu_mask(j, span) { |
| 5562 | if (group_fn(j, cpu_map) != group) | 5658 | if (group_fn(j, cpu_map, NULL) != group) |
| 5563 | continue; | 5659 | continue; |
| 5564 | 5660 | ||
| 5565 | cpu_set(j, covered); | 5661 | cpu_set(j, covered); |
| @@ -5733,8 +5829,9 @@ __setup("max_cache_size=", setup_max_cache_size); | |||
| 5733 | */ | 5829 | */ |
| 5734 | static void touch_cache(void *__cache, unsigned long __size) | 5830 | static void touch_cache(void *__cache, unsigned long __size) |
| 5735 | { | 5831 | { |
| 5736 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | 5832 | unsigned long size = __size / sizeof(long); |
| 5737 | chunk2 = 2*size/3; | 5833 | unsigned long chunk1 = size / 3; |
| 5834 | unsigned long chunk2 = 2 * size / 3; | ||
| 5738 | unsigned long *cache = __cache; | 5835 | unsigned long *cache = __cache; |
| 5739 | int i; | 5836 | int i; |
| 5740 | 5837 | ||
| @@ -5843,11 +5940,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
| 5843 | */ | 5940 | */ |
| 5844 | measure_one(cache, size, cpu1, cpu2); | 5941 | measure_one(cache, size, cpu1, cpu2); |
| 5845 | for (i = 0; i < ITERATIONS; i++) | 5942 | for (i = 0; i < ITERATIONS; i++) |
| 5846 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | 5943 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); |
| 5847 | 5944 | ||
| 5848 | measure_one(cache, size, cpu2, cpu1); | 5945 | measure_one(cache, size, cpu2, cpu1); |
| 5849 | for (i = 0; i < ITERATIONS; i++) | 5946 | for (i = 0; i < ITERATIONS; i++) |
| 5850 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | 5947 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); |
| 5851 | 5948 | ||
| 5852 | /* | 5949 | /* |
| 5853 | * (We measure the non-migrating [cached] cost on both | 5950 | * (We measure the non-migrating [cached] cost on both |
| @@ -5857,17 +5954,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
| 5857 | 5954 | ||
| 5858 | measure_one(cache, size, cpu1, cpu1); | 5955 | measure_one(cache, size, cpu1, cpu1); |
| 5859 | for (i = 0; i < ITERATIONS; i++) | 5956 | for (i = 0; i < ITERATIONS; i++) |
| 5860 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | 5957 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); |
| 5861 | 5958 | ||
| 5862 | measure_one(cache, size, cpu2, cpu2); | 5959 | measure_one(cache, size, cpu2, cpu2); |
| 5863 | for (i = 0; i < ITERATIONS; i++) | 5960 | for (i = 0; i < ITERATIONS; i++) |
| 5864 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | 5961 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); |
| 5865 | 5962 | ||
| 5866 | /* | 5963 | /* |
| 5867 | * Get the per-iteration migration cost: | 5964 | * Get the per-iteration migration cost: |
| 5868 | */ | 5965 | */ |
| 5869 | do_div(cost1, 2*ITERATIONS); | 5966 | do_div(cost1, 2 * ITERATIONS); |
| 5870 | do_div(cost2, 2*ITERATIONS); | 5967 | do_div(cost2, 2 * ITERATIONS); |
| 5871 | 5968 | ||
| 5872 | return cost1 - cost2; | 5969 | return cost1 - cost2; |
| 5873 | } | 5970 | } |
| @@ -5905,7 +6002,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
| 5905 | */ | 6002 | */ |
| 5906 | cache = vmalloc(max_size); | 6003 | cache = vmalloc(max_size); |
| 5907 | if (!cache) { | 6004 | if (!cache) { |
| 5908 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 6005 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); |
| 5909 | return 1000000; /* return 1 msec on very small boxen */ | 6006 | return 1000000; /* return 1 msec on very small boxen */ |
| 5910 | } | 6007 | } |
| 5911 | 6008 | ||
| @@ -5930,7 +6027,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
| 5930 | avg_fluct = (avg_fluct + fluct)/2; | 6027 | avg_fluct = (avg_fluct + fluct)/2; |
| 5931 | 6028 | ||
| 5932 | if (migration_debug) | 6029 | if (migration_debug) |
| 5933 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | 6030 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " |
| 6031 | "(%8Ld %8Ld)\n", | ||
| 5934 | cpu1, cpu2, size, | 6032 | cpu1, cpu2, size, |
| 5935 | (long)cost / 1000000, | 6033 | (long)cost / 1000000, |
| 5936 | ((long)cost / 100000) % 10, | 6034 | ((long)cost / 100000) % 10, |
| @@ -6025,20 +6123,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) | |||
| 6025 | -1 | 6123 | -1 |
| 6026 | #endif | 6124 | #endif |
| 6027 | ); | 6125 | ); |
| 6028 | if (system_state == SYSTEM_BOOTING) { | 6126 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { |
| 6029 | if (num_online_cpus() > 1) { | 6127 | printk("migration_cost="); |
| 6030 | printk("migration_cost="); | 6128 | for (distance = 0; distance <= max_distance; distance++) { |
| 6031 | for (distance = 0; distance <= max_distance; distance++) { | 6129 | if (distance) |
| 6032 | if (distance) | 6130 | printk(","); |
| 6033 | printk(","); | 6131 | printk("%ld", (long)migration_cost[distance] / 1000); |
| 6034 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
| 6035 | } | ||
| 6036 | printk("\n"); | ||
| 6037 | } | 6132 | } |
| 6133 | printk("\n"); | ||
| 6038 | } | 6134 | } |
| 6039 | j1 = jiffies; | 6135 | j1 = jiffies; |
| 6040 | if (migration_debug) | 6136 | if (migration_debug) |
| 6041 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | 6137 | printk("migration: %ld seconds\n", (j1-j0) / HZ); |
| 6042 | 6138 | ||
| 6043 | /* | 6139 | /* |
| 6044 | * Move back to the original CPU. NUMA-Q gets confused | 6140 | * Move back to the original CPU. NUMA-Q gets confused |
| @@ -6135,10 +6231,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
| 6135 | */ | 6231 | */ |
| 6136 | #ifdef CONFIG_SCHED_SMT | 6232 | #ifdef CONFIG_SCHED_SMT |
| 6137 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6233 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
| 6138 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6234 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
| 6139 | 6235 | ||
| 6140 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | 6236 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, |
| 6237 | struct sched_group **sg) | ||
| 6141 | { | 6238 | { |
| 6239 | if (sg) | ||
| 6240 | *sg = &per_cpu(sched_group_cpus, cpu); | ||
| 6142 | return cpu; | 6241 | return cpu; |
| 6143 | } | 6242 | } |
| 6144 | #endif | 6243 | #endif |
| @@ -6148,39 +6247,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | |||
| 6148 | */ | 6247 | */ |
| 6149 | #ifdef CONFIG_SCHED_MC | 6248 | #ifdef CONFIG_SCHED_MC |
| 6150 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6249 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
| 6151 | static struct sched_group sched_group_core[NR_CPUS]; | 6250 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
| 6152 | #endif | 6251 | #endif |
| 6153 | 6252 | ||
| 6154 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6253 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| 6155 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6254 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
| 6255 | struct sched_group **sg) | ||
| 6156 | { | 6256 | { |
| 6257 | int group; | ||
| 6157 | cpumask_t mask = cpu_sibling_map[cpu]; | 6258 | cpumask_t mask = cpu_sibling_map[cpu]; |
| 6158 | cpus_and(mask, mask, *cpu_map); | 6259 | cpus_and(mask, mask, *cpu_map); |
| 6159 | return first_cpu(mask); | 6260 | group = first_cpu(mask); |
| 6261 | if (sg) | ||
| 6262 | *sg = &per_cpu(sched_group_core, group); | ||
| 6263 | return group; | ||
| 6160 | } | 6264 | } |
| 6161 | #elif defined(CONFIG_SCHED_MC) | 6265 | #elif defined(CONFIG_SCHED_MC) |
| 6162 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6266 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
| 6267 | struct sched_group **sg) | ||
| 6163 | { | 6268 | { |
| 6269 | if (sg) | ||
| 6270 | *sg = &per_cpu(sched_group_core, cpu); | ||
| 6164 | return cpu; | 6271 | return cpu; |
| 6165 | } | 6272 | } |
| 6166 | #endif | 6273 | #endif |
| 6167 | 6274 | ||
| 6168 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6275 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
| 6169 | static struct sched_group sched_group_phys[NR_CPUS]; | 6276 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
| 6170 | 6277 | ||
| 6171 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) | 6278 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, |
| 6279 | struct sched_group **sg) | ||
| 6172 | { | 6280 | { |
| 6281 | int group; | ||
| 6173 | #ifdef CONFIG_SCHED_MC | 6282 | #ifdef CONFIG_SCHED_MC |
| 6174 | cpumask_t mask = cpu_coregroup_map(cpu); | 6283 | cpumask_t mask = cpu_coregroup_map(cpu); |
| 6175 | cpus_and(mask, mask, *cpu_map); | 6284 | cpus_and(mask, mask, *cpu_map); |
| 6176 | return first_cpu(mask); | 6285 | group = first_cpu(mask); |
| 6177 | #elif defined(CONFIG_SCHED_SMT) | 6286 | #elif defined(CONFIG_SCHED_SMT) |
| 6178 | cpumask_t mask = cpu_sibling_map[cpu]; | 6287 | cpumask_t mask = cpu_sibling_map[cpu]; |
| 6179 | cpus_and(mask, mask, *cpu_map); | 6288 | cpus_and(mask, mask, *cpu_map); |
| 6180 | return first_cpu(mask); | 6289 | group = first_cpu(mask); |
| 6181 | #else | 6290 | #else |
| 6182 | return cpu; | 6291 | group = cpu; |
| 6183 | #endif | 6292 | #endif |
| 6293 | if (sg) | ||
| 6294 | *sg = &per_cpu(sched_group_phys, group); | ||
| 6295 | return group; | ||
| 6184 | } | 6296 | } |
| 6185 | 6297 | ||
| 6186 | #ifdef CONFIG_NUMA | 6298 | #ifdef CONFIG_NUMA |
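The cpu_to_*_group() hunks above change the group_fn convention: each helper now returns the group's representative CPU and, when the caller passes a non-NULL struct sched_group **, also hands back a pointer into per-CPU group storage, replacing the old NR_CPUS-sized arrays. A stripped-down model of the new calling convention; NCPUS and the struct layout are illustrative only:

#include <stdio.h>

#define NCPUS 4

struct sched_group { int cpu_power; };

/* Per-"CPU" storage, standing in for DEFINE_PER_CPU(struct sched_group, ...). */
static struct sched_group sched_group_cpus[NCPUS];

/* New-style group_fn: returns the group's representative CPU and optionally
 * fills *sg; callers that only need the index pass sg == NULL. */
static int cpu_to_cpu_group(int cpu, struct sched_group **sg)
{
    if (sg)
        *sg = &sched_group_cpus[cpu];
    return cpu;
}

int main(void)
{
    struct sched_group *sg;
    int group = cpu_to_cpu_group(2, &sg);

    printf("group %d -> storage slot %td\n", group, sg - sched_group_cpus);
    printf("index-only call: %d\n", cpu_to_cpu_group(3, NULL));
    return 0;
}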
| @@ -6193,12 +6305,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); | |||
| 6193 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 6305 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
| 6194 | 6306 | ||
| 6195 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 6307 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
| 6196 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; | 6308 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
| 6197 | 6309 | ||
| 6198 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) | 6310 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
| 6311 | struct sched_group **sg) | ||
| 6199 | { | 6312 | { |
| 6200 | return cpu_to_node(cpu); | 6313 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
| 6314 | int group; | ||
| 6315 | |||
| 6316 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 6317 | group = first_cpu(nodemask); | ||
| 6318 | |||
| 6319 | if (sg) | ||
| 6320 | *sg = &per_cpu(sched_group_allnodes, group); | ||
| 6321 | return group; | ||
| 6201 | } | 6322 | } |
| 6323 | |||
| 6202 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6324 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
| 6203 | { | 6325 | { |
| 6204 | struct sched_group *sg = group_head; | 6326 | struct sched_group *sg = group_head; |
| @@ -6234,16 +6356,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
| 6234 | int cpu, i; | 6356 | int cpu, i; |
| 6235 | 6357 | ||
| 6236 | for_each_cpu_mask(cpu, *cpu_map) { | 6358 | for_each_cpu_mask(cpu, *cpu_map) { |
| 6237 | struct sched_group *sched_group_allnodes | ||
| 6238 | = sched_group_allnodes_bycpu[cpu]; | ||
| 6239 | struct sched_group **sched_group_nodes | 6359 | struct sched_group **sched_group_nodes |
| 6240 | = sched_group_nodes_bycpu[cpu]; | 6360 | = sched_group_nodes_bycpu[cpu]; |
| 6241 | 6361 | ||
| 6242 | if (sched_group_allnodes) { | ||
| 6243 | kfree(sched_group_allnodes); | ||
| 6244 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 6245 | } | ||
| 6246 | |||
| 6247 | if (!sched_group_nodes) | 6362 | if (!sched_group_nodes) |
| 6248 | continue; | 6363 | continue; |
| 6249 | 6364 | ||
| @@ -6337,7 +6452,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6337 | struct sched_domain *sd; | 6452 | struct sched_domain *sd; |
| 6338 | #ifdef CONFIG_NUMA | 6453 | #ifdef CONFIG_NUMA |
| 6339 | struct sched_group **sched_group_nodes = NULL; | 6454 | struct sched_group **sched_group_nodes = NULL; |
| 6340 | struct sched_group *sched_group_allnodes = NULL; | 6455 | int sd_allnodes = 0; |
| 6341 | 6456 | ||
| 6342 | /* | 6457 | /* |
| 6343 | * Allocate the per-node list of sched groups | 6458 | * Allocate the per-node list of sched groups |
| @@ -6355,7 +6470,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6355 | * Set up domains for cpus specified by the cpu_map. | 6470 | * Set up domains for cpus specified by the cpu_map. |
| 6356 | */ | 6471 | */ |
| 6357 | for_each_cpu_mask(i, *cpu_map) { | 6472 | for_each_cpu_mask(i, *cpu_map) { |
| 6358 | int group; | ||
| 6359 | struct sched_domain *sd = NULL, *p; | 6473 | struct sched_domain *sd = NULL, *p; |
| 6360 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6474 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
| 6361 | 6475 | ||
| @@ -6364,26 +6478,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6364 | #ifdef CONFIG_NUMA | 6478 | #ifdef CONFIG_NUMA |
| 6365 | if (cpus_weight(*cpu_map) | 6479 | if (cpus_weight(*cpu_map) |
| 6366 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6480 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
| 6367 | if (!sched_group_allnodes) { | ||
| 6368 | sched_group_allnodes | ||
| 6369 | = kmalloc_node(sizeof(struct sched_group) | ||
| 6370 | * MAX_NUMNODES, | ||
| 6371 | GFP_KERNEL, | ||
| 6372 | cpu_to_node(i)); | ||
| 6373 | if (!sched_group_allnodes) { | ||
| 6374 | printk(KERN_WARNING | ||
| 6375 | "Can not alloc allnodes sched group\n"); | ||
| 6376 | goto error; | ||
| 6377 | } | ||
| 6378 | sched_group_allnodes_bycpu[i] | ||
| 6379 | = sched_group_allnodes; | ||
| 6380 | } | ||
| 6381 | sd = &per_cpu(allnodes_domains, i); | 6481 | sd = &per_cpu(allnodes_domains, i); |
| 6382 | *sd = SD_ALLNODES_INIT; | 6482 | *sd = SD_ALLNODES_INIT; |
| 6383 | sd->span = *cpu_map; | 6483 | sd->span = *cpu_map; |
| 6384 | group = cpu_to_allnodes_group(i, cpu_map); | 6484 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
| 6385 | sd->groups = &sched_group_allnodes[group]; | ||
| 6386 | p = sd; | 6485 | p = sd; |
| 6486 | sd_allnodes = 1; | ||
| 6387 | } else | 6487 | } else |
| 6388 | p = NULL; | 6488 | p = NULL; |
| 6389 | 6489 | ||
| @@ -6398,36 +6498,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6398 | 6498 | ||
| 6399 | p = sd; | 6499 | p = sd; |
| 6400 | sd = &per_cpu(phys_domains, i); | 6500 | sd = &per_cpu(phys_domains, i); |
| 6401 | group = cpu_to_phys_group(i, cpu_map); | ||
| 6402 | *sd = SD_CPU_INIT; | 6501 | *sd = SD_CPU_INIT; |
| 6403 | sd->span = nodemask; | 6502 | sd->span = nodemask; |
| 6404 | sd->parent = p; | 6503 | sd->parent = p; |
| 6405 | if (p) | 6504 | if (p) |
| 6406 | p->child = sd; | 6505 | p->child = sd; |
| 6407 | sd->groups = &sched_group_phys[group]; | 6506 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
| 6408 | 6507 | ||
| 6409 | #ifdef CONFIG_SCHED_MC | 6508 | #ifdef CONFIG_SCHED_MC |
| 6410 | p = sd; | 6509 | p = sd; |
| 6411 | sd = &per_cpu(core_domains, i); | 6510 | sd = &per_cpu(core_domains, i); |
| 6412 | group = cpu_to_core_group(i, cpu_map); | ||
| 6413 | *sd = SD_MC_INIT; | 6511 | *sd = SD_MC_INIT; |
| 6414 | sd->span = cpu_coregroup_map(i); | 6512 | sd->span = cpu_coregroup_map(i); |
| 6415 | cpus_and(sd->span, sd->span, *cpu_map); | 6513 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6416 | sd->parent = p; | 6514 | sd->parent = p; |
| 6417 | p->child = sd; | 6515 | p->child = sd; |
| 6418 | sd->groups = &sched_group_core[group]; | 6516 | cpu_to_core_group(i, cpu_map, &sd->groups); |
| 6419 | #endif | 6517 | #endif |
| 6420 | 6518 | ||
| 6421 | #ifdef CONFIG_SCHED_SMT | 6519 | #ifdef CONFIG_SCHED_SMT |
| 6422 | p = sd; | 6520 | p = sd; |
| 6423 | sd = &per_cpu(cpu_domains, i); | 6521 | sd = &per_cpu(cpu_domains, i); |
| 6424 | group = cpu_to_cpu_group(i, cpu_map); | ||
| 6425 | *sd = SD_SIBLING_INIT; | 6522 | *sd = SD_SIBLING_INIT; |
| 6426 | sd->span = cpu_sibling_map[i]; | 6523 | sd->span = cpu_sibling_map[i]; |
| 6427 | cpus_and(sd->span, sd->span, *cpu_map); | 6524 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6428 | sd->parent = p; | 6525 | sd->parent = p; |
| 6429 | p->child = sd; | 6526 | p->child = sd; |
| 6430 | sd->groups = &sched_group_cpus[group]; | 6527 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
| 6431 | #endif | 6528 | #endif |
| 6432 | } | 6529 | } |
| 6433 | 6530 | ||
| @@ -6439,8 +6536,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6439 | if (i != first_cpu(this_sibling_map)) | 6536 | if (i != first_cpu(this_sibling_map)) |
| 6440 | continue; | 6537 | continue; |
| 6441 | 6538 | ||
| 6442 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | 6539 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); |
| 6443 | cpu_map, &cpu_to_cpu_group); | ||
| 6444 | } | 6540 | } |
| 6445 | #endif | 6541 | #endif |
| 6446 | 6542 | ||
| @@ -6451,8 +6547,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6451 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6547 | cpus_and(this_core_map, this_core_map, *cpu_map); |
| 6452 | if (i != first_cpu(this_core_map)) | 6548 | if (i != first_cpu(this_core_map)) |
| 6453 | continue; | 6549 | continue; |
| 6454 | init_sched_build_groups(sched_group_core, this_core_map, | 6550 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); |
| 6455 | cpu_map, &cpu_to_core_group); | ||
| 6456 | } | 6551 | } |
| 6457 | #endif | 6552 | #endif |
| 6458 | 6553 | ||
| @@ -6465,15 +6560,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6465 | if (cpus_empty(nodemask)) | 6560 | if (cpus_empty(nodemask)) |
| 6466 | continue; | 6561 | continue; |
| 6467 | 6562 | ||
| 6468 | init_sched_build_groups(sched_group_phys, nodemask, | 6563 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
| 6469 | cpu_map, &cpu_to_phys_group); | ||
| 6470 | } | 6564 | } |
| 6471 | 6565 | ||
| 6472 | #ifdef CONFIG_NUMA | 6566 | #ifdef CONFIG_NUMA |
| 6473 | /* Set up node groups */ | 6567 | /* Set up node groups */ |
| 6474 | if (sched_group_allnodes) | 6568 | if (sd_allnodes) |
| 6475 | init_sched_build_groups(sched_group_allnodes, *cpu_map, | 6569 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); |
| 6476 | cpu_map, &cpu_to_allnodes_group); | ||
| 6477 | 6570 | ||
| 6478 | for (i = 0; i < MAX_NUMNODES; i++) { | 6571 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6479 | /* Set up node groups */ | 6572 | /* Set up node groups */ |
| @@ -6565,10 +6658,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6565 | for (i = 0; i < MAX_NUMNODES; i++) | 6658 | for (i = 0; i < MAX_NUMNODES; i++) |
| 6566 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6659 | init_numa_sched_groups_power(sched_group_nodes[i]); |
| 6567 | 6660 | ||
| 6568 | if (sched_group_allnodes) { | 6661 | if (sd_allnodes) { |
| 6569 | int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); | 6662 | struct sched_group *sg; |
| 6570 | struct sched_group *sg = &sched_group_allnodes[group]; | ||
| 6571 | 6663 | ||
| 6664 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | ||
| 6572 | init_numa_sched_groups_power(sg); | 6665 | init_numa_sched_groups_power(sg); |
| 6573 | } | 6666 | } |
| 6574 | #endif | 6667 | #endif |
| @@ -6847,6 +6940,10 @@ void __init sched_init(void) | |||
| 6847 | 6940 | ||
| 6848 | set_load_weight(&init_task); | 6941 | set_load_weight(&init_task); |
| 6849 | 6942 | ||
| 6943 | #ifdef CONFIG_SMP | ||
| 6944 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | ||
| 6945 | #endif | ||
| 6946 | |||
| 6850 | #ifdef CONFIG_RT_MUTEXES | 6947 | #ifdef CONFIG_RT_MUTEXES |
| 6851 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 6948 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
| 6852 | #endif | 6949 | #endif |
| @@ -6882,6 +6979,8 @@ void __might_sleep(char *file, int line) | |||
| 6882 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6979 | printk("in_atomic():%d, irqs_disabled():%d\n", |
| 6883 | in_atomic(), irqs_disabled()); | 6980 | in_atomic(), irqs_disabled()); |
| 6884 | debug_show_held_locks(current); | 6981 | debug_show_held_locks(current); |
| 6982 | if (irqs_disabled()) | ||
| 6983 | print_irqtrace_events(current); | ||
| 6885 | dump_stack(); | 6984 | dump_stack(); |
| 6886 | } | 6985 | } |
| 6887 | #endif | 6986 | #endif |
diff --git a/kernel/signal.c b/kernel/signal.c index ec81defde339..5630255d2e2a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -24,6 +24,9 @@ | |||
| 24 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
| 25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
| 26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
| 27 | #include <linux/pid_namespace.h> | ||
| 28 | #include <linux/nsproxy.h> | ||
| 29 | |||
| 27 | #include <asm/param.h> | 30 | #include <asm/param.h> |
| 28 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
| 29 | #include <asm/unistd.h> | 32 | #include <asm/unistd.h> |
| @@ -583,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 583 | error = -EPERM; | 586 | error = -EPERM; |
| 584 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) | 587 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
| 585 | && ((sig != SIGCONT) || | 588 | && ((sig != SIGCONT) || |
| 586 | (current->signal->session != t->signal->session)) | 589 | (process_session(current) != process_session(t))) |
| 587 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 590 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
| 588 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 591 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
| 589 | && !capable(CAP_KILL)) | 592 | && !capable(CAP_KILL)) |
| @@ -1702,7 +1705,9 @@ finish_stop(int stop_count) | |||
| 1702 | read_unlock(&tasklist_lock); | 1705 | read_unlock(&tasklist_lock); |
| 1703 | } | 1706 | } |
| 1704 | 1707 | ||
| 1705 | schedule(); | 1708 | do { |
| 1709 | schedule(); | ||
| 1710 | } while (try_to_freeze()); | ||
| 1706 | /* | 1711 | /* |
| 1707 | * Now we don't run again until continued. | 1712 | * Now we don't run again until continued. |
| 1708 | */ | 1713 | */ |
| @@ -1877,8 +1882,12 @@ relock: | |||
| 1877 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ | 1882 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ |
| 1878 | continue; | 1883 | continue; |
| 1879 | 1884 | ||
| 1880 | /* Init gets no signals it doesn't want. */ | 1885 | /* |
| 1881 | if (current == child_reaper) | 1886 | * Init of a pid space gets no signals it doesn't want from |
| 1887 | * within that pid space. It can of course get signals from | ||
| 1888 | * its parent pid space. | ||
| 1889 | */ | ||
| 1890 | if (current == child_reaper(current)) | ||
| 1882 | continue; | 1891 | continue; |
| 1883 | 1892 | ||
| 1884 | if (sig_kernel_stop(signr)) { | 1893 | if (sig_kernel_stop(signr)) { |
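In the signal.c hunk above, finish_stop() loops schedule() through try_to_freeze(), so a stopped task that is woken only by the freezer (for suspend or hibernate) goes back to sleep after thawing instead of resuming early. A toy sketch of that loop shape, with both helpers faked:

#include <stdio.h>

/* Stand-ins: schedule() "sleeps" until woken; try_to_freeze() reports whether
 * the wakeup came from the freezer rather than a real continue. */
static int freezer_wakeups = 2;

static void schedule(void) { /* task sleeps here until woken */ }

static int try_to_freeze(void)
{
    if (freezer_wakeups > 0) {
        freezer_wakeups--;         /* pretend we froze, thawed, and returned */
        return 1;
    }
    return 0;
}

int main(void)
{
    int loops = 0;

    do {
        schedule();
        loops++;
    } while (try_to_freeze());     /* same shape as the finish_stop() change */

    printf("slept %d times before a real continue\n", loops);
    return 0;
}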
diff --git a/kernel/sys.c b/kernel/sys.c index a0c1a29a507f..c7675c1bfdf2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 1381 | 1381 | ||
| 1382 | if (p->real_parent == group_leader) { | 1382 | if (p->real_parent == group_leader) { |
| 1383 | err = -EPERM; | 1383 | err = -EPERM; |
| 1384 | if (p->signal->session != group_leader->signal->session) | 1384 | if (process_session(p) != process_session(group_leader)) |
| 1385 | goto out; | 1385 | goto out; |
| 1386 | err = -EACCES; | 1386 | err = -EACCES; |
| 1387 | if (p->did_exec) | 1387 | if (p->did_exec) |
| @@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 1397 | goto out; | 1397 | goto out; |
| 1398 | 1398 | ||
| 1399 | if (pgid != pid) { | 1399 | if (pgid != pid) { |
| 1400 | struct task_struct *p; | 1400 | struct task_struct *g = |
| 1401 | find_task_by_pid_type(PIDTYPE_PGID, pgid); | ||
| 1401 | 1402 | ||
| 1402 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1403 | if (!g || process_session(g) != process_session(group_leader)) |
| 1403 | if (p->signal->session == group_leader->signal->session) | 1404 | goto out; |
| 1404 | goto ok_pgid; | ||
| 1405 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | ||
| 1406 | goto out; | ||
| 1407 | } | 1405 | } |
| 1408 | 1406 | ||
| 1409 | ok_pgid: | ||
| 1410 | err = security_task_setpgid(p, pgid); | 1407 | err = security_task_setpgid(p, pgid); |
| 1411 | if (err) | 1408 | if (err) |
| 1412 | goto out; | 1409 | goto out; |
| @@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void) | |||
| 1459 | asmlinkage long sys_getsid(pid_t pid) | 1456 | asmlinkage long sys_getsid(pid_t pid) |
| 1460 | { | 1457 | { |
| 1461 | if (!pid) | 1458 | if (!pid) |
| 1462 | return current->signal->session; | 1459 | return process_session(current); |
| 1463 | else { | 1460 | else { |
| 1464 | int retval; | 1461 | int retval; |
| 1465 | struct task_struct *p; | 1462 | struct task_struct *p; |
| @@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid) | |||
| 1471 | if (p) { | 1468 | if (p) { |
| 1472 | retval = security_task_getsid(p); | 1469 | retval = security_task_getsid(p); |
| 1473 | if (!retval) | 1470 | if (!retval) |
| 1474 | retval = p->signal->session; | 1471 | retval = process_session(p); |
| 1475 | } | 1472 | } |
| 1476 | read_unlock(&tasklist_lock); | 1473 | read_unlock(&tasklist_lock); |
| 1477 | return retval; | 1474 | return retval; |
| @@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void) | |||
| 1484 | pid_t session; | 1481 | pid_t session; |
| 1485 | int err = -EPERM; | 1482 | int err = -EPERM; |
| 1486 | 1483 | ||
| 1487 | mutex_lock(&tty_mutex); | ||
| 1488 | write_lock_irq(&tasklist_lock); | 1484 | write_lock_irq(&tasklist_lock); |
| 1489 | 1485 | ||
| 1490 | /* Fail if I am already a session leader */ | 1486 | /* Fail if I am already a session leader */ |
| @@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void) | |||
| 1504 | 1500 | ||
| 1505 | group_leader->signal->leader = 1; | 1501 | group_leader->signal->leader = 1; |
| 1506 | __set_special_pids(session, session); | 1502 | __set_special_pids(session, session); |
| 1503 | |||
| 1504 | spin_lock(&group_leader->sighand->siglock); | ||
| 1507 | group_leader->signal->tty = NULL; | 1505 | group_leader->signal->tty = NULL; |
| 1508 | group_leader->signal->tty_old_pgrp = 0; | 1506 | group_leader->signal->tty_old_pgrp = 0; |
| 1507 | spin_unlock(&group_leader->sighand->siglock); | ||
| 1508 | |||
| 1509 | err = process_group(group_leader); | 1509 | err = process_group(group_leader); |
| 1510 | out: | 1510 | out: |
| 1511 | write_unlock_irq(&tasklist_lock); | 1511 | write_unlock_irq(&tasklist_lock); |
| 1512 | mutex_unlock(&tty_mutex); | ||
| 1513 | return err; | 1512 | return err; |
| 1514 | } | 1513 | } |
| 1515 | 1514 | ||
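The sys.c hunks above replace direct reads of p->signal->session with a process_session() helper and move the tty-field clearing in sys_setsid() under the sighand siglock instead of tty_mutex. The helper itself is defined outside this diff; as a rough illustration of why such a one-line accessor is worth having, a minimal userspace sketch (the struct layout and field name below are stand-ins, not the kernel's) could look like this:

	/* Illustrative only: a thin accessor in the style the sys.c hunks rely on.
	 * The real helper lives in a header outside this diff; the struct layout
	 * here is a stand-in, not the kernel's. */
	#include <stdio.h>

	struct signal_struct { int session; };          /* stand-in */
	struct task_struct  { struct signal_struct *signal; };

	static inline int process_session(struct task_struct *tsk)
	{
		/* One place to read the session id, so a later change (e.g. making
		 * it per-pid-namespace) only has to touch this helper. */
		return tsk->signal->session;
	}

	int main(void)
	{
		struct signal_struct sig = { .session = 42 };
		struct task_struct task = { .signal = &sig };
		printf("session: %d\n", process_session(&task));
		return 0;
	}

Funnelling every read through one helper is what lets later patches change how the session id is stored, for example per pid namespace, without revisiting each call site again.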
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e9f00fd6d18..600b33358ded 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -65,7 +65,6 @@ extern int sysctl_overcommit_memory; | |||
| 65 | extern int sysctl_overcommit_ratio; | 65 | extern int sysctl_overcommit_ratio; |
| 66 | extern int sysctl_panic_on_oom; | 66 | extern int sysctl_panic_on_oom; |
| 67 | extern int max_threads; | 67 | extern int max_threads; |
| 68 | extern int sysrq_enabled; | ||
| 69 | extern int core_uses_pid; | 68 | extern int core_uses_pid; |
| 70 | extern int suid_dumpable; | 69 | extern int suid_dumpable; |
| 71 | extern char core_pattern[]; | 70 | extern char core_pattern[]; |
| @@ -92,7 +91,9 @@ extern char modprobe_path[]; | |||
| 92 | extern int sg_big_buff; | 91 | extern int sg_big_buff; |
| 93 | #endif | 92 | #endif |
| 94 | #ifdef CONFIG_SYSVIPC | 93 | #ifdef CONFIG_SYSVIPC |
| 95 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 94 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
| 95 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
| 96 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
| 96 | void __user *buffer, size_t *lenp, loff_t *ppos); | 97 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 97 | #endif | 98 | #endif |
| 98 | 99 | ||
| @@ -131,12 +132,22 @@ extern int max_lock_depth; | |||
| 131 | 132 | ||
| 132 | #ifdef CONFIG_SYSCTL_SYSCALL | 133 | #ifdef CONFIG_SYSCTL_SYSCALL |
| 133 | static int parse_table(int __user *, int, void __user *, size_t __user *, | 134 | static int parse_table(int __user *, int, void __user *, size_t __user *, |
| 134 | void __user *, size_t, ctl_table *, void **); | 135 | void __user *, size_t, ctl_table *); |
| 135 | #endif | 136 | #endif |
| 136 | 137 | ||
| 137 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 138 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
| 138 | void __user *buffer, size_t *lenp, loff_t *ppos); | 139 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 139 | 140 | ||
| 141 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
| 142 | void __user *oldval, size_t __user *oldlenp, | ||
| 143 | void __user *newval, size_t newlen); | ||
| 144 | |||
| 145 | #ifdef CONFIG_SYSVIPC | ||
| 146 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
| 147 | void __user *oldval, size_t __user *oldlenp, | ||
| 148 | void __user *newval, size_t newlen); | ||
| 149 | #endif | ||
| 150 | |||
| 140 | #ifdef CONFIG_PROC_SYSCTL | 151 | #ifdef CONFIG_PROC_SYSCTL |
| 141 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 152 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
| 142 | void __user *buffer, size_t *lenp, loff_t *ppos); | 153 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -163,6 +174,40 @@ extern ctl_table inotify_table[]; | |||
| 163 | int sysctl_legacy_va_layout; | 174 | int sysctl_legacy_va_layout; |
| 164 | #endif | 175 | #endif |
| 165 | 176 | ||
| 177 | static void *get_uts(ctl_table *table, int write) | ||
| 178 | { | ||
| 179 | char *which = table->data; | ||
| 180 | #ifdef CONFIG_UTS_NS | ||
| 181 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
| 182 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
| 183 | #endif | ||
| 184 | if (!write) | ||
| 185 | down_read(&uts_sem); | ||
| 186 | else | ||
| 187 | down_write(&uts_sem); | ||
| 188 | return which; | ||
| 189 | } | ||
| 190 | |||
| 191 | static void put_uts(ctl_table *table, int write, void *which) | ||
| 192 | { | ||
| 193 | if (!write) | ||
| 194 | up_read(&uts_sem); | ||
| 195 | else | ||
| 196 | up_write(&uts_sem); | ||
| 197 | } | ||
| 198 | |||
| 199 | #ifdef CONFIG_SYSVIPC | ||
| 200 | static void *get_ipc(ctl_table *table, int write) | ||
| 201 | { | ||
| 202 | char *which = table->data; | ||
| 203 | struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; | ||
| 204 | which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; | ||
| 205 | return which; | ||
| 206 | } | ||
| 207 | #else | ||
| 208 | #define get_ipc(T,W) ((T)->data) | ||
| 209 | #endif | ||
| 210 | |||
| 166 | /* /proc declarations: */ | 211 | /* /proc declarations: */ |
| 167 | 212 | ||
| 168 | #ifdef CONFIG_PROC_SYSCTL | 213 | #ifdef CONFIG_PROC_SYSCTL |
| @@ -229,7 +274,6 @@ static ctl_table root_table[] = { | |||
| 229 | }; | 274 | }; |
| 230 | 275 | ||
| 231 | static ctl_table kern_table[] = { | 276 | static ctl_table kern_table[] = { |
| 232 | #ifndef CONFIG_UTS_NS | ||
| 233 | { | 277 | { |
| 234 | .ctl_name = KERN_OSTYPE, | 278 | .ctl_name = KERN_OSTYPE, |
| 235 | .procname = "ostype", | 279 | .procname = "ostype", |
| @@ -237,7 +281,7 @@ static ctl_table kern_table[] = { | |||
| 237 | .maxlen = sizeof(init_uts_ns.name.sysname), | 281 | .maxlen = sizeof(init_uts_ns.name.sysname), |
| 238 | .mode = 0444, | 282 | .mode = 0444, |
| 239 | .proc_handler = &proc_do_uts_string, | 283 | .proc_handler = &proc_do_uts_string, |
| 240 | .strategy = &sysctl_string, | 284 | .strategy = &sysctl_uts_string, |
| 241 | }, | 285 | }, |
| 242 | { | 286 | { |
| 243 | .ctl_name = KERN_OSRELEASE, | 287 | .ctl_name = KERN_OSRELEASE, |
| @@ -246,7 +290,7 @@ static ctl_table kern_table[] = { | |||
| 246 | .maxlen = sizeof(init_uts_ns.name.release), | 290 | .maxlen = sizeof(init_uts_ns.name.release), |
| 247 | .mode = 0444, | 291 | .mode = 0444, |
| 248 | .proc_handler = &proc_do_uts_string, | 292 | .proc_handler = &proc_do_uts_string, |
| 249 | .strategy = &sysctl_string, | 293 | .strategy = &sysctl_uts_string, |
| 250 | }, | 294 | }, |
| 251 | { | 295 | { |
| 252 | .ctl_name = KERN_VERSION, | 296 | .ctl_name = KERN_VERSION, |
| @@ -255,7 +299,7 @@ static ctl_table kern_table[] = { | |||
| 255 | .maxlen = sizeof(init_uts_ns.name.version), | 299 | .maxlen = sizeof(init_uts_ns.name.version), |
| 256 | .mode = 0444, | 300 | .mode = 0444, |
| 257 | .proc_handler = &proc_do_uts_string, | 301 | .proc_handler = &proc_do_uts_string, |
| 258 | .strategy = &sysctl_string, | 302 | .strategy = &sysctl_uts_string, |
| 259 | }, | 303 | }, |
| 260 | { | 304 | { |
| 261 | .ctl_name = KERN_NODENAME, | 305 | .ctl_name = KERN_NODENAME, |
| @@ -264,7 +308,7 @@ static ctl_table kern_table[] = { | |||
| 264 | .maxlen = sizeof(init_uts_ns.name.nodename), | 308 | .maxlen = sizeof(init_uts_ns.name.nodename), |
| 265 | .mode = 0644, | 309 | .mode = 0644, |
| 266 | .proc_handler = &proc_do_uts_string, | 310 | .proc_handler = &proc_do_uts_string, |
| 267 | .strategy = &sysctl_string, | 311 | .strategy = &sysctl_uts_string, |
| 268 | }, | 312 | }, |
| 269 | { | 313 | { |
| 270 | .ctl_name = KERN_DOMAINNAME, | 314 | .ctl_name = KERN_DOMAINNAME, |
| @@ -273,56 +317,8 @@ static ctl_table kern_table[] = { | |||
| 273 | .maxlen = sizeof(init_uts_ns.name.domainname), | 317 | .maxlen = sizeof(init_uts_ns.name.domainname), |
| 274 | .mode = 0644, | 318 | .mode = 0644, |
| 275 | .proc_handler = &proc_do_uts_string, | 319 | .proc_handler = &proc_do_uts_string, |
| 276 | .strategy = &sysctl_string, | 320 | .strategy = &sysctl_uts_string, |
| 277 | }, | ||
| 278 | #else /* !CONFIG_UTS_NS */ | ||
| 279 | { | ||
| 280 | .ctl_name = KERN_OSTYPE, | ||
| 281 | .procname = "ostype", | ||
| 282 | .data = NULL, | ||
| 283 | /* could maybe use __NEW_UTS_LEN here? */ | ||
| 284 | .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), | ||
| 285 | .mode = 0444, | ||
| 286 | .proc_handler = &proc_do_uts_string, | ||
| 287 | .strategy = &sysctl_string, | ||
| 288 | }, | ||
| 289 | { | ||
| 290 | .ctl_name = KERN_OSRELEASE, | ||
| 291 | .procname = "osrelease", | ||
| 292 | .data = NULL, | ||
| 293 | .maxlen = FIELD_SIZEOF(struct new_utsname, release), | ||
| 294 | .mode = 0444, | ||
| 295 | .proc_handler = &proc_do_uts_string, | ||
| 296 | .strategy = &sysctl_string, | ||
| 297 | }, | ||
| 298 | { | ||
| 299 | .ctl_name = KERN_VERSION, | ||
| 300 | .procname = "version", | ||
| 301 | .data = NULL, | ||
| 302 | .maxlen = FIELD_SIZEOF(struct new_utsname, version), | ||
| 303 | .mode = 0444, | ||
| 304 | .proc_handler = &proc_do_uts_string, | ||
| 305 | .strategy = &sysctl_string, | ||
| 306 | }, | ||
| 307 | { | ||
| 308 | .ctl_name = KERN_NODENAME, | ||
| 309 | .procname = "hostname", | ||
| 310 | .data = NULL, | ||
| 311 | .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), | ||
| 312 | .mode = 0644, | ||
| 313 | .proc_handler = &proc_do_uts_string, | ||
| 314 | .strategy = &sysctl_string, | ||
| 315 | }, | ||
| 316 | { | ||
| 317 | .ctl_name = KERN_DOMAINNAME, | ||
| 318 | .procname = "domainname", | ||
| 319 | .data = NULL, | ||
| 320 | .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), | ||
| 321 | .mode = 0644, | ||
| 322 | .proc_handler = &proc_do_uts_string, | ||
| 323 | .strategy = &sysctl_string, | ||
| 324 | }, | 321 | }, |
| 325 | #endif /* !CONFIG_UTS_NS */ | ||
| 326 | { | 322 | { |
| 327 | .ctl_name = KERN_PANIC, | 323 | .ctl_name = KERN_PANIC, |
| 328 | .procname = "panic", | 324 | .procname = "panic", |
| @@ -481,65 +477,72 @@ static ctl_table kern_table[] = { | |||
| 481 | { | 477 | { |
| 482 | .ctl_name = KERN_SHMMAX, | 478 | .ctl_name = KERN_SHMMAX, |
| 483 | .procname = "shmmax", | 479 | .procname = "shmmax", |
| 484 | .data = NULL, | 480 | .data = &init_ipc_ns.shm_ctlmax, |
| 485 | .maxlen = sizeof (size_t), | 481 | .maxlen = sizeof (init_ipc_ns.shm_ctlmax), |
| 486 | .mode = 0644, | 482 | .mode = 0644, |
| 487 | .proc_handler = &proc_do_ipc_string, | 483 | .proc_handler = &proc_ipc_doulongvec_minmax, |
| 484 | .strategy = sysctl_ipc_data, | ||
| 488 | }, | 485 | }, |
| 489 | { | 486 | { |
| 490 | .ctl_name = KERN_SHMALL, | 487 | .ctl_name = KERN_SHMALL, |
| 491 | .procname = "shmall", | 488 | .procname = "shmall", |
| 492 | .data = NULL, | 489 | .data = &init_ipc_ns.shm_ctlall, |
| 493 | .maxlen = sizeof (size_t), | 490 | .maxlen = sizeof (init_ipc_ns.shm_ctlall), |
| 494 | .mode = 0644, | 491 | .mode = 0644, |
| 495 | .proc_handler = &proc_do_ipc_string, | 492 | .proc_handler = &proc_ipc_doulongvec_minmax, |
| 493 | .strategy = sysctl_ipc_data, | ||
| 496 | }, | 494 | }, |
| 497 | { | 495 | { |
| 498 | .ctl_name = KERN_SHMMNI, | 496 | .ctl_name = KERN_SHMMNI, |
| 499 | .procname = "shmmni", | 497 | .procname = "shmmni", |
| 500 | .data = NULL, | 498 | .data = &init_ipc_ns.shm_ctlmni, |
| 501 | .maxlen = sizeof (int), | 499 | .maxlen = sizeof (init_ipc_ns.shm_ctlmni), |
| 502 | .mode = 0644, | 500 | .mode = 0644, |
| 503 | .proc_handler = &proc_do_ipc_string, | 501 | .proc_handler = &proc_ipc_dointvec, |
| 502 | .strategy = sysctl_ipc_data, | ||
| 504 | }, | 503 | }, |
| 505 | { | 504 | { |
| 506 | .ctl_name = KERN_MSGMAX, | 505 | .ctl_name = KERN_MSGMAX, |
| 507 | .procname = "msgmax", | 506 | .procname = "msgmax", |
| 508 | .data = NULL, | 507 | .data = &init_ipc_ns.msg_ctlmax, |
| 509 | .maxlen = sizeof (int), | 508 | .maxlen = sizeof (init_ipc_ns.msg_ctlmax), |
| 510 | .mode = 0644, | 509 | .mode = 0644, |
| 511 | .proc_handler = &proc_do_ipc_string, | 510 | .proc_handler = &proc_ipc_dointvec, |
| 511 | .strategy = sysctl_ipc_data, | ||
| 512 | }, | 512 | }, |
| 513 | { | 513 | { |
| 514 | .ctl_name = KERN_MSGMNI, | 514 | .ctl_name = KERN_MSGMNI, |
| 515 | .procname = "msgmni", | 515 | .procname = "msgmni", |
| 516 | .data = NULL, | 516 | .data = &init_ipc_ns.msg_ctlmni, |
| 517 | .maxlen = sizeof (int), | 517 | .maxlen = sizeof (init_ipc_ns.msg_ctlmni), |
| 518 | .mode = 0644, | 518 | .mode = 0644, |
| 519 | .proc_handler = &proc_do_ipc_string, | 519 | .proc_handler = &proc_ipc_dointvec, |
| 520 | .strategy = sysctl_ipc_data, | ||
| 520 | }, | 521 | }, |
| 521 | { | 522 | { |
| 522 | .ctl_name = KERN_MSGMNB, | 523 | .ctl_name = KERN_MSGMNB, |
| 523 | .procname = "msgmnb", | 524 | .procname = "msgmnb", |
| 524 | .data = NULL, | 525 | .data = &init_ipc_ns.msg_ctlmnb, |
| 525 | .maxlen = sizeof (int), | 526 | .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), |
| 526 | .mode = 0644, | 527 | .mode = 0644, |
| 527 | .proc_handler = &proc_do_ipc_string, | 528 | .proc_handler = &proc_ipc_dointvec, |
| 529 | .strategy = sysctl_ipc_data, | ||
| 528 | }, | 530 | }, |
| 529 | { | 531 | { |
| 530 | .ctl_name = KERN_SEM, | 532 | .ctl_name = KERN_SEM, |
| 531 | .procname = "sem", | 533 | .procname = "sem", |
| 532 | .data = NULL, | 534 | .data = &init_ipc_ns.sem_ctls, |
| 533 | .maxlen = 4*sizeof (int), | 535 | .maxlen = 4*sizeof (int), |
| 534 | .mode = 0644, | 536 | .mode = 0644, |
| 535 | .proc_handler = &proc_do_ipc_string, | 537 | .proc_handler = &proc_ipc_dointvec, |
| 538 | .strategy = sysctl_ipc_data, | ||
| 536 | }, | 539 | }, |
| 537 | #endif | 540 | #endif |
| 538 | #ifdef CONFIG_MAGIC_SYSRQ | 541 | #ifdef CONFIG_MAGIC_SYSRQ |
| 539 | { | 542 | { |
| 540 | .ctl_name = KERN_SYSRQ, | 543 | .ctl_name = KERN_SYSRQ, |
| 541 | .procname = "sysrq", | 544 | .procname = "sysrq", |
| 542 | .data = &sysrq_enabled, | 545 | .data = &__sysrq_enabled, |
| 543 | .maxlen = sizeof (int), | 546 | .maxlen = sizeof (int), |
| 544 | .mode = 0644, | 547 | .mode = 0644, |
| 545 | .proc_handler = &proc_dointvec, | 548 | .proc_handler = &proc_dointvec, |
| @@ -1239,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
| 1239 | do { | 1242 | do { |
| 1240 | struct ctl_table_header *head = | 1243 | struct ctl_table_header *head = |
| 1241 | list_entry(tmp, struct ctl_table_header, ctl_entry); | 1244 | list_entry(tmp, struct ctl_table_header, ctl_entry); |
| 1242 | void *context = NULL; | ||
| 1243 | 1245 | ||
| 1244 | if (!use_table(head)) | 1246 | if (!use_table(head)) |
| 1245 | continue; | 1247 | continue; |
| @@ -1247,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
| 1247 | spin_unlock(&sysctl_lock); | 1249 | spin_unlock(&sysctl_lock); |
| 1248 | 1250 | ||
| 1249 | error = parse_table(name, nlen, oldval, oldlenp, | 1251 | error = parse_table(name, nlen, oldval, oldlenp, |
| 1250 | newval, newlen, head->ctl_table, | 1252 | newval, newlen, head->ctl_table); |
| 1251 | &context); | ||
| 1252 | kfree(context); | ||
| 1253 | 1253 | ||
| 1254 | spin_lock(&sysctl_lock); | 1254 | spin_lock(&sysctl_lock); |
| 1255 | unuse_table(head); | 1255 | unuse_table(head); |
| @@ -1305,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op) | |||
| 1305 | static int parse_table(int __user *name, int nlen, | 1305 | static int parse_table(int __user *name, int nlen, |
| 1306 | void __user *oldval, size_t __user *oldlenp, | 1306 | void __user *oldval, size_t __user *oldlenp, |
| 1307 | void __user *newval, size_t newlen, | 1307 | void __user *newval, size_t newlen, |
| 1308 | ctl_table *table, void **context) | 1308 | ctl_table *table) |
| 1309 | { | 1309 | { |
| 1310 | int n; | 1310 | int n; |
| 1311 | repeat: | 1311 | repeat: |
| @@ -1325,7 +1325,7 @@ repeat: | |||
| 1325 | error = table->strategy( | 1325 | error = table->strategy( |
| 1326 | table, name, nlen, | 1326 | table, name, nlen, |
| 1327 | oldval, oldlenp, | 1327 | oldval, oldlenp, |
| 1328 | newval, newlen, context); | 1328 | newval, newlen); |
| 1329 | if (error) | 1329 | if (error) |
| 1330 | return error; | 1330 | return error; |
| 1331 | } | 1331 | } |
| @@ -1336,7 +1336,7 @@ repeat: | |||
| 1336 | } | 1336 | } |
| 1337 | error = do_sysctl_strategy(table, name, nlen, | 1337 | error = do_sysctl_strategy(table, name, nlen, |
| 1338 | oldval, oldlenp, | 1338 | oldval, oldlenp, |
| 1339 | newval, newlen, context); | 1339 | newval, newlen); |
| 1340 | return error; | 1340 | return error; |
| 1341 | } | 1341 | } |
| 1342 | } | 1342 | } |
| @@ -1347,7 +1347,7 @@ repeat: | |||
| 1347 | int do_sysctl_strategy (ctl_table *table, | 1347 | int do_sysctl_strategy (ctl_table *table, |
| 1348 | int __user *name, int nlen, | 1348 | int __user *name, int nlen, |
| 1349 | void __user *oldval, size_t __user *oldlenp, | 1349 | void __user *oldval, size_t __user *oldlenp, |
| 1350 | void __user *newval, size_t newlen, void **context) | 1350 | void __user *newval, size_t newlen) |
| 1351 | { | 1351 | { |
| 1352 | int op = 0, rc; | 1352 | int op = 0, rc; |
| 1353 | size_t len; | 1353 | size_t len; |
| @@ -1361,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
| 1361 | 1361 | ||
| 1362 | if (table->strategy) { | 1362 | if (table->strategy) { |
| 1363 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | 1363 | rc = table->strategy(table, name, nlen, oldval, oldlenp, |
| 1364 | newval, newlen, context); | 1364 | newval, newlen); |
| 1365 | if (rc < 0) | 1365 | if (rc < 0) |
| 1366 | return rc; | 1366 | return rc; |
| 1367 | if (rc > 0) | 1367 | if (rc > 0) |
| @@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | |||
| 1614 | size_t count, loff_t *ppos) | 1614 | size_t count, loff_t *ppos) |
| 1615 | { | 1615 | { |
| 1616 | int op; | 1616 | int op; |
| 1617 | struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); | 1617 | struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); |
| 1618 | struct ctl_table *table; | 1618 | struct ctl_table *table; |
| 1619 | size_t res; | 1619 | size_t res; |
| 1620 | ssize_t error = -ENOTDIR; | 1620 | ssize_t error = -ENOTDIR; |
| @@ -1753,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
| 1753 | * Special case of dostring for the UTS structure. This has locks | 1753 | * Special case of dostring for the UTS structure. This has locks |
| 1754 | * to observe. Should this be in kernel/sys.c ???? | 1754 | * to observe. Should this be in kernel/sys.c ???? |
| 1755 | */ | 1755 | */ |
| 1756 | |||
| 1757 | #ifndef CONFIG_UTS_NS | ||
| 1758 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
| 1759 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 1760 | { | ||
| 1761 | int r; | ||
| 1762 | 1756 | ||
| 1763 | if (!write) { | ||
| 1764 | down_read(&uts_sem); | ||
| 1765 | r=proc_dostring(table,0,filp,buffer,lenp, ppos); | ||
| 1766 | up_read(&uts_sem); | ||
| 1767 | } else { | ||
| 1768 | down_write(&uts_sem); | ||
| 1769 | r=proc_dostring(table,1,filp,buffer,lenp, ppos); | ||
| 1770 | up_write(&uts_sem); | ||
| 1771 | } | ||
| 1772 | return r; | ||
| 1773 | } | ||
| 1774 | #else /* !CONFIG_UTS_NS */ | ||
| 1775 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 1757 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
| 1776 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1758 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 1777 | { | 1759 | { |
| 1778 | int r; | 1760 | int r; |
| 1779 | struct uts_namespace* uts_ns = current->nsproxy->uts_ns; | 1761 | void *which; |
| 1780 | char* which; | 1762 | which = get_uts(table, write); |
| 1781 | 1763 | r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); | |
| 1782 | switch (table->ctl_name) { | 1764 | put_uts(table, write, which); |
| 1783 | case KERN_OSTYPE: | ||
| 1784 | which = uts_ns->name.sysname; | ||
| 1785 | break; | ||
| 1786 | case KERN_NODENAME: | ||
| 1787 | which = uts_ns->name.nodename; | ||
| 1788 | break; | ||
| 1789 | case KERN_OSRELEASE: | ||
| 1790 | which = uts_ns->name.release; | ||
| 1791 | break; | ||
| 1792 | case KERN_VERSION: | ||
| 1793 | which = uts_ns->name.version; | ||
| 1794 | break; | ||
| 1795 | case KERN_DOMAINNAME: | ||
| 1796 | which = uts_ns->name.domainname; | ||
| 1797 | break; | ||
| 1798 | default: | ||
| 1799 | r = -EINVAL; | ||
| 1800 | goto out; | ||
| 1801 | } | ||
| 1802 | |||
| 1803 | if (!write) { | ||
| 1804 | down_read(&uts_sem); | ||
| 1805 | r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); | ||
| 1806 | up_read(&uts_sem); | ||
| 1807 | } else { | ||
| 1808 | down_write(&uts_sem); | ||
| 1809 | r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); | ||
| 1810 | up_write(&uts_sem); | ||
| 1811 | } | ||
| 1812 | out: | ||
| 1813 | return r; | 1765 | return r; |
| 1814 | } | 1766 | } |
| 1815 | #endif /* !CONFIG_UTS_NS */ | ||
| 1816 | 1767 | ||
| 1817 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | 1768 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, |
| 1818 | int *valp, | 1769 | int *valp, |
| @@ -1976,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, | |||
| 1976 | 1927 | ||
| 1977 | #define OP_SET 0 | 1928 | #define OP_SET 0 |
| 1978 | #define OP_AND 1 | 1929 | #define OP_AND 1 |
| 1979 | #define OP_OR 2 | ||
| 1980 | #define OP_MAX 3 | ||
| 1981 | #define OP_MIN 4 | ||
| 1982 | 1930 | ||
| 1983 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | 1931 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, |
| 1984 | int *valp, | 1932 | int *valp, |
| @@ -1990,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | |||
| 1990 | switch(op) { | 1938 | switch(op) { |
| 1991 | case OP_SET: *valp = val; break; | 1939 | case OP_SET: *valp = val; break; |
| 1992 | case OP_AND: *valp &= val; break; | 1940 | case OP_AND: *valp &= val; break; |
| 1993 | case OP_OR: *valp |= val; break; | ||
| 1994 | case OP_MAX: if(*valp < val) | ||
| 1995 | *valp = val; | ||
| 1996 | break; | ||
| 1997 | case OP_MIN: if(*valp > val) | ||
| 1998 | *valp = val; | ||
| 1999 | break; | ||
| 2000 | } | 1941 | } |
| 2001 | } else { | 1942 | } else { |
| 2002 | int val = *valp; | 1943 | int val = *valp; |
| @@ -2391,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | |||
| 2391 | } | 2332 | } |
| 2392 | 2333 | ||
| 2393 | #ifdef CONFIG_SYSVIPC | 2334 | #ifdef CONFIG_SYSVIPC |
| 2394 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 2335 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
| 2395 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2336 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2396 | { | 2337 | { |
| 2397 | void *data; | 2338 | void *which; |
| 2398 | struct ipc_namespace *ns; | 2339 | which = get_ipc(table, write); |
| 2399 | 2340 | return __do_proc_dointvec(which, table, write, filp, buffer, | |
| 2400 | ns = current->nsproxy->ipc_ns; | ||
| 2401 | |||
| 2402 | switch (table->ctl_name) { | ||
| 2403 | case KERN_SHMMAX: | ||
| 2404 | data = &ns->shm_ctlmax; | ||
| 2405 | goto proc_minmax; | ||
| 2406 | case KERN_SHMALL: | ||
| 2407 | data = &ns->shm_ctlall; | ||
| 2408 | goto proc_minmax; | ||
| 2409 | case KERN_SHMMNI: | ||
| 2410 | data = &ns->shm_ctlmni; | ||
| 2411 | break; | ||
| 2412 | case KERN_MSGMAX: | ||
| 2413 | data = &ns->msg_ctlmax; | ||
| 2414 | break; | ||
| 2415 | case KERN_MSGMNI: | ||
| 2416 | data = &ns->msg_ctlmni; | ||
| 2417 | break; | ||
| 2418 | case KERN_MSGMNB: | ||
| 2419 | data = &ns->msg_ctlmnb; | ||
| 2420 | break; | ||
| 2421 | case KERN_SEM: | ||
| 2422 | data = &ns->sem_ctls; | ||
| 2423 | break; | ||
| 2424 | default: | ||
| 2425 | return -EINVAL; | ||
| 2426 | } | ||
| 2427 | |||
| 2428 | return __do_proc_dointvec(data, table, write, filp, buffer, | ||
| 2429 | lenp, ppos, NULL, NULL); | 2341 | lenp, ppos, NULL, NULL); |
| 2430 | proc_minmax: | 2342 | } |
| 2431 | return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, | 2343 | |
| 2344 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
| 2345 | struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2346 | { | ||
| 2347 | void *which; | ||
| 2348 | which = get_ipc(table, write); | ||
| 2349 | return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, | ||
| 2432 | lenp, ppos, 1l, 1l); | 2350 | lenp, ppos, 1l, 1l); |
| 2433 | } | 2351 | } |
| 2352 | |||
| 2434 | #endif | 2353 | #endif |
| 2435 | 2354 | ||
| 2436 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 2355 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
| @@ -2475,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | |||
| 2475 | { | 2394 | { |
| 2476 | return -ENOSYS; | 2395 | return -ENOSYS; |
| 2477 | } | 2396 | } |
| 2397 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
| 2398 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2399 | { | ||
| 2400 | return -ENOSYS; | ||
| 2401 | } | ||
| 2402 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
| 2403 | struct file *filp, void __user *buffer, | ||
| 2404 | size_t *lenp, loff_t *ppos) | ||
| 2405 | { | ||
| 2406 | return -ENOSYS; | ||
| 2407 | } | ||
| 2478 | #endif | 2408 | #endif |
| 2479 | 2409 | ||
| 2480 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | 2410 | int proc_dointvec(ctl_table *table, int write, struct file *filp, |
| @@ -2539,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | |||
| 2539 | /* The generic string strategy routine: */ | 2469 | /* The generic string strategy routine: */ |
| 2540 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2470 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
| 2541 | void __user *oldval, size_t __user *oldlenp, | 2471 | void __user *oldval, size_t __user *oldlenp, |
| 2542 | void __user *newval, size_t newlen, void **context) | 2472 | void __user *newval, size_t newlen) |
| 2543 | { | 2473 | { |
| 2544 | if (!table->data || !table->maxlen) | 2474 | if (!table->data || !table->maxlen) |
| 2545 | return -ENOTDIR; | 2475 | return -ENOTDIR; |
| @@ -2585,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
| 2585 | */ | 2515 | */ |
| 2586 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2516 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
| 2587 | void __user *oldval, size_t __user *oldlenp, | 2517 | void __user *oldval, size_t __user *oldlenp, |
| 2588 | void __user *newval, size_t newlen, void **context) | 2518 | void __user *newval, size_t newlen) |
| 2589 | { | 2519 | { |
| 2590 | 2520 | ||
| 2591 | if (newval && newlen) { | 2521 | if (newval && newlen) { |
| @@ -2621,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | |||
| 2621 | /* Strategy function to convert jiffies to seconds */ | 2551 | /* Strategy function to convert jiffies to seconds */ |
| 2622 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2552 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2623 | void __user *oldval, size_t __user *oldlenp, | 2553 | void __user *oldval, size_t __user *oldlenp, |
| 2624 | void __user *newval, size_t newlen, void **context) | 2554 | void __user *newval, size_t newlen) |
| 2625 | { | 2555 | { |
| 2626 | if (oldval) { | 2556 | if (oldval) { |
| 2627 | size_t olen; | 2557 | size_t olen; |
| @@ -2649,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | |||
| 2649 | /* Strategy function to convert jiffies to seconds */ | 2579 | /* Strategy function to convert jiffies to seconds */ |
| 2650 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2580 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2651 | void __user *oldval, size_t __user *oldlenp, | 2581 | void __user *oldval, size_t __user *oldlenp, |
| 2652 | void __user *newval, size_t newlen, void **context) | 2582 | void __user *newval, size_t newlen) |
| 2653 | { | 2583 | { |
| 2654 | if (oldval) { | 2584 | if (oldval) { |
| 2655 | size_t olen; | 2585 | size_t olen; |
| @@ -2674,6 +2604,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
| 2674 | return 1; | 2604 | return 1; |
| 2675 | } | 2605 | } |
| 2676 | 2606 | ||
| 2607 | |||
| 2608 | /* The generic string strategy routine: */ | ||
| 2609 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
| 2610 | void __user *oldval, size_t __user *oldlenp, | ||
| 2611 | void __user *newval, size_t newlen) | ||
| 2612 | { | ||
| 2613 | struct ctl_table uts_table; | ||
| 2614 | int r, write; | ||
| 2615 | write = newval && newlen; | ||
| 2616 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
| 2617 | uts_table.data = get_uts(table, write); | ||
| 2618 | r = sysctl_string(&uts_table, name, nlen, | ||
| 2619 | oldval, oldlenp, newval, newlen); | ||
| 2620 | put_uts(table, write, uts_table.data); | ||
| 2621 | return r; | ||
| 2622 | } | ||
| 2623 | |||
| 2624 | #ifdef CONFIG_SYSVIPC | ||
| 2625 | /* The generic sysctl ipc data routine. */ | ||
| 2626 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
| 2627 | void __user *oldval, size_t __user *oldlenp, | ||
| 2628 | void __user *newval, size_t newlen) | ||
| 2629 | { | ||
| 2630 | size_t len; | ||
| 2631 | void *data; | ||
| 2632 | |||
| 2633 | /* Get out of I don't have a variable */ | ||
| 2634 | if (!table->data || !table->maxlen) | ||
| 2635 | return -ENOTDIR; | ||
| 2636 | |||
| 2637 | data = get_ipc(table, 1); | ||
| 2638 | if (!data) | ||
| 2639 | return -ENOTDIR; | ||
| 2640 | |||
| 2641 | if (oldval && oldlenp) { | ||
| 2642 | if (get_user(len, oldlenp)) | ||
| 2643 | return -EFAULT; | ||
| 2644 | if (len) { | ||
| 2645 | if (len > table->maxlen) | ||
| 2646 | len = table->maxlen; | ||
| 2647 | if (copy_to_user(oldval, data, len)) | ||
| 2648 | return -EFAULT; | ||
| 2649 | if (put_user(len, oldlenp)) | ||
| 2650 | return -EFAULT; | ||
| 2651 | } | ||
| 2652 | } | ||
| 2653 | |||
| 2654 | if (newval && newlen) { | ||
| 2655 | if (newlen > table->maxlen) | ||
| 2656 | newlen = table->maxlen; | ||
| 2657 | |||
| 2658 | if (copy_from_user(data, newval, newlen)) | ||
| 2659 | return -EFAULT; | ||
| 2660 | } | ||
| 2661 | return 1; | ||
| 2662 | } | ||
| 2663 | #endif | ||
| 2664 | |||
| 2677 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2665 | #else /* CONFIG_SYSCTL_SYSCALL */ |
| 2678 | 2666 | ||
| 2679 | 2667 | ||
| @@ -2712,32 +2700,44 @@ out: | |||
| 2712 | 2700 | ||
| 2713 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2701 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
| 2714 | void __user *oldval, size_t __user *oldlenp, | 2702 | void __user *oldval, size_t __user *oldlenp, |
| 2715 | void __user *newval, size_t newlen, void **context) | 2703 | void __user *newval, size_t newlen) |
| 2716 | { | 2704 | { |
| 2717 | return -ENOSYS; | 2705 | return -ENOSYS; |
| 2718 | } | 2706 | } |
| 2719 | 2707 | ||
| 2720 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2708 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
| 2721 | void __user *oldval, size_t __user *oldlenp, | 2709 | void __user *oldval, size_t __user *oldlenp, |
| 2722 | void __user *newval, size_t newlen, void **context) | 2710 | void __user *newval, size_t newlen) |
| 2723 | { | 2711 | { |
| 2724 | return -ENOSYS; | 2712 | return -ENOSYS; |
| 2725 | } | 2713 | } |
| 2726 | 2714 | ||
| 2727 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2715 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2728 | void __user *oldval, size_t __user *oldlenp, | 2716 | void __user *oldval, size_t __user *oldlenp, |
| 2729 | void __user *newval, size_t newlen, void **context) | 2717 | void __user *newval, size_t newlen) |
| 2730 | { | 2718 | { |
| 2731 | return -ENOSYS; | 2719 | return -ENOSYS; |
| 2732 | } | 2720 | } |
| 2733 | 2721 | ||
| 2734 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2722 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2735 | void __user *oldval, size_t __user *oldlenp, | 2723 | void __user *oldval, size_t __user *oldlenp, |
| 2736 | void __user *newval, size_t newlen, void **context) | 2724 | void __user *newval, size_t newlen) |
| 2737 | { | 2725 | { |
| 2738 | return -ENOSYS; | 2726 | return -ENOSYS; |
| 2739 | } | 2727 | } |
| 2740 | 2728 | ||
| 2729 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
| 2730 | void __user *oldval, size_t __user *oldlenp, | ||
| 2731 | void __user *newval, size_t newlen) | ||
| 2732 | { | ||
| 2733 | return -ENOSYS; | ||
| 2734 | } | ||
| 2735 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
| 2736 | void __user *oldval, size_t __user *oldlenp, | ||
| 2737 | void __user *newval, size_t newlen) | ||
| 2738 | { | ||
| 2739 | return -ENOSYS; | ||
| 2740 | } | ||
| 2741 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 2741 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
| 2742 | 2742 | ||
| 2743 | /* | 2743 | /* |
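The new get_uts()/get_ipc() helpers above turn a ctl_table .data pointer, which still points into init_uts_ns or init_ipc_ns, into a pointer to the same field of the current task's namespace by preserving the byte offset. A self-contained userspace sketch of that rebasing idiom (the struct and field names here are invented for illustration) is:

	/* Standalone sketch of the pointer-rebasing idiom used by get_uts()/get_ipc():
	 * table->data points at a field of the init namespace; the same byte offset
	 * is applied to the current task's namespace instance. Struct names and
	 * fields below are stand-ins, not the kernel's. */
	#include <stddef.h>
	#include <stdio.h>

	struct uts_ns { char sysname[16]; char nodename[16]; };

	static struct uts_ns init_ns = { "Linux", "init-node" };
	static struct uts_ns task_ns = { "Linux", "container-node" };

	/* Rebase a pointer to a field of init_ns onto another namespace instance. */
	static void *rebase(void *field_in_init, struct uts_ns *ns)
	{
		size_t offset = (char *)field_in_init - (char *)&init_ns;
		return (char *)ns + offset;
	}

	int main(void)
	{
		/* A sysctl table entry would carry &init_ns.nodename as .data ... */
		char *which = rebase(init_ns.nodename, &task_ns);
		/* ... but the handler reads and writes the current namespace's copy. */
		printf("%s\n", which);          /* prints "container-node" */
		return 0;
	}

The same trick lets the single kern_table[] entries serve every namespace: the table keeps pointing at the init instance for sizing, while the handlers redirect reads and writes at run time.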
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939bd9..22504afc0d34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) | |||
| 156 | /* check if clocksource is already registered */ | 156 | /* check if clocksource is already registered */ |
| 157 | if (is_registered_source(c)) { | 157 | if (is_registered_source(c)) { |
| 158 | printk("register_clocksource: Cannot register %s. " | 158 | printk("register_clocksource: Cannot register %s. " |
| 159 | "Already registered!", c->name); | 159 | "Already registered!", c->name); |
| 160 | ret = -EBUSY; | 160 | ret = -EBUSY; |
| 161 | } else { | 161 | } else { |
| 162 | /* register it */ | 162 | /* register it */ |
| @@ -186,6 +186,7 @@ void clocksource_reselect(void) | |||
| 186 | } | 186 | } |
| 187 | EXPORT_SYMBOL(clocksource_reselect); | 187 | EXPORT_SYMBOL(clocksource_reselect); |
| 188 | 188 | ||
| 189 | #ifdef CONFIG_SYSFS | ||
| 189 | /** | 190 | /** |
| 190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 191 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
| 191 | * @dev: unused | 192 | * @dev: unused |
| @@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | |||
| 275 | * Sysfs setup bits: | 276 | * Sysfs setup bits: |
| 276 | */ | 277 | */ |
| 277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | 278 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, |
| 278 | sysfs_override_clocksource); | 279 | sysfs_override_clocksource); |
| 279 | 280 | ||
| 280 | static SYSDEV_ATTR(available_clocksource, 0600, | 281 | static SYSDEV_ATTR(available_clocksource, 0600, |
| 281 | sysfs_show_available_clocksources, NULL); | 282 | sysfs_show_available_clocksources, NULL); |
| 282 | 283 | ||
| 283 | static struct sysdev_class clocksource_sysclass = { | 284 | static struct sysdev_class clocksource_sysclass = { |
| 284 | set_kset_name("clocksource"), | 285 | set_kset_name("clocksource"), |
| @@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) | |||
| 307 | } | 308 | } |
| 308 | 309 | ||
| 309 | device_initcall(init_clocksource_sysfs); | 310 | device_initcall(init_clocksource_sysfs); |
| 311 | #endif /* CONFIG_SYSFS */ | ||
| 310 | 312 | ||
| 311 | /** | 313 | /** |
| 312 | * boot_override_clocksource - boot clock override | 314 | * boot_override_clocksource - boot clock override |
diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbcffec1..feddf817baa5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; | |||
| 80 | EXPORT_SYMBOL(boot_tvec_bases); | 80 | EXPORT_SYMBOL(boot_tvec_bases); |
| 81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; |
| 82 | 82 | ||
| 83 | /** | ||
| 84 | * __round_jiffies - function to round jiffies to a full second | ||
| 85 | * @j: the time in (absolute) jiffies that should be rounded | ||
| 86 | * @cpu: the processor number on which the timeout will happen | ||
| 87 | * | ||
| 88 | * __round_jiffies rounds an absolute time in the future (in jiffies) | ||
| 89 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 90 | * for which the exact time they fire does not matter too much, as long as | ||
| 91 | * they fire approximately every X seconds. | ||
| 92 | * | ||
| 93 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 94 | * at the same time, rather than at various times spread out. The goal | ||
| 95 | * of this is to have the CPU wake up less, which saves power. | ||
| 96 | * | ||
| 97 | * The exact rounding is skewed for each processor to avoid all | ||
| 98 | * processors firing at the exact same time, which could lead | ||
| 99 | * to lock contention or spurious cache line bouncing. | ||
| 100 | * | ||
| 101 | * The return value is the rounded version of the "j" parameter. | ||
| 102 | */ | ||
| 103 | unsigned long __round_jiffies(unsigned long j, int cpu) | ||
| 104 | { | ||
| 105 | int rem; | ||
| 106 | unsigned long original = j; | ||
| 107 | |||
| 108 | /* | ||
| 109 | * We don't want all cpus firing their timers at once hitting the | ||
| 110 | * same lock or cachelines, so we skew each extra cpu with an extra | ||
| 111 | * 3 jiffies. This 3 jiffies came originally from the mm/ code which | ||
| 112 | * already did this. | ||
| 113 | * The skew is done by adding 3*cpunr, then round, then subtract this | ||
| 114 | * extra offset again. | ||
| 115 | */ | ||
| 116 | j += cpu * 3; | ||
| 117 | |||
| 118 | rem = j % HZ; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * If the target jiffie is just after a whole second (which can happen | ||
| 122 | * due to delays of the timer irq, long irq off times etc etc) then | ||
| 123 | * we should round down to the whole second, not up. Use 1/4th second | ||
| 124 | * as cutoff for this rounding as an extreme upper bound for this. | ||
| 125 | */ | ||
| 126 | if (rem < HZ/4) /* round down */ | ||
| 127 | j = j - rem; | ||
| 128 | else /* round up */ | ||
| 129 | j = j - rem + HZ; | ||
| 130 | |||
| 131 | /* now that we have rounded, subtract the extra skew again */ | ||
| 132 | j -= cpu * 3; | ||
| 133 | |||
| 134 | if (j <= jiffies) /* rounding ate our timeout entirely; */ | ||
| 135 | return original; | ||
| 136 | return j; | ||
| 137 | } | ||
| 138 | EXPORT_SYMBOL_GPL(__round_jiffies); | ||
| 139 | |||
| 140 | /** | ||
| 141 | * __round_jiffies_relative - function to round jiffies to a full second | ||
| 142 | * @j: the time in (relative) jiffies that should be rounded | ||
| 143 | * @cpu: the processor number on which the timeout will happen | ||
| 144 | * | ||
| 145 | * __round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
| 146 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 147 | * for which the exact time they fire does not matter too much, as long as | ||
| 148 | * they fire approximately every X seconds. | ||
| 149 | * | ||
| 150 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 151 | * at the same time, rather than at various times spread out. The goal | ||
| 152 | * of this is to have the CPU wake up less, which saves power. | ||
| 153 | * | ||
| 154 | * The exact rounding is skewed for each processor to avoid all | ||
| 155 | * processors firing at the exact same time, which could lead | ||
| 156 | * to lock contention or spurious cache line bouncing. | ||
| 157 | * | ||
| 158 | * The return value is the rounded version of the "j" parameter. | ||
| 159 | */ | ||
| 160 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) | ||
| 161 | { | ||
| 162 | /* | ||
| 163 | * In theory the following code can skip a jiffy in case jiffies | ||
| 164 | * increments right between the addition and the later subtraction. | ||
| 165 | * However since the entire point of this function is to use approximate | ||
| 166 | * timeouts, it's entirely ok to not handle that. | ||
| 167 | */ | ||
| 168 | return __round_jiffies(j + jiffies, cpu) - jiffies; | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL_GPL(__round_jiffies_relative); | ||
| 171 | |||
| 172 | /** | ||
| 173 | * round_jiffies - function to round jiffies to a full second | ||
| 174 | * @j: the time in (absolute) jiffies that should be rounded | ||
| 175 | * | ||
| 176 | * round_jiffies rounds an absolute time in the future (in jiffies) | ||
| 177 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 178 | * for which the exact time they fire does not matter too much, as long as | ||
| 179 | * they fire approximately every X seconds. | ||
| 180 | * | ||
| 181 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 182 | * at the same time, rather than at various times spread out. The goal | ||
| 183 | * of this is to have the CPU wake up less, which saves power. | ||
| 184 | * | ||
| 185 | * The return value is the rounded version of the "j" parameter. | ||
| 186 | */ | ||
| 187 | unsigned long round_jiffies(unsigned long j) | ||
| 188 | { | ||
| 189 | return __round_jiffies(j, raw_smp_processor_id()); | ||
| 190 | } | ||
| 191 | EXPORT_SYMBOL_GPL(round_jiffies); | ||
| 192 | |||
| 193 | /** | ||
| 194 | * round_jiffies_relative - function to round jiffies to a full second | ||
| 195 | * @j: the time in (relative) jiffies that should be rounded | ||
| 196 | * | ||
| 197 | * round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
| 198 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 199 | * for which the exact time they fire does not matter too much, as long as | ||
| 200 | * they fire approximately every X seconds. | ||
| 201 | * | ||
| 202 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 203 | * at the same time, rather than at various times spread out. The goal | ||
| 204 | * of this is to have the CPU wake up less, which saves power. | ||
| 205 | * | ||
| 206 | * The return value is the rounded version of the "j" parameter. | ||
| 207 | */ | ||
| 208 | unsigned long round_jiffies_relative(unsigned long j) | ||
| 209 | { | ||
| 210 | return __round_jiffies_relative(j, raw_smp_processor_id()); | ||
| 211 | } | ||
| 212 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | ||
| 213 | |||
| 214 | |||
| 83 | static inline void set_running_timer(tvec_base_t *base, | 215 | static inline void set_running_timer(tvec_base_t *base, |
| 84 | struct timer_list *timer) | 216 | struct timer_list *timer) |
| 85 | { | 217 | { |
| @@ -714,7 +846,7 @@ static int change_clocksource(void) | |||
| 714 | clock = new; | 846 | clock = new; |
| 715 | clock->cycle_last = now; | 847 | clock->cycle_last = now; |
| 716 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 848 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", |
| 717 | clock->name); | 849 | clock->name); |
| 718 | return 1; | 850 | return 1; |
| 719 | } else if (clock->update_callback) { | 851 | } else if (clock->update_callback) { |
| 720 | return clock->update_callback(); | 852 | return clock->update_callback(); |
| @@ -722,7 +854,10 @@ static int change_clocksource(void) | |||
| 722 | return 0; | 854 | return 0; |
| 723 | } | 855 | } |
| 724 | #else | 856 | #else |
| 725 | #define change_clocksource() (0) | 857 | static inline int change_clocksource(void) |
| 858 | { | ||
| 859 | return 0; | ||
| 860 | } | ||
| 726 | #endif | 861 | #endif |
| 727 | 862 | ||
| 728 | /** | 863 | /** |
| @@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device); | |||
| 820 | * If the error is already larger, we look ahead even further | 955 | * If the error is already larger, we look ahead even further |
| 821 | * to compensate for late or lost adjustments. | 956 | * to compensate for late or lost adjustments. |
| 822 | */ | 957 | */ |
| 823 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) | 958 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, |
| 959 | s64 *offset) | ||
| 824 | { | 960 | { |
| 825 | s64 tick_error, i; | 961 | s64 tick_error, i; |
| 826 | u32 look_ahead, adj; | 962 | u32 look_ahead, adj; |
| @@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * | |||
| 844 | * Now calculate the error in (1 << look_ahead) ticks, but first | 980 | * Now calculate the error in (1 << look_ahead) ticks, but first |
| 845 | * remove the single look ahead already included in the error. | 981 | * remove the single look ahead already included in the error. |
| 846 | */ | 982 | */ |
| 847 | tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | 983 | tick_error = current_tick_length() >> |
| 984 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
| 848 | tick_error -= clock->xtime_interval >> 1; | 985 | tick_error -= clock->xtime_interval >> 1; |
| 849 | error = ((error - tick_error) >> look_ahead) + tick_error; | 986 | error = ((error - tick_error) >> look_ahead) + tick_error; |
| 850 | 987 | ||
| @@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) | |||
| 896 | clock->mult += adj; | 1033 | clock->mult += adj; |
| 897 | clock->xtime_interval += interval; | 1034 | clock->xtime_interval += interval; |
| 898 | clock->xtime_nsec -= offset; | 1035 | clock->xtime_nsec -= offset; |
| 899 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | 1036 | clock->error -= (interval - offset) << |
| 1037 | (TICK_LENGTH_SHIFT - clock->shift); | ||
| 900 | } | 1038 | } |
| 901 | 1039 | ||
| 902 | /** | 1040 | /** |
| @@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks) | |||
| 1008 | unsigned long active_tasks; /* fixed-point */ | 1146 | unsigned long active_tasks; /* fixed-point */ |
| 1009 | static int count = LOAD_FREQ; | 1147 | static int count = LOAD_FREQ; |
| 1010 | 1148 | ||
| 1011 | active_tasks = count_active_tasks(); | 1149 | count -= ticks; |
| 1012 | for (count -= ticks; count < 0; count += LOAD_FREQ) { | 1150 | if (unlikely(count < 0)) { |
| 1013 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | 1151 | active_tasks = count_active_tasks(); |
| 1014 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | 1152 | do { |
| 1015 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | 1153 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); |
| 1154 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | ||
| 1155 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | ||
| 1156 | count += LOAD_FREQ; | ||
| 1157 | } while (count < 0); | ||
| 1016 | } | 1158 | } |
| 1017 | } | 1159 | } |
| 1018 | 1160 | ||
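The timer.c hunk adds the round_jiffies() family, whose kernel-doc above spells out the algorithm: add a 3*cpu skew, round to the nearest HZ boundary (rounding down only when less than a quarter second past one), then remove the skew; the calc_load() hunk likewise defers count_active_tasks() until a full LOAD_FREQ interval has actually elapsed. A small userspace walk-through of the rounding arithmetic (assuming HZ = 1000 and skipping the final jiffies-overrun check, which needs the live counter) behaves like this:

	/* Userspace walk-through of the __round_jiffies() arithmetic above,
	 * assuming HZ = 1000 and ignoring the "rounding ate our timeout" check
	 * (which needs the live jiffies counter). Purely illustrative. */
	#include <stdio.h>

	#define HZ 1000

	static unsigned long round_to_second(unsigned long j, int cpu)
	{
		unsigned long rem;

		j += cpu * 3;           /* per-CPU skew so CPUs don't all fire together */
		rem = j % HZ;
		if (rem < HZ / 4)       /* just past a second boundary: round down */
			j -= rem;
		else                    /* otherwise round up to the next second */
			j = j - rem + HZ;
		return j - cpu * 3;     /* remove the skew again */
	}

	int main(void)
	{
		/* 5120 jiffies on CPU 0 rounds down to 5000; 5300 rounds up to 6000. */
		printf("%lu %lu\n", round_to_second(5120, 0), round_to_second(5300, 0));
		/* On CPU 2 the skew shifts the boundary by 6 jiffies before rounding. */
		printf("%lu\n", round_to_second(4996, 2));
		return 0;
	}

A caller would typically arm a housekeeping timer with something like mod_timer(&t, round_jiffies(jiffies + 5 * HZ)) so that many such timers expire together on second boundaries and the CPU wakes less often.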
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 96f77013d3f0..baacc3691415 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
| 96 | stats->write_char = p->wchar; | 96 | stats->write_char = p->wchar; |
| 97 | stats->read_syscalls = p->syscr; | 97 | stats->read_syscalls = p->syscr; |
| 98 | stats->write_syscalls = p->syscw; | 98 | stats->write_syscalls = p->syscw; |
| 99 | #ifdef CONFIG_TASK_IO_ACCOUNTING | ||
| 100 | stats->read_bytes = p->ioac.read_bytes; | ||
| 101 | stats->write_bytes = p->ioac.write_bytes; | ||
| 102 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; | ||
| 103 | #else | ||
| 104 | stats->read_bytes = 0; | ||
| 105 | stats->write_bytes = 0; | ||
| 106 | stats->cancelled_write_bytes = 0; | ||
| 107 | #endif | ||
| 99 | } | 108 | } |
| 100 | #undef KB | 109 | #undef KB |
| 101 | #undef MB | 110 | #undef MB |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6b186750e9be..db49886bfae1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq) | |||
| 85 | return list_empty(&wq->list); | 85 | return list_empty(&wq->list); |
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | /* | ||
| 89 | * Set the workqueue on which a work item is to be run | ||
| 90 | * - Must *only* be called if the pending flag is set | ||
| 91 | */ | ||
| 88 | static inline void set_wq_data(struct work_struct *work, void *wq) | 92 | static inline void set_wq_data(struct work_struct *work, void *wq) |
| 89 | { | 93 | { |
| 90 | unsigned long new, old, res; | 94 | unsigned long new; |
| 95 | |||
| 96 | BUG_ON(!work_pending(work)); | ||
| 91 | 97 | ||
| 92 | /* assume the pending flag is already set and that the task has already | ||
| 93 | * been queued on this workqueue */ | ||
| 94 | new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); | 98 | new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); |
| 95 | res = work->management; | 99 | new |= work->management & WORK_STRUCT_FLAG_MASK; |
| 96 | if (res != new) { | 100 | work->management = new; |
| 97 | do { | ||
| 98 | old = res; | ||
| 99 | new = (unsigned long) wq; | ||
| 100 | new |= (old & WORK_STRUCT_FLAG_MASK); | ||
| 101 | res = cmpxchg(&work->management, old, new); | ||
| 102 | } while (res != old); | ||
| 103 | } | ||
| 104 | } | 101 | } |
| 105 | 102 | ||
| 106 | static inline void *get_wq_data(struct work_struct *work) | 103 | static inline void *get_wq_data(struct work_struct *work) |
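The simplified set_wq_data() above drops the cmpxchg() loop: the new BUG_ON() documents that the caller already owns the WORK_STRUCT_PENDING bit, so nothing else can be writing work->management concurrently and a plain store is enough. The underlying idiom is packing the workqueue pointer and a few flag bits into one word, using the low bits freed up by pointer alignment. A standalone sketch (bit positions and the mask below are assumptions for illustration, not the kernel's definitions) follows:

	/* Sketch of the pointer-plus-flags packing that set_wq_data() relies on:
	 * the low bits of an aligned pointer are free, so flag bits and the
	 * workqueue pointer share one word. Bit positions and the mask are
	 * assumed here for illustration. */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define WORK_PENDING_BIT   0UL
	#define WORK_FLAG_MASK     3UL          /* low two bits reserved for flags */

	struct work { uintptr_t management; };

	static void set_wq_data(struct work *w, void *wq)
	{
		uintptr_t new;

		/* Caller must already own the pending bit (no concurrent writers),
		 * so a plain read-modify-write suffices - no cmpxchg loop needed. */
		assert(w->management & (1UL << WORK_PENDING_BIT));

		new  = (uintptr_t)wq | (1UL << WORK_PENDING_BIT);
		new |= w->management & WORK_FLAG_MASK;  /* keep the other flag bits */
		w->management = new;
	}

	static void *get_wq_data(struct work *w)
	{
		return (void *)(w->management & ~WORK_FLAG_MASK);
	}

	int main(void)
	{
		static long queue;                      /* stands in for a workqueue */
		struct work w = { .management = 1UL << WORK_PENDING_BIT };

		set_wq_data(&w, &queue);
		printf("%d\n", get_wq_data(&w) == (void *)&queue);      /* prints 1 */
		return 0;
	}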
